Python-數據分析模塊

時間 2019-11-20

原文原文鏈接

目錄

numpy 模塊

numpy 模塊主要用來做數據分析，對 numpy 數組進行科學運算

主要方法和常用屬性，都是通過 numpy 生成的對象用「.」調用的

import numpy as np

屬性	描述
T	數組的轉置，行和列一一對應，重構，每行2個元素
dtype	數組元素的數據類型（int32 和 float64）
size	數組元素的個數
ndim	數組的維數
shape	數組的維度大小（有幾行幾列）
astype	數據類型轉換

經常使用方法	描述
元素切分	[:,:] 表示行和列
邏輯取值	取出用numpy生成的數組對象 > 4的元素
賦值	取出用numpy生成的數組對象的索引值 = 0
數組橫向合併	行和行合併，列和列合併
數組垂直合併	相當於 list 的 extend，直接按行添加元素

數組函數	描述
np.array()	將列表轉換爲數組，可選擇是否指定dtype
np.ones()	傳入行數和列數，值都爲1
np.zeros()	傳入行數和列數，值都爲0
np.eye()	輸入行數和列數，對角值爲1
np.arange()	和內置的range函數一樣，但支持浮點數
np.linspace()	類似arange()，第三個參數爲數組長度
np.empty()	創建一個未初始化的數組（元素是內存中的任意值，並非真正的隨機數）
np.reshape()	重塑形狀
數組運算	與數組函數聯用 +-*/ 數字


生成隨機數（常用）	np.random.rand(x,y)
	np.random.random((x,y))
	np.random.choice(x,y)
	np.random.shuffle(x)

numpy數學統計方法	描述
sum	求和
cumsum	累加求和
mean	求平均數
std	求標準差
var	求方差
min	求最小值
max	求最大值
argmin	求最小值索引
argmax	求最大值索引
sort	排序

以下代碼具體解釋

lt1 = [1, 2, 3]
lt2 = [4, 5, 6]

# Multiply the two lists element by element in plain Python.
# zip() walks both lists in lockstep, so no manual index arithmetic
# (range(len(...)) plus double indexing) is needed.
lt = [left * right for left, right in zip(lt1, lt2)]
print(lt)
# [4, 10, 18]


import numpy as np

# The same element-wise product with NumPy: one vectorised expression,
# no explicit Python loop.
arr1, arr2 = np.array([1, 2, 3]), np.array([4, 5, 6])
product = arr1 * arr2
print(product)
# [ 4 10 18]




# Creating NumPy arrays with np.array().  The resulting ndarray is a
# mutable object.

# 1-D array (rarely used on its own).  Note that the printed form has no
# commas between elements, unlike a Python list.
arr = np.array([1, 2, 3])
print(arr)
# [1 2 3]

# 2-D array: a list of rows.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(arr)
# [[1 2 3]
#  [4 5 6]]

# 3-D array (rarely used): a list of 2-D blocks.  The original example
# nested the lists only two levels deep, which yields a 2-D 3x3 array;
# three levels of nesting are required for ndim == 3.
arr = np.array([
    [[1, 2, 3],
     [4, 5, 6]],
    [[7, 8, 9],
     [10, 11, 12]],
])
print(arr)
# [[[ 1  2  3]
#   [ 4  5  6]]
#
#  [[ 7  8  9]
#   [10 11 12]]]


# Attributes of a NumPy array, demonstrated on a 2 x 3 integer array.
arr = np.array([[1, 2, 3], [4, 5, 6]])

# T: the transpose (rows and columns swapped).  Both the original and the
# transposed array are printed, separated by a newline.
transposed = arr.T
print(arr, "\n", transposed)
# [[1 2 3]
#  [4 5 6]]
#  [[1 4]
#  [2 5]
#  [3 6]]

# dtype: the element type.  int32 / float64 are NumPy's own fixed-width
# types, not Python's built-in int / float.
element_type = arr.dtype
print(element_type)
# int32   (platform dependent; int64 on many systems)

# size: total number of elements.
element_count = arr.size
print(element_count)
# 6

# ndim: number of dimensions.
dimensions = arr.ndim
print(dimensions)
# 2

# shape: length of each dimension, as a tuple (rows, columns).
extent = arr.shape
print(extent)
# (2, 3)

# astype: convert the element type; here the integers become float64.
arr = arr.astype(np.float64)
print(arr)
# [[1. 2. 3.]
#  [4. 5. 6.]]

# Slicing, boolean selection and assignment on a 2 x 3 array.
arr = np.array([[1, 2, 3], [4, 5, 6]])

# arr[rows, columns]; a bare ":" keeps every row / every column.
everything = arr[:, :]
print(everything)
# [[1 2 3]
#  [4 5 6]]

# Single elements are addressed as arr[row, column].
print(arr[0, 0])
# 1
print(arr[1, 2])
# 6

# Every row, last two columns.
last_two_columns = arr[:, -2:]
print(last_two_columns)
# [[2 3]
#  [5 6]]

# Boolean selection: the mask picks the matching elements and the result
# is a flat 1-D array.
mask = arr > 4
print(arr[mask])
# [5 6]

# Assignment through an index modifies the array in place.
arr[0, 0] = 0
print(arr)
# [[0 2 3]
#  [4 5 6]]

# Combining arrays.  arr2 mixes integers and strings, so its elements are
# stored as strings, and the merged results printed below are string
# arrays as well.
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], ['a', 'b', 'c']])

pair = (arr1, arr2)

# Horizontal merge: arr2's columns are appended to the right of arr1.
side_by_side = np.hstack(pair)
print(side_by_side)
# [['1' '2' '3' '7' '8' '9']
#  ['4' '5' '6' 'a' 'b' 'c']]

# Vertical merge: arr2's rows are appended below arr1.
stacked = np.vstack(pair)
print(stacked)
# [['1' '2' '3']
#  ['4' '5' '6']
#  ['7' '8' '9']
#  ['a' 'b' 'c']]

# concatenate joins along axis 0 by default (rows stacked, like vstack);
# axis=1 joins side by side, like hstack.
joined = np.concatenate(pair, axis=1)
print(joined)
# [['1' '2' '3' '7' '8' '9']
#  ['4' '5' '6' 'a' 'b' 'c']]

# Building arrays with NumPy's factory functions.

# All ones, given a (rows, columns) shape.
print(np.ones(shape=(2, 3)))
# [[1. 1. 1.]
#  [1. 1. 1.]]

# All zeros, given a (rows, columns) shape.
print(np.zeros(shape=(2, 3)))
# [[0. 0. 0.]
#  [0. 0. 0.]]

# Ones on the main diagonal, zeros elsewhere.
print(np.eye(N=3, M=3))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]

# 10 evenly spaced values from 1 to 100, both ends included.
print(np.linspace(start=1, stop=100, num=10))
# [  1.  12.  23.  34.  45.  56.  67.  78.  89. 100.]

# Integers from 2 up to, but not including, 10.
print(np.arange(2, 10))
# [2 3 4 5 6 7 8 9]

# Reshaping: the new shape must hold the same number of elements
# (2 * 6 == 3 * 4).
arr1 = np.zeros(shape=(2, 6))
reshaped = arr1.reshape(3, 4)
print(reshaped)
# [[0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]


# Arithmetic (+ - * /) between an array and a number is applied to every
# element.
arr = 4 * np.ones((3, 4))
print(arr)
# [[4. 4. 4. 4.]
#  [4. 4. 4. 4.]
#  [4. 4. 4. 4.]]

arr = 4 + np.ones((3, 4))
print(arr)
# [[5. 5. 5. 5.]
#  [5. 5. 5. 5.]
#  [5. 5. 5. 5.]]

# NumPy's math functions also work element by element (for reference only).
sines = np.sin(arr)
print(sines)
# [[-0.95892427 -0.95892427 -0.95892427 -0.95892427]
#  [-0.95892427 -0.95892427 -0.95892427 -0.95892427]
#  [-0.95892427 -0.95892427 -0.95892427 -0.95892427]]

# Matrix product with np.dot: (2 x 3) times (3 x 2) gives (2 x 2).
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[1, 2], [4, 5], [6, 7]])
matrix_product = np.dot(arr1, arr2)
print(matrix_product)
# [[27 33]
#  [60 75]]

# Matrix inverse of a square 3 x 3 matrix.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [9, 8, 9],
])
inverse = np.linalg.inv(arr)
print(inverse)
# [[ 0.5        -1.          0.5       ]
#  [-3.          3.         -1.        ]
#  [ 2.16666667 -1.66666667  0.5       ]]





# Math / statistics helpers: np.sum adds up every element of the selection.
arr = np.array([[1, 2, 3], [4, 5, 6]])
total = np.sum(arr[:, :])
print(total)
# 21

# Random number generation.  No seed is set here, so the sample outputs
# shown in the comments will differ from run to run.

# rand takes the dimensions as separate arguments.
print(np.random.rand(3, 4))
# [[0.76654824 0.23510842 0.79989748 0.93094884]
#  [0.97155472 0.29956374 0.27754847 0.91103403]
#  [0.43714323 0.7549109  0.14547903 0.20511579]]

# random takes the shape as a single tuple.
print(np.random.random(size=(3, 4)))
# [[0.91673193 0.15218486 0.32976182 0.41812734]
#  [0.33360061 0.20190749 0.48689467 0.46679115]
#  [0.12490532 0.50441629 0.95525997 0.5402791 ]]


# Pick one value at random from a 1-D sequence.
print(np.random.choice([1, 2, 3], size=1))
# [1]

# Random integers taken from the range given by the first two arguments,
# arranged in a 3 x 4 array.
print(np.random.randint(1, 100, size=(3, 4)))
# [[33 40 93 18]
#  [80 65 64 51]
#  [66  6 83 10]]

matplotlib 模塊

matplotlib 模塊就是用來畫圖的

# Bar chart

from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Load a font file explicitly and pass it to every text element below so
# the Chinese labels render correctly instead of as garbled glyphs.
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc")

# Plot style (background / grid theme).
plt.style.use("ggplot")

# Category names, one value per category, and the x position of each bar.
clas = ["3班", "4班", "5班", "6班"]
students = [50, 55, 45, 60]
clas_index = range(len(clas))

# Draw the bars.
plt.bar(clas_index, students, color="darkblue")

# Text property keywords must be lower-case (fontproperties, fontsize):
# current matplotlib releases reject the capitalised spellings.
plt.xlabel("學生", fontproperties=font)
# The second label belongs on the y axis; calling xlabel twice simply
# overwrote the first label and left the y axis unlabelled.
plt.ylabel("學生人數", fontproperties=font)
plt.title("班級-學生人數", fontproperties=font, fontsize=25, fontweight=20)
# Replace the numeric tick positions with the category names.
plt.xticks(clas_index, clas, fontproperties=font)

# Display the figure.
plt.show()

# Histogram
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Load a font file explicitly so the Chinese titles render correctly
# instead of as garbled glyphs.
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc")
plt.style.use("ggplot")

# Two independent samples of 10000 values drawn with randn.
x1 = np.random.randn(10000)
x2 = np.random.randn(10000)

# Create the figure (canvas).
fig = plt.figure()

# One row, two columns: ax1 is the left subplot, ax2 the right one.
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

ax1.hist(x1, bins=50, color="darkblue")
ax2.hist(x2, bins=50, color="y")

# Text property keywords must be lower-case (fontproperties): current
# matplotlib releases reject the capitalised spelling.  The figure title
# now uses the same spelling of "正態" as the two subplot titles.
fig.suptitle("兩個正態分佈", fontproperties=font, fontsize=20)
ax1.set_title("x1的正態分佈", fontproperties=font)
ax2.set_title("x2的正態分佈", fontproperties=font)

# Display the figure.
plt.show()

# Line chart

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Load a font file explicitly so the Chinese legend entries render
# correctly instead of as garbled glyphs.
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc")
plt.style.use("ggplot")

# Fixed seed so the four series are the same on every run.
np.random.seed(10)

# Four series of 40 cumulative sums, generated one after another so the
# random stream is consumed in the order x1, x2, x3, x4.
x1, x2, x3, x4 = (np.random.randn(40).cumsum() for _ in range(4))

# (data, colour, line style, marker, legend label) for each line.
line_specs = [
    (x1, "r", "-", "o", "紅圓線"),
    (x2, "y", "--", "*", "黃虛線"),
    (x3, "b", "-.", "s", "藍方線"),
    (x4, "black", ":", "s", "黑方線"),
]
for series, colour, dash, mark, caption in line_specs:
    plt.plot(series, color=colour, linestyle=dash, marker=mark, label=caption)

# The legend text uses the loaded font; "best" lets matplotlib choose the
# location.
plt.legend(loc="best", prop=font)

# Display the figure.
plt.show()

# Scatter plot + line plot
import numpy as np
from matplotlib import pyplot as plt  # conventional alias
from matplotlib.font_manager import FontProperties  # custom font support

# Load a font file explicitly so the Chinese text renders correctly.
# The path is a raw string: in a normal string literal "\W", "\F" and "\s"
# are invalid escape sequences, which newer Python versions warn about.
font = FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')
plt.style.use('ggplot')

# One row, two columns: ax1 is the left subplot, ax2 the right one.
fig = plt.figure()
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)


# The same data for both subplots: y = x squared for x in 0..19.
x = np.arange(20)
y = x ** 2
x2 = np.arange(20)
y2 = x2 ** 2

ax1.scatter(x, y, color="r", label="紅")
ax2.scatter(x2, y2, color="b", label="藍")

# Draw a line through the same points on each subplot.
ax1.plot(x, y)
ax2.plot(x2, y2)

# Text property keywords must be lower-case (fontproperties): current
# matplotlib releases reject the capitalised spelling.
fig.suptitle("兩張圖", fontproperties=font, fontsize=15)
ax1.set_title("散點圖", fontproperties=font)
ax2.set_title("折線圖", fontproperties=font)
# Both scatter calls pass a label, so both subplots get a legend.
ax1.legend(prop=font)
ax2.legend(prop=font)

# Display the figure.
plt.show()

pandas 模塊

pandas 模塊提供了操作 excel/json/sql/ini/csv 文件的函數

import pandas as pd
import numpy as np

# Fixed seed so the random values below are the same on every run.
np.random.seed(10)

# Six consecutive month-end dates starting from January 2019.
# The offset object is used instead of the "M" string alias, which newer
# pandas releases deprecate in favour of "ME"; the object form is accepted
# by old and new versions alike and yields the same dates.
index = pd.date_range("2019-01-01", periods=6, freq=pd.offsets.MonthEnd())
print(index)
columns = ["c1", "c2", "c3", "c4"]

# 6 x 4 random values: one row per date, one column per name.
val = np.random.randn(6, 4)


df = pd.DataFrame(index=index, columns=columns, data=val)
print(df)
#                   c1        c2        c3        c4
# 2019-01-31  1.331587  0.715279 -1.545400 -0.008384
# 2019-02-28  0.621336 -0.720086  0.265512  0.108549
# 2019-03-31  0.004291 -0.174600  0.433026  1.203037
# 2019-04-30 -0.965066  1.028274  0.228630  0.445138
# 2019-05-31 -1.136602  0.135137  1.484537 -1.079805
# 2019-06-30 -1.977728 -1.743372  0.266070  2.384967


# Write the DataFrame to an .xlsx file, then load it back.
excel_path = "date_c.xlsx"
df.to_excel(excel_path)

# index_col=[0] uses the first column of the sheet as the row index again.
df = pd.read_excel(excel_path, index_col=[0])
print(df)
#                   c1        c2        c3        c4
# 2019-01-31  1.331587  0.715279 -1.545400 -0.008384
# 2019-02-28  0.621336 -0.720086  0.265512  0.108549
# 2019-03-31  0.004291 -0.174600  0.433026  1.203037
# 2019-04-30 -0.965066  1.028274  0.228630  0.445138
# 2019-05-31 -1.136602  0.135137  1.484537 -1.079805
# 2019-06-30 -1.977728 -1.743372  0.266070  2.384967



###############
# The three parts of a DataFrame: row index, column labels, raw values.
for part in (df.index, df.columns, df.values):
    print(part)

# Select columns by passing a list of column names.
wanted_columns = ['c1', 'c2']
print(df[wanted_columns])

# Select rows by index label with .loc: a single label, then a label
# range from the first date through the second.
print(df.loc['2019-01-31'])
print(df.loc['2019-01-31':'2019-05-31'])

# Select by integer position with .iloc: [row number, column number].
print(df)
top_left = df.iloc[0, 0]
print(top_left)

# Overwrite the whole first row with 0.
df.iloc[0, :] = 0
print(df)