數據分析學習總結

一、matplotlib學習概括

""" test 1 簡單畫圖"""
# from matplotlib import pyplot as plt # # fig = plt.figure(figsize=(20, 8), dpi=80) # figure圖形圖標的意思，這裏是指咱們畫的圖，figsize表示圖像大小，dpi表示圖片清晰度 # x = range(2, 26, 2) # 數據在x軸的位置，是一個可迭代對象 # # y = [15, 13, 14.5, 17, 20, 25, 26, 26, 27, 22, 18, 15] # 數據在y軸的位置，是一個可迭代對象 # plt.plot(x, y) # 傳入x和y，經過plot繪製出折線圖 # # plt.xticks(x) # 設置x軸的刻度 # # plt.xticks(x[::2]) # 當刻度太密集的時候使用列表步長來解決，matplotlib自動幫咱們對應 # # # plt.savefig('./pic.png') # 保存圖片 # plt.show() # 在執行程序的是否展現圖形

""" test 2 自定義x軸標籤，中文亂碼解決，添加圖例等"""
# # 使用 fc-list :lang=zh 查看中文字體 # # 指定字體的路徑，而後在使用的時候使用fontproperties指定字體 # # from matplotlib import pyplot as plt # from matplotlib import font_manager # import random # # my_font = font_manager.FontProperties(fname='/home/felix/.local/share/fonts/SIMHEI.TTF') # 字體的路徑 # # plt.figure(figsize=(20, 8), dpi=80) # x = range(120) # # random.seed(10) # 設置隨機種子，讓不一樣時候隨機獲得的結果都相同 # y = [random.uniform(20, 35) for i in range(120)] # 隨機產生數據 # y2 = [random.uniform(20, 35) for i in range(120)] # 隨機產生數據 # # plt.plot(x, y, label='10點到12點天氣變化') # 若是要添加圖例則使用label指定圖例名稱 # plt.plot(x, y2, label='13點到14點天氣變化') # plt.legend(prop=my_font, loc='best') # 圖例經過prop指定字體，經過loc指定位置 # # _x_ticks = ['10點{}分'.format(i) if i < 60 else '11點{}分'.format(i - 60) for i in x] # 產生自定義的x軸數組 # # # 將第一個參數的x的位置用第二個參數的數組一一對應顯示，rotation表示旋轉，fontproperties表示指定字體 # plt.xticks(x[::5], _x_ticks[::5], rotation=90, fontproperties=my_font) # # 添加描述信息 # plt.xlabel('時間', fontproperties=my_font) # 設置x軸的標籤 # plt.ylabel('溫度(℃)', fontproperties=my_font) # 設置y軸的標籤 # plt.title('10點到12點每分鐘的時間變化狀況', fontproperties=my_font) # 設置title # # plt.grid(alpha=0.4,linestyle=':') # 繪製網格 # plt.show()

""" 常用統計圖對比 """
# 折線圖:以折線的上升或下降來表示統計數量的增減變化的統計圖
# 特點:能夠顯示數據的變化趨勢，反映事物的變化情況。(變化)
#
# 直方圖:由一系列高度不等的縱向條紋或線段表示數據分佈的情況。
# 一般用橫軸表示數據範圍，縱軸表示分佈情況。
# 特點:繪製連續性的數據,展現一組或者多組數據的分佈情況(統計)
#
# 條形圖:排列在工作表的列或行中的數據可以繪製到條形圖中。
# 特點:繪製離散的數據,能夠一眼看出各個數據的大小,比較數據之間的差異。(統計)
#
# 散點圖:用兩組數據構成多個座標點，考察座標點的分佈,判斷兩變量之間是否存在某種關聯或總結座標點的分佈模式。
# 特點:判斷變量之間是否存在數量關聯趨勢,展現離羣點(分佈規律)

""" test 3 散點圖 """

# import random # from matplotlib import ( # pyplot as plt, # font_manager, # ) # # # 設置字體 # my_font = font_manager.FontProperties(fname='/home/felix/.local/share/fonts/SIMHEI.TTF') # # y_3 = [random.randint(8, 15) for i in range(31)] # y_10 = [random.randint(10, 25) for j in range(31)] # # x_3 = range(1, 32) # x_10 = range(51, 82) # # # 設置圖形大小 # plt.figure(figsize=(20, 8), dpi=80) # # # 使用scatter繪製散點圖 # plt.scatter(x_3, y_3, label="3月份") # plt.scatter(x_10, y_10, label='10月份') # # # 調整x軸的刻度 # _x = list(x_3) + list(x_10) # _xtick_labels = ['3月{}日'.format(i) for i in x_3] # _xtick_labels += ['10月{}日'.format(i - 50) for i in x_10] # # plt.xticks(_x[::3], _xtick_labels[::3], fontproperties=my_font, rotation=45) # # # 添加圖例 # plt.legend(prop=my_font, loc='upper left') # # # 添加描述信息 # plt.xlabel('時間', fontproperties=my_font) # plt.ylabel('溫度 單位(t)', fontproperties=my_font) # plt.title('3月份和10月份的溫度變化散點圖', fontproperties=my_font) # # plt.show()


""" test 4 條形圖1 """

# import random # from matplotlib import ( # pyplot as plt, # font_manager, # ) # # # 設置字體 # my_font = font_manager.FontProperties(fname='/home/felix/.local/share/fonts/SIMHEI.TTF') # # a = ['電影{}'.format(i) for i in range(30)] # 電影名稱 # b = [random.randint(10, 50) for i in range(30)] # 電影票房 # # # 設置圖形大小 # plt.figure(figsize=(20, 8), dpi=80) # # 繪製條形圖 # plt.bar(range(len(a)), b, width=0.3) # # 設置字符串到x軸 # plt.xticks(range(len(a)), a, fontproperties=my_font, rotation=45) # x軸對應 # plt.savefig('電影統計.png') # plt.show()

""" test 5 條形圖2 """

# 繪製橫着的條形圖 # import random # from matplotlib import ( # pyplot as plt, # font_manager, # ) # # # 設置字體 # my_font = font_manager.FontProperties(fname='/home/felix/.local/share/fonts/SIMHEI.TTF') # # a = ['電影{}'.format(i) for i in range(30)] # 電影名稱 # b = [random.randint(10, 50) for i in range(30)] # 電影票房 # # # 設置圖形大小 # plt.figure(figsize=(8, 20), dpi=80) # # 繪製橫着的條形圖 # plt.barh(range(len(a)), b, height=0.3, color='orange') # # 設置字符串到x軸 # plt.yticks(range(len(a)), a, fontproperties=my_font) # x軸對應 # # 添加網格 # plt.grid(alpha=0.3) # # plt.savefig('電影統計.png') # plt.show()

""" test 6 條形圖3 """
# from matplotlib import ( # pyplot as plt, # font_manager, # ) # # # 設置字體 # my_font = font_manager.FontProperties(fname='/home/felix/.local/share/fonts/SIMHEI.TTF') # # a = ['猩球崛起', '敦刻爾克', '蜘蛛俠', '戰狼2'] # b_16 = [15746, 312, 4497, 319] # b_15 = [12357, 156, 2045, 168] # b_14 = [2358, 399, 2357, 362] # # bar_width = 0.1 # x_14 = list(range(len(a))) # x_15 = [i + bar_width for i in x_14] # x_16 = [i + bar_width for i in x_15] # # plt.figure(dpi=80) # # plt.barh(x_14, b_14, color='red', height=bar_width, label='9月14日') # plt.barh(x_15, b_15, color='yellow', height=bar_width, label='9月15日') # plt.barh(x_16, b_16, color='blue', height=bar_width, label='9月16日') # # plt.yticks(x_15, a, fontproperties=my_font) # plt.legend(prop=my_font) # 標籤 # plt.show()

""" test 7 直方圖1 """

# import random # from matplotlib import ( # pyplot as plt, # font_manager, # ) # # # 通常來講可以使用直方圖的都是那些沒有統計過的數據 # # # 設置字體 # my_font = font_manager.FontProperties(fname='/home/felix/.local/share/fonts/SIMHEI.TTF') # # a = [random.randint(10, 100) for i in range(100)] # # 直方圖組數計算方式：將數據分組，當數據在100個之內時，按數據多少常分5-12組 # # 組距：指每一個小組的兩個端點的距離 # # 組數：極差/組距 # d = 5 # 組距 # num_bins = (max(a) - min(a)) // d # 注意組距必定要被numbins整除，不然會不均勻 # print(num_bins) # # plt.hist(a, num_bins) # # # 設置x軸的刻度 # plt.xticks(range(min(a), max(a) + d, d)) # # 顯示網格 # plt.grid() # plt.show()


""" test 8 直方圖2 """
# import random # from matplotlib import ( # pyplot as plt, # font_manager, # ) # # # 直方圖應用場景 # # 一、用戶的年齡分佈狀態 # # 二、一段時間內用戶的點擊次數的分佈狀態 # # 三、用戶活躍時間分佈狀態 # # # 繪製不一樣刻度的x軸的直方圖 # # interval = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 60, 90] # width = [5, 5, 5, 5, 5, 5, 5, 5, 5, 15, 30, 60] # quantity = [random.randint(500, 9000) for i in range(len(interval))] # # plt.figure(figsize=(20, 8), dpi=80) # # plt.bar(range(len(quantity)), quantity, width=1) # # # 設置x軸的刻度 # _x = [i - 0.5 for i in range(len(quantity) + 1)] # _xtick_labels = interval + [150] # plt.xticks(_x, _xtick_labels) # plt.grid() # # plt.show() #

二、numpy學習總結

# -*- coding: utf-8 -*- # @Time : 18-12-26 下午7:01 # @Author : Felix Wang

""" test1 numpy入門和簡單使用"""
# # # numpy簡介 # # 一個在Python中作科學計算的基礎庫，重在數值計算，也是大部分PYTHON科學計算庫的基礎庫，多用於在大型、多維數組上執行數值運算 # # import numpy as np # # # 建立數組 # # 下面的a，b，c效果相同 # a = np.array([1, 2, 3, 4, 5]) # b = np.array(range(1, 6)) # c = np.arange(1, 6) # arange用法： arange([start,] stop[,step,],dtype=None) # # # 數組的類名： # a = np.array([1, 2, 3, 4, 5, 6]) # print(type(a)) # <class 'numpy.ndarray'> # print(a.dtype) # int64 # # # 指定建立的數組的數據類型 # a = np.array([1, 0, 1, 0], dtype=np.bool) # print(a) # [ True False True False] # # # 修改數組的數據類型 # a = a.astype(np.int8) # print(a) # [1 0 1 0] # # # 修改浮點型的小數位數 # b = np.round(a, 2) # 將浮點數保留兩位 # print(b) # # print('#' * 15) # # c = np.array([[3, 4, 5, 6, 7, 8], [4, 5, 6, 7, 8, 9]]) # print(c) # # # 查看數組的形狀 # print(c.shape) # (2, 6) # d = c.reshape(3, 4) # # [[3 4 5 6] # # [7 8 4 5] # # [6 7 8 9]] # print(d) # print(d.shape) # (3, 4) # # # 把數組轉化爲1維數組 # e = d.flatten() # print(e) # [3 4 5 6 7 8 4 5 6 7 8 9] # # 注意 下面這個不是轉換爲一維數組 # ee = d.reshape(1, 12) # print(ee) # [[3 4 5 6 7 8 4 5 6 7 8 9]] # # # 加減乘除法 # # 注意：加減乘除在運算過程當中，值被做用到全部的元素上 # print('#' * 15) # print(d) # # [[3 4 5 6] # # [7 8 4 5] # # [6 7 8 9]] # print(d + 1) # # [[ 4 5 6 7] # # [ 8 9 5 6] # # [ 7 8 9 10]] # print(d * 2) # # [[ 6 8 10 12] # # [14 16 8 10] # # [12 14 16 18]] # print('#' * 15) # # ####################################### # a = np.array([[3, 4, 5, 6, 7, 8], [4, 5, 6, 7, 8, 9]]) # b = np.array([[21, 22, 23, 24, 25, 26], [27, 28, 29, 30, 31, 32]]) # # 數組和數組的加減法,對應的各個值相加減，或者相乘除 # print(a + b) # # [[24 26 28 30 32 34] # # [31 33 35 37 39 41]] # print(a * b) # # [[ 63 88 115 144 175 208] # # [108 140 174 210 248 288]] # # # c = a.reshape(3, 4) # 注意不一樣的維度的不能相乘除 # # print(a * c) # # 可是： # # 2行6列的數組,和1行6列的數組 # a = np.array([[3, 4, 5, 6, 7, 8], [4, 5, 6, 7, 8, 9]]) # c = np.array([1, 2, 3, 4, 5, 6]) # print(a-c) # # [[2 2 2 2 2 2] # # [3 3 3 3 3 3]] # print(a*c) # # [[ 3 8 15 24 35 48] # # [ 4 10 18 28 40 54]] # # # 2行6列的數組,和2行一列的數組 # a = np.array([[3, 4, 5, 6, 7, 8], [4, 5, 6, 7, 8, 9]]) # 
c=np.array([[1],[2]]) # print(c+a) # # [[ 4 5 6 7 8 9] # # [ 6 7 8 9 10 11]] # print(c*a) # # [[ 3 4 5 6 7 8] # # [ 8 10 12 14 16 18]] # # 形成上面這種不一樣維度能計算的緣由是： # # 若是兩個數組的後緣長度，即從末尾開始算起的維度的軸長度相符或其中一方的長度爲1，則認爲他們是廣播兼容的。廣播會在缺失和長度爲1的維度上進行 # 
""" test2 numpy讀取文件 """

# import numpy as np # # # numpy讀取數據方法 # # np.loadtxt(fname,dtype=np.float,delimiter=None,skiprows=0,usecols=None,unpack=False) # # frame: 文件、字符串或產生器，能夠是.gz或bz2壓縮文件 # # dtype： 數據類型，可選，CSV的字符串以什麼數據類型讀入數組中，默認np.float # # delimiter: 分隔字符串，默認是任何空格，改成逗號 # # skiprows： 跳過前x行，通常跳過第一行表頭 # # usecols： 讀取指定的列，索引，元組類型 # # unpack： 若是爲True，讀入屬性將分爲寫入不一樣數組變量，False讀入數據只寫入一個數組變量，默認False # # # us_file_path = "US_video_data_numbers.csv" # uk_file_path = "GB_video_data_numbers.csv" # # # t1 = np.loadtxt(us_file_path,delimiter=",",dtype="int",unpack=True) # t2 = np.loadtxt(us_file_path, delimiter=",", dtype="int") # # # print(t1) # print(t2) # # print("*" * 100) # # # 取行 # # print(t2[2]) # # # 取連續的多行 # # print(t2[2:]) # # # 取不連續的多行 # # print(t2[[2,8,10]]) # # # print(t2[1,:]) # # print(t2[2:,:]) # # print(t2[[2,10,3],:]) # # # 取列 # # print(t2[:,0]) # # # 取連續的多列 # # print(t2[:,2:]) # # # 取不連續的多列 # # print(t2[:,[0,2]]) # # # 去行和列，取第3行，第四列的值 # # a = t2[2,3] # # print(a) # # print(type(a)) # # # 取多行和多列，取第3行到第五行，第2列到第4列的結果 # # 去的是行和列交叉點的位置 # b = t2[2:5, 1:4] # # print(b) # # # 取多個不相鄰的點 # # 選出來的結果是（0，0） （2，1） （2，3） # c = t2[[0, 2, 2], [0, 1, 3]] # print(c)


""" test3 numpy中的其餘操做"""
# import numpy as np
#
# # 求轉置的三種方法
# b = np.array([[1, 2, 3, 4, 5, 6], [4, 5, 6, 7, 8, 9]])
# print(b)
# print(b.T)
# print(b.swapaxes(1, 0))
# print(b.transpose())
#
# # numpy中的三目運算符
# c = np.where(b < 5, 10, 0)  # 如果b中的數字小於5，則置10，否則置0
# print(c)
#
# # 小於x替換爲x，大於y替換爲y
# bb = np.array(
#     [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11], [12, 13, 14, 15, 16, 17], [18, 19, 20, np.nan, np.nan, np.nan]])
# print(bb)
# cc = bb.clip(10, 18)
# print(cc)

""" test4 numpy中的nan和inf """
# # nan： 表示不是一個數字 # # 當咱們讀取本地文件爲float的時候，若是有缺失，就會出現nan # # 或者作了一個不合適的計算的時候。 # # # inf：表示正無窮，-inf表示負無窮 # # 好比：一個數字除以0（python中直接會報錯，numpy中是一個inf或者-inf） # import numpy as np # # a = np.nan # b = np.inf # print(a, type(a)) # nan <class 'float'> # print(b, type(b)) # inf <class 'float'> # # # numpy中的nan的注意點 # # 一、兩個nan是不相等的 # # 二、np.nan!=np.nan # # 三、利用2的特性判斷數組中nan的個數 # np.count_nonzero(t!=t) # # 四、使用np.isnan(a)來判斷一個數組中是不是nan，好比但願把nan替換爲0，t[np.isnan(t)]=0 # # 五、nan和任何值計算都爲nan # # # 在一組數據中若是單純的把nan替換爲0，是不合適的，好比所有替換爲0後，替換以前的平均值若是大於0，替換以後的均值確定會變小，因此更通常的方式是把缺失的數字替換爲均值(中值) # # 或者是直接刪除有缺失值的一行 # """ 將nan替換爲均值的方法 # def fill_ndarray(t1): # for i in range(t1.shape[1]): # 遍歷每一列 # temp_col = t1[:, i] # 當前的一列 # nan_num = np.count_nonzero(temp_col != temp_col) # if nan_num != 0: # 不爲0，說明當前這一列中有nan # temp_not_nan_col = temp_col[temp_col == temp_col] # 當前一列不爲nan的array # # # 選中當前爲nan的位置，把值賦值爲不爲nan的均值 # temp_col[np.isnan(temp_col)] = temp_not_nan_col.mean() # return t1 # # # if __name__ == '__main__': # t1 = np.arange(24).reshape((4, 6)).astype("float") # t1[1, 2:] = np.nan # print(t1) # t1 = fill_ndarray(t1) # print(t1) # """ # # # numpy中經常使用統計函數 # # 求和：t.sum(axis=None) # # 均值：t.mean(a,axis=None) 受離羣點的影響較大 # # 中值：np.median(t,axis=None) # # 最大值：t.max(axis=None) # # 最小值：t.min(axis=None) # # 極值：np.ptp(t,axis=None) 即最大值和最小值之差 # # 標準差：t.std(axis=None) # # # # 數組的拼接，注意：豎直拼接的時候，每一列表明的意義要相同，不然牛頭不對馬嘴 # t1 = np.array([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) # t2 = np.array([[12, 13, 14, 15, 16, 17], [18, 19, 20, 21, 22, 23]]) # tt1 = np.vstack((t1, t2)) # 豎直拼接 # tt2 = np.hstack((t1, t2)) # print(tt1) # # [[ 0 1 2 3 4 5] # # [ 6 7 8 9 10 11] # # [12 13 14 15 16 17] # # [18 19 20 21 22 23]] # print(tt2) # # [[ 0 1 2 3 4 5 12 13 14 15 16 17] # # [ 6 7 8 9 10 11 18 19 20 21 22 23]] # # # 數組的行列交換 # print(tt1) # tt1[[1, 2], :] = tt1[[2, 1], :] # 行交換 # print(tt1) # tt1[:, [0, 2]] = tt1[:, [2, 0]] # 列交換 # print(tt1)


""" test5 更多其餘方法 """
# import numpy as np
#
# # 一、得到最大值最小值的位置
# # # np.argmax(t,axis=0)
# # # np.argmin(t,axis=1)
# # 二、建立全爲0的數組： np.zeros((3,4))
# # 三、建立全爲1的數組： np.ones((3,4))
# # 四、建立一個對角線爲1的正方形數組(方陣)：np.eye(3)
#
# # numpy生成隨機數
# # print(np.random.rand(4,2,3))  # 建立均勻分佈的隨機數數組，浮點數，範圍：0-1
# # print(np.random.rand(1,2))
# # print(np.random.randn(2, 2))  # 建立標準正態分佈隨機數，浮點數，平均數0，標準差1
# # print(np.random.randint(1, 10, (3, 4)))  # 從給定上下限範圍選取隨機整數,最後一個參數爲形狀
# # print(np.random.uniform(1, 10, (3, 4)))  # 產生具備均勻分佈的數組，第一個參數爲起始值，第二個參數爲結束值，第三個參數爲形狀
# # print(np.random.normal(1, 1, (3, 4)))  # 從指定正態分佈中隨機抽取樣本，分佈中心是第一個參數（均值），標準差爲第二個值，第三個參數爲形狀
# # print(np.random.seed(10))  # 隨機數種子，可以通過設置相同的種子，使每次生成的隨機數相同

三、pandas總結

# -*- coding: utf-8 -*- # @Time : 18-12-27 下午2:51 # @Author : Felix Wang

""" test1 pandas簡單操做 """
# # pandas 經常使用數據 # # 一、 Series 一維，帶標籤數組 # # 二、 DataFrame 二維，Series容器 # # import numpy as np # import pandas as pd # import string # # # 建立Series方式一： # t = pd.Series(np.arange(10), index=list(string.ascii_uppercase[:10])) # print(t) # print(type(t)) # <class 'pandas.core.series.Series'> # # # 建立Series方式二 # a = {string.ascii_uppercase[i]:i for i in range(10)} # b = pd.Series(a) # print(a) # print(b) # c = pd.Series(a,index=list(string.ascii_uppercase[5:15])) # # 注意：從新給其指定其餘的索引以後，若是可以對應上，就取其值，若是不能就爲nan，由於nan爲float類型，pandas會自動改變dtype # print(c) # # # pandas的索引和切片 # print(c[2:10:2]) # print(c[1]) # print(c[c>4]) # # print(b['A']) # print(b[[2,3,6]]) # # # Series的索引和值 # print(b.index) # print(b.values) # # 注：Series對象的本質上是由兩個數組構成 # # 一個數組構成對象的鍵(index,索引)，一個數組構成對象的值(values)。鍵->值。

""" test2 pandas讀取數據 """
# import pandas as pd # # #pandas讀取csv中的文件 # df = pd.read_csv("./dogNames2.csv") # df2 = pd.read_sql() # df3 = pd.read_excel() # df4 = pd.read_json()

""" test3 DataFrame對象 """
# import string # import pandas as pd # import numpy as np # # t = pd.DataFrame(np.arange(12).reshape((3, 4))) # # 0 1 2 3 # # 0 0 1 2 3 # # 1 4 5 6 7 # # 2 8 9 10 11 # print(t) # # # DataFrame對象既有行索引，又有列索引 # # 行索引，代表不一樣行，橫向索引，叫index，0軸，axis=0 # # 列索引，表名不一樣列，縱向索引，叫columns，1軸，axis=1 # # t2 = pd.DataFrame(np.arange(12).reshape((3, 4)), index=list(string.ascii_uppercase[:3]), # columns=list(string.ascii_uppercase[-4:])) # # W X Y Z # # A 0 1 2 3 # # B 4 5 6 7 # # C 8 9 10 11 # print(t2) # # # DataFrame的基礎屬性 # # df.shape # 行數，列數 # # df.dtypes # 列數據類型 # # df.ndim # 數據維度 # # df.index # 行索引 # # df.columns # 列索引 # # df.values # 對象值，二維ndarray數組 # # # # DataFrame總體狀況查詢 # # df.head(3) # 顯示頭部幾行，默認5行 # # df.tail(3) # 顯示末尾幾行，默認5行 # # df.info() # 相關信息概覽：行數，列數，列索引，列非空值個數，列類型，內存佔用 # # df.describe() # 快速綜合統計結果：計數，均值，標準差，最大值，四分位數，最小值 # # # 排序 # ## 按值排序 # # df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last') # # #### 參數說明 # # axis:{0 or ‘index’, 1 or ‘columns’}, default 0，默認按照索引排序，即縱向排序，若是爲1，則是橫向排序 # # by:str or list of str；若是axis=0，那麼by="列名"；若是axis=1，那麼by="行名"； # # ascending:布爾型，True則升序，能夠是[True,False]，即第一字段升序，第二個降序 # # inplace:布爾型，是否用排序後的數據框替換現有的數據框 # # kind:排序方法，{‘quicksort’, ‘mergesort’, ‘heapsort’}, default ‘quicksort’。彷佛不用太關心 # # na_position : {‘first’, ‘last’}, default ‘last’，默認缺失值排在最後面 # # # # # pandas 之loc和iloc # # # 一、df.loc 經過標籤索引行數據 # # # 二、df.iloc 經過位置獲取行數據 # # print('*' * 30) # # print(t2) # # # # print(t2.loc['A', 'W']) # 0 # # print(t2.loc[['A', 'C'], ['W', 'Z']]) # 選擇間隔的多行多列 # # print(t2.loc['A':'C', ['W', 'Z']]) # 冒號是閉合的能夠渠道冒號後面的值 # # # # print(t2.iloc[1, 2]) # 6 # # print(t2.iloc[1:3, 0:3]) # # # # # 賦值更改數據 # # print('*'*50) # # print(t2) # # t2.loc['A','W']=100 # # print(t2) # # t2.iloc[1:2,0:3]=0 # # print(t2) # # # # # pandas的布爾索引 # # print(t2[t2['W']==0]) # # # # pandas之字符串方法概括 # # cat ： 實現元素級的字符串鏈接操做，可指定分隔符 # # contains ： 返回表示各字符串是否含有指定模式的布爾型數組 # # count ： 模式的出現次數 # # endswith,startswith ： 
至關於對各個元素執行x.endswith()或x.startswith() # # findall ： 計算各字符串的模式列表 # # get ： 獲取個元素的第i個字符 # # join ： 根據指定的分隔符將Series中各元素的字符串鏈接起來 # # len ： 計算各字符串的長度 # # lower,upper ： 轉換大小寫，至關於對各個元素執行x.lower或者x.upper # # match ： 根據指定的正則表達式對各個元素執行re.match # # pad : 在字符串的左邊、右邊或者左右兩邊添加空白 # # center ： 至關於pad(side='both') # # repeat : 重複值。例如，s.str.repeat(3)至關於各個字符串執行x*3 # # replace ： 用指定字符串替換找到的模式 # # slice ： 對Series中的各個字符串進行子串截取 # # split ： 根據分隔符或者正則表達式對字符串進行拆分 # # strip,lstrip,rstrip ： 去除空白，包括換行符 # # # 缺失數據的處理 # # 一、判斷數據是否爲NaN： pd.isnull(df),pd.notnull(df) # # 二、處理方式 # # (1)、刪除NaN所在的行列 t.dropna(axis=0,how='any',inplace=False) # # (2)、填充數據，t.fillna(t.mean()),t.fillna(t.median()),t.fillna(0) # # # 數據合併之join # t3 = pd.DataFrame(np.arange(12).reshape((3, 4)), index=list(string.ascii_uppercase[:3]), # columns=list(string.ascii_uppercase[-4:])) # t4 = pd.DataFrame(np.arange(12).reshape((2, 6)), index=list(string.ascii_uppercase[:2]), # columns=list(string.ascii_uppercase[-10:-4])) # # print(t3) # print(t4) # print(t3.join(t4)) # # W X Y Z Q R S T U V # # A 0 1 2 3 0.0 1.0 2.0 3.0 4.0 5.0 # # B 4 5 6 7 6.0 7.0 8.0 9.0 10.0 11.0 # # C 8 9 10 11 NaN NaN NaN NaN NaN NaN # # # # 數據合併值merge # # merge:按照指定的列把數據按照必定的方式合併到一塊兒 # print('*' * 30) # print(t3.merge(t4, left_on='Z', right_on='V')) # left_on=right_on# 就至關於前面這個和後面這個相等的地方進行合併,默認how爲inner # # W X Y Z Q R S T U V # # 0 8 9 10 11 6 7 8 9 10 11 # # print(t3.merge(t4, left_on='Z', right_on='V', how='outer')) # 交集，經過nan補全 # # W X Y Z Q R S T U V # # 0 0.0 1.0 2.0 3.0 NaN NaN NaN NaN NaN NaN # # 1 4.0 5.0 6.0 7.0 NaN NaN NaN NaN NaN NaN # # 2 8.0 9.0 10.0 11.0 6.0 7.0 8.0 9.0 10.0 11.0 # # 3 NaN NaN NaN NaN 0.0 1.0 2.0 3.0 4.0 5.0 # # print(t3.merge(t4, left_on='Z', right_on='V', how='left')) # 左邊爲準，經過nan補全 # # W X Y Z Q R S T U V # # 0 0 1 2 3 NaN NaN NaN NaN NaN NaN # # 1 4 5 6 7 NaN NaN NaN NaN NaN NaN # # 2 8 9 10 11 6.0 7.0 8.0 9.0 10.0 11.0 # print(t3.merge(t4, left_on='Z', right_on='V', how='right')) # 右邊爲準，經過nan補全 # # W X Y Z Q R S T U V # # 0 
8.0 9.0 10.0 11.0 6 7 8 9 10 11 # # 1 NaN NaN NaN NaN 0 1 2 3 4 5 # # # # 分組和聚合 # # grouped = df.groupby(by="columns_name") # # grouped是一個DataFrameGroupBy對象，是可迭代的 # # grouped中的每個元素是一個元組 # # 元組裏面是（索引(分組的值)，分組以後的DataFrame） # # # DataFrameGroupBy對象有不少通過優化的方法 # # count : 很注重非NAN值的數量 # # sum ： 非NA值的和 # # mean ： 非NA值的平均值 # # median ： 非NA值的算數中位數 # # std,var : 無偏標準差和方差 # # min,max ： 非NA值的最小值和最大值

""" test4 pandas其餘知識點 """
import numpy as np
import pandas as pd

# Generate a daily time series. Both endpoints are included, so the index
# runs from 2018-01-05 through 2018-02-06 (33 timestamps).
date_index = pd.date_range(start='20180105', end='20180206', freq='D')
print(date_index)
# DatetimeIndex(['2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
#                '2018-01-09', '2018-01-10', '2018-01-11', '2018-01-12',
#                '2018-01-13', '2018-01-14', '2018-01-15', '2018-01-16',
#                '2018-01-17', '2018-01-18', '2018-01-19', '2018-01-20',
#                '2018-01-21', '2018-01-22', '2018-01-23', '2018-01-24',
#                '2018-01-25', '2018-01-26', '2018-01-27', '2018-01-28',
#                '2018-01-29', '2018-01-30', '2018-01-31', '2018-02-01',
#                '2018-02-02', '2018-02-03', '2018-02-04', '2018-02-05',
#                '2018-02-06'],
#               dtype='datetime64[ns]', freq='D')

# More abbreviations accepted by the ``freq`` argument:
#   D    calendar day
#   B    business day
#   H    every hour
#   T    every minute
#   S    every second
#   L    every millisecond
#   U    every microsecond
#   M    last calendar day of each month
#   BM   last business day of each month
#   MS   first calendar day of each month
#   BMS  first business day of each month
# NOTE(review): newer pandas releases appear to deprecate H/T/S/L/U/M/BM in
# favour of h/min/s/ms/us/ME/BME -- confirm against the installed version.


# index = pd.date_range(start='20180102', periods=10) # periods表示週期 # # 在DataFrame中使用時間序列 # df = pd.DataFrame(index, range(10), columns=['w']) # # # t4 = pd.DataFrame(np.arange(12).reshape((2, 6)), index=list(string.ascii_uppercase[:2]), # # columns=list(string.ascii_uppercase[-10:-4])) # print(df) # # # 把時間戳轉換爲DataFrame格式 # # df["timeStamp"] = pd.to_datetime(df["timeStamp"],format="") # # print() # # # # format參數大部分狀況下能夠不用寫，可是對於pandas沒法格式化的時間字符串，咱們可使用該參數，好比包含中文 # # # # pandas重採樣 # # 重採樣： 指的是將時間序列從一個頻率轉化爲另外一個頻率進行處理的過程，將高頻率數據轉化爲低頻率數據爲降採樣，低頻率轉化爲高頻率爲升採樣 # tt = pd.DataFrame(np.random.uniform(10,50,(100,1)),index=pd.date_range(start='20180508',periods=100)) # print(tt) # print(tt.resample('M').mean()) # print(tt.resample('10D').count())


""" test5 北京pm2.5處理實例"""
# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt

file_path = "BeijingPM20100101_20151231.csv"
df = pd.read_csv(file_path)

# Combine the separate year/month/day/hour columns into a single hourly
# PeriodIndex so pandas can treat the rows as a time series.
# NOTE(review): newer pandas releases appear to deprecate passing field
# keywords to the PeriodIndex constructor (PeriodIndex.from_fields) and the
# upper-case "H" alias -- confirm against the installed version.
period = pd.PeriodIndex(year=df["year"], month=df["month"], day=df["day"], hour=df["hour"], freq="H")
df["datetime"] = period
# print(df.head(10))

# Make the combined datetime column the row index; resample() bins on the index.
df.set_index("datetime", inplace=True)

# Down-sample: replace the rows by the mean of each 7-day bin.
df = df.resample("7D").mean()
print(df.head())

# Select the two series to compare.
# The original note here said missing data is deleted, but no dropna() is
# applied: NaN values are passed through to plot() unchanged, which keeps the
# two series the same length and aligned on the shared x positions.
# print(df["PM_US Post"])
data = df["PM_US Post"]
data_china = df["PM_Nongzhanguan"]
print(data_china.head(100))

# Plot. The x positions are plain integers; the formatted dates are used only
# as tick labels.
_x = [i.strftime("%Y%m%d") for i in data.index]
_x_china = [i.strftime("%Y%m%d") for i in data_china.index]
# Sanity check that both series have the same number of bins (the original
# printed len(_x_china) twice, which could never reveal a mismatch).
print(len(_x), len(_x_china))
_y = data.values
_y_china = data_china.values

plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y, label="US_POST", alpha=0.7)
plt.plot(range(len(_x_china)), _y_china, label="CN_POST", alpha=0.7)
# Label every 10th bin to keep the axis readable.
plt.xticks(range(0, len(_x_china), 10), _x_china[::10], rotation=45)
plt.legend(loc="best")
plt.show()

最後：

我的筆記以及資料下載 -》密碼：8yra