# Demonstration of pandas descriptive statistics on two small DataFrames and
# one Series. Running the script prints every result; the values expected for
# the data defined here are noted in the comments next to each call.
import pandas as pd
import numpy as np

# key1 and key2 contain NaN and print as float64; key3 mixes ints and strings
# and prints as object dtype.
df = pd.DataFrame({
    'key1': [4, 5, 3, np.nan, 2],
    'key2': [1, 2, np.nan, 4, 5],
    'key3': [1, 2, 3, 'j', 'k'],
}, index=['a', 'b', 'c', 'd', 'e'])
print(df)
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)
print('-------')
# Expected output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# float64 float64 object

# Mean of every numeric column. NaN is skipped by default.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently, so reducing the mixed int/str key3 column
# would raise TypeError.
print(df.mean(numeric_only=True))
# Expected: key1 3.5, key2 3.0 (dtype float64)

# skipna defaults to True; with skipna=False any column containing NaN
# yields NaN.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
# Expected: key1 NaN, key2 NaN (dtype float64)

# Mean of a single column.
print('計算單一列的均值', df['key2'].mean())
# Expected value: 3.0

df2 = pd.DataFrame({
    'key1': [1, 3, 5],
    'key2': [2, 4, 6],
    'key3': [3, 5, 7],
}, index=['a', 'b', 'c'])
# Row-wise mean (axis=1), stored as a new column.
df2['mean'] = df2.mean(axis=1)
print(df2)
# Expected output:
#    key1  key2  key3  mean
# a     1     2     3   2.0
# b     3     4     5   4.0
# c     5     6     7   6.0

# count() reports the number of non-NaN values per column.
print(df)
print('-' * 6)
print(df.count())
# Expected: key1 4, key2 4, key3 5 (dtype int64)

# Column-wise summary statistics. Every DataFrame-wide reduction restricts
# itself to the numeric columns (see the note on numeric_only above).
print(df)
print('-' * 6)
print('df的最小值', df.min(numeric_only=True))
print('df的最大值', df.max(numeric_only=True))
print('df的key2列的最大值', df['key2'].max())
print('統計df的分位數,參數q肯定位置', df.quantile(q=0.75, numeric_only=True))
print('對df求和', df.sum(numeric_only=True))
print('求df的中位數,median(),50%分位數', df.median(numeric_only=True))
print('求df的標準差,std()', df.std(numeric_only=True))
print('求df的方差,var()', df.var(numeric_only=True))
print('求skew樣本的偏度,skew()', df.skew(numeric_only=True))
print('求kurt樣本的峯度,kurt()', df.kurt(numeric_only=True))
# Cumulative operations on one column: the NaN row stays NaN in the result
# but does not reset the running value.
print('df累計求和,cumsum()', df['key2'].cumsum())
print('df累計求積,cumprod()', df['key2'].cumprod())
print('求df的累計最大值,cummax()', df['key2'].cummax())
print('求df的累計最小值,cummin()', df['key2'].cummin())
# Expected values (key1, key2):
#   min            2.0       1.0
#   max            5.0       5.0
#   key2 max                 5.0
#   quantile 0.75  4.25      4.25
#   sum            14.0      12.0
#   median         3.5       3.0
#   std            1.290994  1.825742
#   var            1.666667  3.333333
#   skew           0.0       0.0
#   kurt           -1.2      -3.3
# Expected cumulative results for key2 (rows a..e):
#   cumsum   1.0  3.0  NaN  7.0  12.0
#   cumprod  1.0  2.0  NaN  8.0  40.0
#   cummax   1.0  2.0  NaN  4.0  5.0
#   cummin   1.0  1.0  NaN  1.0  1.0

# unique() returns the distinct values in order of first appearance.
s = pd.Series(list('kjdhsakjdhjfh'))
sq = s.unique()
print(s)
print(sq)
# The original run reported numpy.ndarray here; the concrete array type may
# differ on pandas versions that infer a dedicated string dtype.
print('sq的類型:', type(sq))
# Wrapping the result in a Series allows sorting; the index keeps each
# value's position in the unique array.
print('對sq進行從新排序:', pd.Series(sq).sort_values())
# Expected unique values: ['k' 'j' 'd' 'h' 's' 'a' 'f']
# Expected sorted Series (index value): 5 a, 2 d, 6 f, 3 h, 1 j, 0 k, 4 s

# Series.value_counts() tallies how often each distinct value occurs in one
# column. (DataFrame.value_counts, available since pandas 1.1, counts unique
# rows instead.)
print(df['key2'].value_counts())

# isin() tests element-wise whether each cell's value is in the given list.
print(df)
df_isin = df.isin([1, 3])
print(df_isin)
# Expected output of df_isin:
#     key1   key2   key3
# a  False   True   True
# b  False  False  False
# c   True  False   True
# d  False  False  False
# e  False  False  False