pandas 基礎

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

建立一個Series ,同時讓pandas自動生成索引列

s = pd.Series([1,3,5,np.nan,6,8])
# 查看s
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

建立一個DataFrame數據框

### 建立一個DataFrame,可以傳入一個numpy array,並自行指定行索引以及列標籤
dates = pd.date_range('2018-11-01',periods=7)
#### 比如說生成一個以2018-11-01爲起始日期,由7個日期組成的時間序列,數據的類型爲datetime64[ns]
dates
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
               '2018-11-05', '2018-11-06', '2018-11-07'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(7,4),index= dates,columns=list('ABCD'))
df
# 產生隨機正態分佈的數據,7行4列,分別對應的index的長度以及column的長度
A B C D
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
2018-11-06 -0.691073 0.933016 1.857647 0.775526
2018-11-07 0.467075 0.362407 2.319375 -0.721314
### 同時也可以使用dict的形式建立DataFrame
df2 = pd.DataFrame({"A":1,
                   "B":"20181101",
                   'C':np.array([3]*4,dtype='int32'),
                   'D':pd.Categorical(['test','train','test','train']),
                   "E":1.5},
                  )
df2
A B C D E
0 1 20181101 3 test 1.5
1 1 20181101 3 train 1.5
2 1 20181101 3 test 1.5
3 1 20181101 3 train 1.5
df2.dtypes
### 查看數據框中的數據類型,常見的數據類型還有時間類型以及float類型
A       int64
B      object
C       int32
D    category
E     float64
dtype: object

查看數據

# 比如說看前5行
df.head()
A B C D
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
# 後4行
df.tail(4)
A B C D
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
2018-11-06 -0.691073 0.933016 1.857647 0.775526
2018-11-07 0.467075 0.362407 2.319375 -0.721314
# 查看DataFrame的索引
df.index
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
               '2018-11-05', '2018-11-06', '2018-11-07'],
              dtype='datetime64[ns]', freq='D')
# 查看DataFrame的列索引
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# 查看DataFrame的數據,將DataFrame轉化爲numpy array 的數據形式
df.values
array([[ 2.19709382,  0.90891281, -0.64802911, -1.32554721],
       [ 0.35466158, -1.22424591, -0.50120854, -1.49017025],
       [-0.24583358, -1.04959585,  2.36622453,  0.6373212 ],
       [-0.6899396 ,  0.47128154, -1.41740143,  0.26890482],
       [-0.54804068, -0.84193368,  0.57312781, -1.05517487],
       [-0.6910726 ,  0.93301611,  1.85764662,  0.77552552],
       [ 0.46707509,  0.36240665,  2.31937488, -0.721314  ]])

數據的簡單統計

# 可以使用describe函數對DataFrame中的數值型數據進行統計
df.describe()
A B C D
count 7.000000 7.000000 7.000000 7.000000
mean 0.120563 -0.062880 0.649962 -0.415779
std 1.031487 0.942664 1.553537 0.955789
min -0.691073 -1.224246 -1.417401 -1.490170
25% -0.618990 -0.945765 -0.574619 -1.190361
50% -0.245834 0.362407 0.573128 -0.721314
75% 0.410868 0.690097 2.088511 0.453113
max 2.197094 0.933016 2.366225 0.775526
df2.describe()
### 對於其他(非數值)數據類型的列,describe函數會自動過濾掉
A C E
count 4.0 4.0 4.0
mean 1.0 3.0 1.5
std 0.0 0.0 0.0
min 1.0 3.0 1.5
25% 1.0 3.0 1.5
50% 1.0 3.0 1.5
75% 1.0 3.0 1.5
max 1.0 3.0 1.5
### DataFrame 的轉置,將列索引與行索引進行調換,行數據與列數據進行調換
df.T
2018-11-01 00:00:00 2018-11-02 00:00:00 2018-11-03 00:00:00 2018-11-04 00:00:00 2018-11-05 00:00:00 2018-11-06 00:00:00 2018-11-07 00:00:00
A 2.197094 0.354662 -0.245834 -0.689940 -0.548041 -0.691073 0.467075
B 0.908913 -1.224246 -1.049596 0.471282 -0.841934 0.933016 0.362407
C -0.648029 -0.501209 2.366225 -1.417401 0.573128 1.857647 2.319375
D -1.325547 -1.490170 0.637321 0.268905 -1.055175 0.775526 -0.721314
df
A B C D
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
2018-11-06 -0.691073 0.933016 1.857647 0.775526
2018-11-07 0.467075 0.362407 2.319375 -0.721314

數據的排序

df.sort_index(ascending=False)
### 降序:按照行索引(這裏是日期索引)由大到小進行排序
A B C D
2018-11-07 0.467075 0.362407 2.319375 -0.721314
2018-11-06 -0.691073 0.933016 1.857647 0.775526
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
print(df.sort_values(by=['B','A']))
#  默認是升序,可以選擇多個鍵進行排序:先按B排序,再按A排序;如果B中的數據相同,則按照A中的大小進行排序
df.sort_values(by='B')
A         B         C         D
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-07  0.467075  0.362407  2.319375 -0.721314
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-06 -0.691073  0.933016  1.857647  0.775526
A B C D
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
2018-11-07 0.467075 0.362407 2.319375 -0.721314
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-06 -0.691073 0.933016 1.857647 0.775526

選擇數據(類似於數據庫中的sql語句)

df['A']
# 取出單獨的一列數據,等價於df.A
2018-11-01    2.197094
2018-11-02    0.354662
2018-11-03   -0.245834
2018-11-04   -0.689940
2018-11-05   -0.548041
2018-11-06   -0.691073
2018-11-07    0.467075
Freq: D, Name: A, dtype: float64
# 通過[]進行行選擇切片
df[0:3]
A B C D
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
# 同時對於時間索引而言,可以直接使用日期字符串進行切片(首尾兩端都包含),比如
df['2018-11-01':'2018-11-04']
A B C D
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-04 -0.689940 0.471282 -1.417401 0.268905

另外可以使用標籤來選擇

df.loc['2018-11-01']
A    2.197094
B    0.908913
C   -0.648029
D   -1.325547
Name: 2018-11-01 00:00:00, dtype: float64
#### 通過標籤在多個軸上進行選擇
df.loc[:,["A","B"]] # 等價於df[["A","B"]]
A B
2018-11-01 2.197094 0.908913
2018-11-02 0.354662 -1.224246
2018-11-03 -0.245834 -1.049596
2018-11-04 -0.689940 0.471282
2018-11-05 -0.548041 -0.841934
2018-11-06 -0.691073 0.933016
2018-11-07 0.467075 0.362407
df.loc["2018-11-01":"2018-11-03",["A","B"]]
A B
2018-11-01 2.197094 0.908913
2018-11-02 0.354662 -1.224246
2018-11-03 -0.245834 -1.049596
#### 獲取一個標量數據
df.loc['2018-11-01','A']
2.1970938156943904

通過位置獲取數據

df.iloc[3]  # 獲取第四行的數據
A   -0.689940
B    0.471282
C   -1.417401
D    0.268905
Name: 2018-11-04 00:00:00, dtype: float64
df.iloc[1:3,1:4]  #  與numpy中的ndarray切片類似
B C D
2018-11-02 -1.224246 -0.501209 -1.490170
2018-11-03 -1.049596 2.366225 0.637321
# 可以選取不連續的行或者列進行取值
df.iloc[[1,3],[1,3]]
B D
2018-11-02 -1.224246 -1.490170
2018-11-04 0.471282 0.268905
#  對行進行切片處理
df.iloc[1:3,:]
A B C D
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
# 對列進行切片
df.iloc[:,1:4]
B C D
2018-11-01 0.908913 -0.648029 -1.325547
2018-11-02 -1.224246 -0.501209 -1.490170
2018-11-03 -1.049596 2.366225 0.637321
2018-11-04 0.471282 -1.417401 0.268905
2018-11-05 -0.841934 0.573128 -1.055175
2018-11-06 0.933016 1.857647 0.775526
2018-11-07 0.362407 2.319375 -0.721314
# 獲取特定的值
df.iloc[1,3]
-1.4901702546027098

布爾值索引

# 使用單列的數據作爲條件進行篩選
df[df.A>0]
A B C D
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-07 0.467075 0.362407 2.319375 -0.721314
# 很少用到,很少使用這種大範圍的條件進行篩選(不滿足條件的位置會變成NaN)
df[df>0]
A B C D
2018-11-01 2.197094 0.908913 NaN NaN
2018-11-02 0.354662 NaN NaN NaN
2018-11-03 NaN NaN 2.366225 0.637321
2018-11-04 NaN 0.471282 NaN 0.268905
2018-11-05 NaN NaN 0.573128 NaN
2018-11-06 NaN 0.933016 1.857647 0.775526
2018-11-07 0.467075 0.362407 2.319375 NaN
# 使用isin()方法過濾
df2.head()
A B C D E
0 1 20181101 3 test 1.5
1 1 20181101 3 train 1.5
2 1 20181101 3 test 1.5
3 1 20181101 3 train 1.5
df2[df2['D'].isin(['test'])]
A B C D E
0 1 20181101 3 test 1.5
2 1 20181101 3 test 1.5

設定數值(類似於sql update 或者add)

  • 設定一個新的列
df['E'] = [1,2,3,4,5,6,7]
df
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 -1.325547 1
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170 2
2018-11-03 -0.245834 -1.049596 2.366225 0.637321 3
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 4
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 5
2018-11-06 -0.691073 0.933016 1.857647 0.775526 6
2018-11-07 0.467075 0.362407 2.319375 -0.721314 7
  • 通過標籤設定新的值
df.loc['2018-11-01','E']= 10  # 第一行,E列的數據修改爲10
df
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 -1.325547 10
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170 2
2018-11-03 -0.245834 -1.049596 2.366225 0.637321 3
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 4
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 5
2018-11-06 -0.691073 0.933016 1.857647 0.775526 6
2018-11-07 0.467075 0.362407 2.319375 -0.721314 7
df.iloc[1,4]=5000  # 第二行第五列數據修改爲5000
df
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 -1.325547 10
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170 5000
2018-11-03 -0.245834 -1.049596 2.366225 0.637321 3
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 4
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 5
2018-11-06 -0.691073 0.933016 1.857647 0.775526 6
2018-11-07 0.467075 0.362407 2.319375 -0.721314 7
df3 =df.copy()
df3[df3<0]= -df3
df3  # 都變成非負數
A B C D E
2018-11-01 2.197094 0.908913 0.648029 1.325547 10
2018-11-02 0.354662 1.224246 0.501209 1.490170 5000
2018-11-03 0.245834 1.049596 2.366225 0.637321 3
2018-11-04 0.689940 0.471282 1.417401 0.268905 4
2018-11-05 0.548041 0.841934 0.573128 1.055175 5
2018-11-06 0.691073 0.933016 1.857647 0.775526 6
2018-11-07 0.467075 0.362407 2.319375 0.721314 7

缺失值處理

df
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 -1.325547 10
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170 5000
2018-11-03 -0.245834 -1.049596 2.366225 0.637321 3
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 4
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 5
2018-11-06 -0.691073 0.933016 1.857647 0.775526 6
2018-11-07 0.467075 0.362407 2.319375 -0.721314 7
df['E']=[1,np.nan,2,np.nan,4,np.nan,6]
df.loc['2018-11-01':'2018-11-03','D']=np.nan
df
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 NaN 1.0
2018-11-02 0.354662 -1.224246 -0.501209 NaN NaN
2018-11-03 -0.245834 -1.049596 2.366225 NaN 2.0
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 NaN
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 4.0
2018-11-06 -0.691073 0.933016 1.857647 0.775526 NaN
2018-11-07 0.467075 0.362407 2.319375 -0.721314 6.0
  • 去掉缺失值的行
df4 = df.copy()
df4.dropna(how='any')
A B C D E
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 4.0
2018-11-07 0.467075 0.362407 2.319375 -0.721314 6.0
df4.dropna(how='all')
# """DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)""" 
# axis 軸:0或者'index'表示刪除含缺失值的行,1或者'columns'表示刪除含缺失值的列
# how 方式:'any'表示只要有一個缺失值就刪除,'all'表示全部爲缺失值才刪除
# thresh 閾值:至少要有thresh個非缺失值的行(或列)才會被保留
# subset 只在指定的這些字段(標籤)中檢查缺失值
# inplace 是否直接在原數據框上進行修改
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 NaN 1.0
2018-11-02 0.354662 -1.224246 -0.501209 NaN NaN
2018-11-03 -0.245834 -1.049596 2.366225 NaN 2.0
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 NaN
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 4.0
2018-11-06 -0.691073 0.933016 1.857647 0.775526 NaN
2018-11-07 0.467075 0.362407 2.319375 -0.721314 6.0
  • 對缺失值進行填充
df4.fillna(1000)
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 1000.000000 1.0
2018-11-02 0.354662 -1.224246 -0.501209 1000.000000 1000.0
2018-11-03 -0.245834 -1.049596 2.366225 1000.000000 2.0
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 1000.0
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 4.0
2018-11-06 -0.691073 0.933016 1.857647 0.775526 1000.0
2018-11-07 0.467075 0.362407 2.319375 -0.721314 6.0
  • 判斷數據是否爲缺失值,返回布爾值組成的數據框
pd.isnull(df4)
A B C D E
2018-11-01 False False False True False
2018-11-02 False False False True True
2018-11-03 False False False True False
2018-11-04 False False False False True
2018-11-05 False False False False False
2018-11-06 False False False False True
2018-11-07 False False False False False

數據操作

# 統計的工作一般情況下都不包含缺失值
df4.mean() 
#  默認是對列進行求平均,沿着行方向也就是axis=0
A    0.120563
B   -0.062880
C    0.649962
D   -0.183015
E    3.250000
dtype: float64
df4.mean(axis=1)
#  沿着列方向求每行的平均
2018-11-01    0.864494
2018-11-02   -0.456931
2018-11-03    0.767699
2018-11-04   -0.341789
2018-11-05    0.425596
2018-11-06    0.718779
2018-11-07    1.685509
Freq: D, dtype: float64
# 對於擁有不同維度、需要對齊的對象進行操作時,Pandas會自動地沿着指定的維度進行廣播:
s = pd.Series([1,3,4,np.nan,6,7,8],index=dates)
s
2018-11-01    1.0
2018-11-02    3.0
2018-11-03    4.0
2018-11-04    NaN
2018-11-05    6.0
2018-11-06    7.0
2018-11-07    8.0
Freq: D, dtype: float64
df4.sub(s,axis='index')
A B C D E
2018-11-01 1.197094 -0.091087 -1.648029 NaN 0.0
2018-11-02 -2.645338 -4.224246 -3.501209 NaN NaN
2018-11-03 -4.245834 -5.049596 -1.633775 NaN -2.0
2018-11-04 NaN NaN NaN NaN NaN
2018-11-05 -6.548041 -6.841934 -5.426872 -7.055175 -2.0
2018-11-06 -7.691073 -6.066984 -5.142353 -6.224474 NaN
2018-11-07 -7.532925 -7.637593 -5.680625 -8.721314 -2.0
df4
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 NaN 1.0
2018-11-02 0.354662 -1.224246 -0.501209 NaN NaN
2018-11-03 -0.245834 -1.049596 2.366225 NaN 2.0
2018-11-04 -0.689940 0.471282 -1.417401 0.268905 NaN
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175 4.0
2018-11-06 -0.691073 0.933016 1.857647 0.775526 NaN
2018-11-07 0.467075 0.362407 2.319375 -0.721314 6.0
df4.apply(np.cumsum)
A B C D E
2018-11-01 2.197094 0.908913 -0.648029 NaN 1.0
2018-11-02 2.551755 -0.315333 -1.149238 NaN NaN
2018-11-03 2.305922 -1.364929 1.216987 NaN 3.0
2018-11-04 1.615982 -0.893647 -0.200415 0.268905 NaN
2018-11-05 1.067942 -1.735581 0.372713 -0.786270 7.0
2018-11-06 0.376869 -0.802565 2.230360 -0.010745 NaN
2018-11-07 0.843944 -0.440158 4.549735 -0.732059 13.0
df4.apply(lambda x: x.max()-x.min())
A    2.888166
B    2.157262
C    3.783626
D    1.830700
E    5.000000
dtype: float64

統計個數與離散化

s = pd.Series(np.random.randint(0,7,size=15))
s
0     1
1     6
2     3
3     1
4     1
5     0
6     4
7     1
8     3
9     4
10    6
11    1
12    4
13    3
14    5
dtype: int32
s.value_counts()
# 統計元素的個數,並按照元素統計量進行排序,未出現的元素不會顯示出來
1    5
4    3
3    3
6    2
5    1
0    1
dtype: int64
s.reindex(range(0,7))
# 注意:這裏只是按索引標籤0~6取出s中對應的元素,並不是個數統計;
# 若要按照固定的順序輸出元素的個數統計,應使用 s.value_counts().reindex(range(0,7)),未出現的元素會顯示爲NaN
0    1
1    6
2    3
3    1
4    1
5    0
6    4
dtype: int32
s.mode()
#  衆數
0    1
dtype: int32
  • 離散化
# 連續值轉化爲離散值,可以使用cut函數(bins based on values,按數值區間分箱)
# 或qcut函數(bins based on sample quantiles,按樣本分位數分箱)進行操作
arr = np.random.randint(0,20,size=15)  # 0~19之間均勻分佈的隨機整數(不是正態分佈)
arr
array([ 3, 14, 10,  2,  2,  0, 17, 13,  7,  0, 15, 14,  4, 19,  9])
factor = pd.cut(arr,3)
factor
[(-0.019, 6.333], (12.667, 19.0], (6.333, 12.667], (-0.019, 6.333], (-0.019, 6.333], ..., (12.667, 19.0], (12.667, 19.0], (-0.019, 6.333], (12.667, 19.0], (6.333, 12.667]]
Length: 15
Categories (3, interval[float64]): [(-0.019, 6.333] < (6.333, 12.667] < (12.667, 19.0]]
pd.value_counts(factor)
(12.667, 19.0]     6
(-0.019, 6.333]    6
(6.333, 12.667]    3
dtype: int64
factor1 = pd.cut(arr,[-1,5,10,15,20])
pd.value_counts(factor1)
(-1, 5]     6
(10, 15]    4
(5, 10]     3
(15, 20]    2
dtype: int64
factor2 = pd.qcut(arr,[0,0.25,0.5,0.75,1])
pd.value_counts(factor2)
(9.0, 14.0]      4
(2.5, 9.0]       4
(-0.001, 2.5]    4
(14.0, 19.0]     3
dtype: int64
相關文章
相關標籤/搜索