import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Create a Series, letting pandas generate the default integer index
s = pd.Series([1,3,5,np.nan,6,8])
# inspect s
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
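An explicit index can be supplied instead of the default one; a minimal sketch (the labels here are chosen only for illustration):
s2 = pd.Series([1, 3, 5], index=['a', 'b', 'c'])  # string labels instead of 0..n-1
s2['b']  # label-based access returns 3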
Creating a DataFrame
### Create a DataFrame by passing a NumPy array, with a custom index and column labels
dates = pd.date_range('2018-11-01',periods=7)
#### For example, generate a time series of 7 dates starting at 2018-11-01; its dtype is datetime64[ns]
dates
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
'2018-11-05', '2018-11-06', '2018-11-07'],
dtype='datetime64[ns]', freq='D')
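date_range also accepts an explicit end date or a different frequency; a small sketch (the parameters are illustrative only):
pd.date_range('2018-11-01', '2018-11-07')         # the same 7 daily timestamps, given start and end
pd.date_range('2018-11-01', periods=3, freq='M')  # month-end frequency instead of the default 'D'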
df = pd.DataFrame(np.random.randn(7,4),index= dates,columns=list('ABCD'))
df
# standard-normal random data, 7 rows by 4 columns, matching the lengths of the index and the columns
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375 -0.721314
### A DataFrame can also be created from a dict
df2 = pd.DataFrame({"A":1,
"B":"20181101",
'C':np.array([3]*4,dtype='int32'),
'D':pd.Categorical(['test','train','test','train']),
"E":1.5},
)
df2
   A         B  C      D    E
0  1  20181101  3   test  1.5
1  1  20181101  3  train  1.5
2  1  20181101  3   test  1.5
3  1  20181101  3  train  1.5
df2.dtypes
### Inspect the dtypes in the DataFrame; other common dtypes include datetime and float
A int64
B object
C int32
D category
E float64
dtype: object
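Dtypes can be converted after construction; for instance, the string column B could be parsed as datetimes (a sketch, shown without assigning back to df2):
pd.to_datetime(df2['B'])   # parses '20181101' into Timestamp('2018-11-01')
df2['C'].astype('int64')   # cast the int32 column to int64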
Viewing data
# e.g. the first 5 rows
df.head()
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
# the last 4 rows
df.tail(4)
                   A         B         C         D
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375 -0.721314
# the row index of the DataFrame
df.index
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
'2018-11-05', '2018-11-06', '2018-11-07'],
dtype='datetime64[ns]', freq='D')
# the column index of the DataFrame
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# the underlying data of the DataFrame, converted to a numpy array
df.values
array([[ 2.19709382, 0.90891281, -0.64802911, -1.32554721],
[ 0.35466158, -1.22424591, -0.50120854, -1.49017025],
[-0.24583358, -1.04959585, 2.36622453, 0.6373212 ],
[-0.6899396 , 0.47128154, -1.41740143, 0.26890482],
[-0.54804068, -0.84193368, 0.57312781, -1.05517487],
[-0.6910726 , 0.93301611, 1.85764662, 0.77552552],
[ 0.46707509, 0.36240665, 2.31937488, -0.721314 ]])
Basic statistics
# describe() computes summary statistics for the numeric columns of a DataFrame
df.describe()
              A         B         C         D
count  7.000000  7.000000  7.000000  7.000000
mean   0.120563 -0.062880  0.649962 -0.415779
std    1.031487  0.942664  1.553537  0.955789
min   -0.691073 -1.224246 -1.417401 -1.490170
25%   -0.618990 -0.945765 -0.574619 -1.190361
50%   -0.245834  0.362407  0.573128 -0.721314
75%    0.410868  0.690097  2.088511  0.453113
max    2.197094  0.933016  2.366225  0.775526
df2.describe()
### Columns of other (non-numeric) dtypes are automatically excluded by describe()
         A    C    E
count  4.0  4.0  4.0
mean   1.0  3.0  1.5
std    0.0  0.0  0.0
min    1.0  3.0  1.5
25%    1.0  3.0  1.5
50%    1.0  3.0  1.5
75%    1.0  3.0  1.5
max    1.0  3.0  1.5
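Passing include lets describe() summarize the non-numeric columns as well; a sketch:
df2.describe(include='all')        # numeric stats plus count/unique/top/freq for object and category columns
df2.describe(include=['object'])   # restrict the summary to object columns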
### Transpose the DataFrame, swapping the row and column indices (rows become columns and vice versa)
df.T
   2018-11-01  2018-11-02  2018-11-03  2018-11-04  2018-11-05  2018-11-06  2018-11-07
A    2.197094    0.354662   -0.245834   -0.689940   -0.548041   -0.691073    0.467075
B    0.908913   -1.224246   -1.049596    0.471282   -0.841934    0.933016    0.362407
C   -0.648029   -0.501209    2.366225   -1.417401    0.573128    1.857647    2.319375
D   -1.325547   -1.490170    0.637321    0.268905   -1.055175    0.775526   -0.721314
df
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375 -0.721314
Sorting data
df.sort_index(ascending=False)
### Sort by the row index in descending order
                   A         B         C         D
2018-11-07  0.467075  0.362407  2.319375 -0.721314
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
print(df.sort_values(by=['B','A']))
# Ascending by default; several keys can be given: sort by B first, then break ties in B by the values of A
df.sort_values(by='B')
A B C D
2018-11-02 0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596 2.366225 0.637321
2018-11-05 -0.548041 -0.841934 0.573128 -1.055175
2018-11-07 0.467075 0.362407 2.319375 -0.721314
2018-11-04 -0.689940 0.471282 -1.417401 0.268905
2018-11-01 2.197094 0.908913 -0.648029 -1.325547
2018-11-06 -0.691073 0.933016 1.857647 0.775526
                   A         B         C         D
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-07  0.467075  0.362407  2.319375 -0.721314
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-06 -0.691073  0.933016  1.857647  0.775526
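When sorting on several keys, the direction can be set per column; a minimal sketch:
df.sort_values(by=['B','A'], ascending=[False, True])  # B descending, ties broken by A ascending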
Selecting data (similar to SQL queries)
df['A']
# select a single column, equivalent to df.A
2018-11-01 2.197094
2018-11-02 0.354662
2018-11-03 -0.245834
2018-11-04 -0.689940
2018-11-05 -0.548041
2018-11-06 -0.691073
2018-11-07 0.467075
Freq: D, Name: A, dtype: float64
# slice rows with []
df[0:3]
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
# with a datetime index, rows can also be sliced directly by date labels, e.g.
df['2018-11-01':'2018-11-04']
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
Selection by label
df.loc['2018-11-01']
A 2.197094
B 0.908913
C -0.648029
D -1.325547
Name: 2018-11-01 00:00:00, dtype: float64
#### Select on multiple axes by label
df.loc[:,["A","B"]] # 等價於df[["A","B"]]
                   A         B
2018-11-01  2.197094  0.908913
2018-11-02  0.354662 -1.224246
2018-11-03 -0.245834 -1.049596
2018-11-04 -0.689940  0.471282
2018-11-05 -0.548041 -0.841934
2018-11-06 -0.691073  0.933016
2018-11-07  0.467075  0.362407
df.loc["2018-11-01":"2018-11-03",["A","B"]]
                   A         B
2018-11-01  2.197094  0.908913
2018-11-02  0.354662 -1.224246
2018-11-03 -0.245834 -1.049596
#### Get a scalar value
df.loc['2018-11-01','A']
2.1970938156943904
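For a single label-based scalar, the .at accessor is equivalent but faster; a sketch:
df.at[dates[0], 'A']  # same value as df.loc['2018-11-01','A']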
Selection by position
df.iloc[3] # the fourth row
A -0.689940
B 0.471282
C -1.417401
D 0.268905
Name: 2018-11-04 00:00:00, dtype: float64
df.iloc[1:3,1:4] # similar to slicing a numpy ndarray
                   B         C         D
2018-11-02 -1.224246 -0.501209 -1.490170
2018-11-03 -1.049596  2.366225  0.637321
# non-contiguous rows or columns can be selected by position
df.iloc[[1,3],[1,3]]
                   B         D
2018-11-02 -1.224246 -1.490170
2018-11-04  0.471282  0.268905
# slice rows
df.iloc[1:3,:]
                   A         B         C         D
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
# slice columns
df.iloc[:,1:4]
                   B         C         D
2018-11-01  0.908913 -0.648029 -1.325547
2018-11-02 -1.224246 -0.501209 -1.490170
2018-11-03 -1.049596  2.366225  0.637321
2018-11-04  0.471282 -1.417401  0.268905
2018-11-05 -0.841934  0.573128 -1.055175
2018-11-06  0.933016  1.857647  0.775526
2018-11-07  0.362407  2.319375 -0.721314
# get a single value by position
df.iloc[1,3]
-1.4901702546027098
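The position-based counterpart for scalar access is .iat; a sketch:
df.iat[1, 3]  # same value as df.iloc[1,3]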
Boolean indexing
# filter rows using a single column as the condition
df[df.A>0]
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-07  0.467075  0.362407  2.319375 -0.721314
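Several column conditions can be combined with & (and) or | (or), each wrapped in parentheses; a sketch:
df[(df.A > 0) & (df.D < 0)]  # rows where A is positive and D is negative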
# rarely used: a condition applied to the whole DataFrame; cells that fail the condition become NaN
df[df>0]
                   A         B         C         D
2018-11-01  2.197094  0.908913       NaN       NaN
2018-11-02  0.354662       NaN       NaN       NaN
2018-11-03       NaN       NaN  2.366225  0.637321
2018-11-04       NaN  0.471282       NaN  0.268905
2018-11-05       NaN       NaN  0.573128       NaN
2018-11-06       NaN  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375       NaN
# filter with the isin() method
df2.head()
   A         B  C      D    E
0  1  20181101  3   test  1.5
1  1  20181101  3  train  1.5
2  1  20181101  3   test  1.5
3  1  20181101  3  train  1.5
df2[df2['D'].isin(['test'])]
   A         B  C     D    E
0  1  20181101  3  test  1.5
2  1  20181101  3  test  1.5
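The filter can be inverted with ~; a sketch that keeps only the rows whose D is not 'test':
df2[~df2['D'].isin(['test'])]  # the 'train' rows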
Setting values (similar to SQL UPDATE or INSERT)
df['E'] = [1,2,3,4,5,6,7]
df
                   A         B         C         D  E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547  1
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170  2
2018-11-03 -0.245834 -1.049596  2.366225  0.637321  3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  5
2018-11-06 -0.691073  0.933016  1.857647  0.775526  6
2018-11-07  0.467075  0.362407  2.319375 -0.721314  7
df.loc['2018-11-01','E'] = 10  # set column E in the first row to 10
df
                   A         B         C         D   E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547  10
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170   2
2018-11-03 -0.245834 -1.049596  2.366225  0.637321   3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905   4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175   5
2018-11-06 -0.691073  0.933016  1.857647  0.775526   6
2018-11-07  0.467075  0.362407  2.319375 -0.721314   7
df.iloc[1,4] = 5000  # set the value in the second row, fifth column to 5000
df
                   A         B         C         D     E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547    10
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170  5000
2018-11-03 -0.245834 -1.049596  2.366225  0.637321     3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905     4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175     5
2018-11-06 -0.691073  0.933016  1.857647  0.775526     6
2018-11-07  0.467075  0.362407  2.319375 -0.721314     7
df3 = df.copy()
df3[df3<0] = -df3
df3 # every value is now non-negative
                   A         B         C         D     E
2018-11-01  2.197094  0.908913  0.648029  1.325547    10
2018-11-02  0.354662  1.224246  0.501209  1.490170  5000
2018-11-03  0.245834  1.049596  2.366225  0.637321     3
2018-11-04  0.689940  0.471282  1.417401  0.268905     4
2018-11-05  0.548041  0.841934  0.573128  1.055175     5
2018-11-06  0.691073  0.933016  1.857647  0.775526     6
2018-11-07  0.467075  0.362407  2.319375  0.721314     7
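Since every column of df is numeric at this point, the same non-negative result could also be obtained with the built-in absolute-value method; a sketch:
df.abs()  # element-wise absolute value, equivalent to the df3[df3 < 0] = -df3 trick above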
Handling missing data
df
                   A         B         C         D     E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547    10
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170  5000
2018-11-03 -0.245834 -1.049596  2.366225  0.637321     3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905     4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175     5
2018-11-06 -0.691073  0.933016  1.857647  0.775526     6
2018-11-07  0.467075  0.362407  2.319375 -0.721314     7
df['E']=[1,np.nan,2,np.nan,4,np.nan,6]
df.loc['2018-11-01':'2018-11-03','D']=np.nan
df
                   A         B         C         D    E
2018-11-01  2.197094  0.908913 -0.648029       NaN  1.0
2018-11-02  0.354662 -1.224246 -0.501209       NaN  NaN
2018-11-03 -0.245834 -1.049596  2.366225       NaN  2.0
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  NaN
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-06 -0.691073  0.933016  1.857647  0.775526  NaN
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
df4 = df.copy()
df4.dropna(how='any')
                   A         B         C         D    E
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
df4.dropna(how='all')
# """DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)"""
# aixs 軸0或者1 index或者columns
# how 方式
# thresh 超過閾值個數的缺失值
# subset 那些字段的處理
# inplace 是否直接在原數據框中的替換
                   A         B         C         D    E
2018-11-01  2.197094  0.908913 -0.648029       NaN  1.0
2018-11-02  0.354662 -1.224246 -0.501209       NaN  NaN
2018-11-03 -0.245834 -1.049596  2.366225       NaN  2.0
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  NaN
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-06 -0.691073  0.933016  1.857647  0.775526  NaN
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
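The thresh and subset parameters described above work as follows; a sketch (the values are chosen only for illustration):
df4.dropna(thresh=5)       # keep rows with at least 5 non-NA values
df4.dropna(subset=['D'])   # drop a row only when column D is NA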
df4.fillna(1000)
                   A         B         C            D       E
2018-11-01  2.197094  0.908913 -0.648029  1000.000000     1.0
2018-11-02  0.354662 -1.224246 -0.501209  1000.000000  1000.0
2018-11-03 -0.245834 -1.049596  2.366225  1000.000000     2.0
2018-11-04 -0.689940  0.471282 -1.417401     0.268905  1000.0
2018-11-05 -0.548041 -0.841934  0.573128    -1.055175     4.0
2018-11-06 -0.691073  0.933016  1.857647     0.775526  1000.0
2018-11-07  0.467075  0.362407  2.319375    -0.721314     6.0
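Instead of a constant, missing values can be filled per column, for example with each column's mean, or propagated forward; a sketch:
df4.fillna(df4.mean())   # fill each column's NaN with that column's mean
df4['E'].ffill()         # forward-fill within a single column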
pd.isnull(df4)
                A      B      C      D      E
2018-11-01  False  False  False   True  False
2018-11-02  False  False  False   True   True
2018-11-03  False  False  False   True  False
2018-11-04  False  False  False  False   True
2018-11-05  False  False  False  False  False
2018-11-06  False  False  False  False   True
2018-11-07  False  False  False  False  False
Operations
# statistics generally exclude missing values
df4.mean()
# column means by default, i.e. along axis=0
A 0.120563
B -0.062880
C 0.649962
D -0.183015
E 3.250000
dtype: float64
df4.mean(axis=1)
# mean of each row, computed along axis=1
2018-11-01 0.864494
2018-11-02 -0.456931
2018-11-03 0.767699
2018-11-04 -0.341789
2018-11-05 0.425596
2018-11-06 0.718779
2018-11-07 1.685509
Freq: D, dtype: float64
# Operating on objects of different dimensionality that need alignment; pandas broadcasts along the specified axis:
s = pd.Series([1,3,4,np.nan,6,7,8],index=dates)
s
2018-11-01 1.0
2018-11-02 3.0
2018-11-03 4.0
2018-11-04 NaN
2018-11-05 6.0
2018-11-06 7.0
2018-11-07 8.0
Freq: D, dtype: float64
df4.sub(s,axis='index')
                   A         B         C         D    E
2018-11-01  1.197094 -0.091087 -1.648029       NaN  0.0
2018-11-02 -2.645338 -4.224246 -3.501209       NaN  NaN
2018-11-03 -4.245834 -5.049596 -1.633775       NaN -2.0
2018-11-04       NaN       NaN       NaN       NaN  NaN
2018-11-05 -6.548041 -6.841934 -5.426872 -7.055175 -2.0
2018-11-06 -7.691073 -6.066984 -5.142353 -6.224474  NaN
2018-11-07 -7.532925 -7.637593 -5.680625 -8.721314 -2.0
df4
                   A         B         C         D    E
2018-11-01  2.197094  0.908913 -0.648029       NaN  1.0
2018-11-02  0.354662 -1.224246 -0.501209       NaN  NaN
2018-11-03 -0.245834 -1.049596  2.366225       NaN  2.0
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  NaN
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-06 -0.691073  0.933016  1.857647  0.775526  NaN
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
df4.apply(np.cumsum)
                   A         B         C         D     E
2018-11-01  2.197094  0.908913 -0.648029       NaN   1.0
2018-11-02  2.551755 -0.315333 -1.149238       NaN   NaN
2018-11-03  2.305922 -1.364929  1.216987       NaN   3.0
2018-11-04  1.615982 -0.893647 -0.200415  0.268905   NaN
2018-11-05  1.067942 -1.735581  0.372713 -0.786270   7.0
2018-11-06  0.376869 -0.802565  2.230360 -0.010745   NaN
2018-11-07  0.843944 -0.440158  4.549735 -0.732059  13.0
df4.apply(lambda x: x.max()-x.min())
A 2.888166
B 2.157262
C 3.783626
D 1.830700
E 5.000000
dtype: float64
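apply also accepts ordinary Python functions and can work row-wise with axis=1; a small sketch (the function name is illustrative):
def spread(row):
    # range of one row, ignoring NaN
    return row.max() - row.min()

df4.apply(spread, axis=1)  # one value per date instead of one per column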
Value counts and discretization
s = pd.Series(np.random.randint(0,7,size=15))
s
0 1
1 6
2 3
3 1
4 1
5 0
6 4
7 1
8 3
9 4
10 6
11 1
12 4
13 3
14 5
dtype: int32
s.value_counts()
# count how often each value occurs, sorted by count; values that never appear are not listed
1 5
4 3
3 3
6 2
5 1
0 1
dtype: int64
s.reindex(range(0,7))
# reindex returns the values of s at labels 0-6; to report the counts of every possible value in a fixed order, reindex the value_counts result instead (see the sketch below)
0 1
1 6
2 3
3 1
4 1
5 0
6 4
dtype: int32
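A sketch of counting every possible value 0-6 in a fixed order, including values that never occur:
s.value_counts().reindex(range(0, 7), fill_value=0)  # counts for 0..6, zeros where a value is absent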
s.mode()
# mode (the most frequent value)
0 1
dtype: int32
# Continuous values can be discretized with cut (bins based on values) or
# qcut (bins based on sample quantiles)
arr = np.random.randint(0,20,size=15) # uniform random integers in [0, 20)
arr
array([ 3, 14, 10, 2, 2, 0, 17, 13, 7, 0, 15, 14, 4, 19, 9])
factor = pd.cut(arr,3)
factor
[(-0.019, 6.333], (12.667, 19.0], (6.333, 12.667], (-0.019, 6.333], (-0.019, 6.333], ..., (12.667, 19.0], (12.667, 19.0], (-0.019, 6.333], (12.667, 19.0], (6.333, 12.667]]
Length: 15
Categories (3, interval[float64]): [(-0.019, 6.333] < (6.333, 12.667] < (12.667, 19.0]]
pd.value_counts(factor)
(12.667, 19.0] 6
(-0.019, 6.333] 6
(6.333, 12.667] 3
dtype: int64
factor1 = pd.cut(arr,[-1,5,10,15,20])
pd.value_counts(factor1)
(-1, 5] 6
(10, 15] 4
(5, 10] 3
(15, 20] 2
dtype: int64
factor2 = pd.qcut(arr,[0,0.25,0.5,0.75,1])
pd.value_counts(factor2)
(9.0, 14.0] 4
(2.5, 9.0] 4
(-0.001, 2.5] 4
(14.0, 19.0] 3
dtype: int64
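Both cut and qcut accept a labels argument so the bins get readable names instead of interval objects; a sketch with made-up labels:
pd.cut(arr, 3, labels=['low', 'mid', 'high'])      # equal-width bins with custom names
pd.qcut(arr, 4, labels=['q1', 'q2', 'q3', 'q4'])   # quartile bins with custom names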