pandas
- 處理表格等文件/數據庫
- 獲取數據(自定義,將來爬蟲獲取)
- 對數據邏輯處理,pandas+numpy
- 保存數據
- 支持文件存取操作,支持數據庫(sql)、html、json、pickle、csv(txt、excel)、sas、stata、hdf等。
import pandas as pd
import numpy as np
# 約定俗成
Series(瞭解)
# pd.Series([1, 2, 3, 4])
# 效果相同,只是上面用list創建時dtype是int64(每個元素佔8字節),而下面用np.array創建時在此環境下是int32(每個元素佔4字節)
pd.Series(np.array([1, 2, 3, 4]))
0 1
1 2
2 3
3 4
dtype: int32
DataFrame
pd.DataFrame(np.array([[1, 2, 3, 4], [5, 6, 7, 8]]))
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 |
| 1 | 5 | 6 | 7 | 8 |
# 首先是拿到日期
dates = pd.date_range('2019-01-01', periods=7)
# 從2019-01-01開始,計算7天
dates
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06', '2019-01-07'],
dtype='datetime64[ns]', freq='D')
# 再拿到商品名稱
goods_list = ['tesla', 'transformer', 'chongqiwawa,', 'masaladi']
# 再獲取商品價格信息
prices = np.random.rand(7, 4)
# 約定俗成df
# 參數依次是:裏面的數據、行索引(豎標題,index)、列標題(橫標題,columns)
df = pd.DataFrame(prices, index=dates, columns=goods_list)
df
|            | tesla    | transformer | chongqiwawa, | masaladi |
|------------|----------|-------------|--------------|----------|
| 2019-01-01 | 0.754105 | 0.096779    | 0.299980     | 0.327802 |
| 2019-01-02 | 0.816899 | 0.286751    | 0.513483     | 0.804952 |
| 2019-01-03 | 0.523050 | 0.410259    | 0.772978     | 0.086772 |
| 2019-01-04 | 0.419899 | 0.535284    | 0.946628     | 0.387901 |
| 2019-01-05 | 0.171370 | 0.921370    | 0.656765     | 0.346406 |
| 2019-01-06 | 0.810353 | 0.945966    | 0.048220     | 0.525464 |
| 2019-01-07 | 0.073864 | 0.951866    | 0.609959     | 0.338945 |
# 存入excel中
df.to_excel('test.xlsx')
內置方法
| 屬性/方法 | 說明 |
|-----------|------|
| dtypes | 查看數據類型 |
| index | 查看行序列或者索引 |
| columns | 查看各列的標籤 |
| values | 查看數據框內的數據,也即不含表頭索引的數據 |
| describe | 查看數據每一列的極值,均值,中位數,只可用於數值型數據 |
| transpose | 轉置,也可用T來操作 |
| sort_index | 排序,可按行或列index排序輸出 |
| sort_values | 按數據值來排序 |
# 獲取每一列的數據類型
df.dtypes
tesla float64
transformer float64
chongqiwawa, float64
masaladi float64
dtype: object
# 獲取索引
df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06', '2019-01-07'],
dtype='datetime64[ns]', freq='D')
# 獲取列標題
df.columns
Index(['tesla', 'transformer', 'chongqiwawa,', 'masaladi'], dtype='object')
# 獲取值
df.values
array([[0.00890968, 0.05729652, 0.59607516, 0.22337263],
[0.88853146, 0.77845106, 0.97974385, 0.14025457],
[0.46915634, 0.89172479, 0.5544319 , 0.86177713],
[0.93810727, 0.02787091, 0.68399802, 0.72312706],
[0.46445576, 0.27518564, 0.63898171, 0.23633146],
[0.12982823, 0.72375128, 0.20697944, 0.86700956],
[0.98446901, 0.66713909, 0.2430983 , 0.41013451]])
# 把索引轉爲list格式(實際返回的元素是Timestamp對象,下面只顯示了日期部分)
df.index.to_list()
['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06', '2019-01-07']
# 生成描述性統計
df.describe()
|       | tesla    | transformer | chongqiwawa, | masaladi |
|-------|----------|-------------|--------------|----------|
| count | 7.000000 | 7.000000    | 7.000000     | 7.000000 |
| mean  | 0.554780 | 0.488774    | 0.557615     | 0.494572 |
| std   | 0.395182 | 0.359985    | 0.265876     | 0.315884 |
| min   | 0.008910 | 0.027871    | 0.206979     | 0.140255 |
| 25%   | 0.297142 | 0.166241    | 0.398765     | 0.229852 |
| 50%   | 0.469156 | 0.667139    | 0.596075     | 0.410135 |
| 75%   | 0.913319 | 0.751101    | 0.661490     | 0.792452 |
| max   | 0.984469 | 0.891725    | 0.979744     | 0.867010 |
# 轉置:行索引與列標題互換
df.transpose()
|              | 2019-01-01 00:00:00 | 2019-01-02 00:00:00 | 2019-01-03 00:00:00 | 2019-01-04 00:00:00 | 2019-01-05 00:00:00 | 2019-01-06 00:00:00 | 2019-01-07 00:00:00 |
|--------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
| tesla        | 0.008910            | 0.888531            | 0.469156            | 0.938107            | 0.464456            | 0.129828            | 0.984469            |
| transformer  | 0.057297            | 0.778451            | 0.891725            | 0.027871            | 0.275186            | 0.723751            | 0.667139            |
| chongqiwawa, | 0.596075            | 0.979744            | 0.554432            | 0.683998            | 0.638982            | 0.206979            | 0.243098            |
| masaladi     | 0.223373            | 0.140255            | 0.861777            | 0.723127            | 0.236331            | 0.867010            | 0.410135            |
# 按照index排序
df.sort_index()
|            | tesla    | transformer | chongqiwawa, | masaladi |
|------------|----------|-------------|--------------|----------|
| 2019-01-01 | 0.008910 | 0.057297    | 0.596075     | 0.223373 |
| 2019-01-02 | 0.888531 | 0.778451    | 0.979744     | 0.140255 |
| 2019-01-03 | 0.469156 | 0.891725    | 0.554432     | 0.861777 |
| 2019-01-04 | 0.938107 | 0.027871    | 0.683998     | 0.723127 |
| 2019-01-05 | 0.464456 | 0.275186    | 0.638982     | 0.236331 |
| 2019-01-06 | 0.129828 | 0.723751    | 0.206979     | 0.867010 |
| 2019-01-07 | 0.984469 | 0.667139    | 0.243098     | 0.410135 |
# 按'tesla'列的值進行排序;ascending=True爲升序,爲False則是倒序
df.sort_values(by=['tesla'], ascending=True)
|            | tesla    | transformer | chongqiwawa, | masaladi |
|------------|----------|-------------|--------------|----------|
| 2019-01-01 | 0.008910 | 0.057297    | 0.596075     | 0.223373 |
| 2019-01-06 | 0.129828 | 0.723751    | 0.206979     | 0.867010 |
| 2019-01-05 | 0.464456 | 0.275186    | 0.638982     | 0.236331 |
| 2019-01-03 | 0.469156 | 0.891725    | 0.554432     | 0.861777 |
| 2019-01-02 | 0.888531 | 0.778451    | 0.979744     | 0.140255 |
| 2019-01-04 | 0.938107 | 0.027871    | 0.683998     | 0.723127 |
| 2019-01-07 | 0.984469 | 0.667139    | 0.243098     | 0.410135 |
處理缺失值
test_data = '''
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''
from io import StringIO
test_data = StringIO(test_data) # 把這個字符串包裝成內存中的類文件對象
df = pd.read_csv(test_data) # 再按csv格式解析成DataFrame(沒有標題行時,第一行數據會被當成列標題)
df
|   | 5.1 | Unnamed: 1 | 1.4 | 0.2 |
|---|-----|------------|-----|-----|
| 0 | 4.9 | 3.0        | 1.4 | 0.2 |
| 1 | 4.7 | 3.2        | NaN | 0.2 |
| 2 | 7.0 | 3.2        | 4.7 | 1.4 |
| 3 | 6.4 | 3.2        | 4.5 | 1.5 |
| 4 | 6.9 | 3.1        | 4.9 | NaN |
| 5 | NaN | NaN        | NaN | NaN |
test_data = '''
c1,c2,c3,c4
5.1,,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,,0.2
7.0,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,
,,,
'''
from io import StringIO
test_data = StringIO(test_data) # 把這個字符串包裝成內存中的類文件對象
df = pd.read_csv(test_data) # 再按csv格式解析成DataFrame
# df.columns = ['c1', 'c2', 'c3', 'c4'] # 可以用這個設置列標題,但是會覆蓋掉原來最上面一排的標題
df
|   | c1  | c2  | c3  | c4  |
|---|-----|-----|-----|-----|
| 0 | 5.1 | NaN | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | NaN | 0.2 |
| 3 | 7.0 | 3.2 | 4.7 | 1.4 |
| 4 | 6.4 | 3.2 | 4.5 | 1.5 |
| 5 | 6.9 | 3.1 | 4.9 | NaN |
| 6 | NaN | NaN | NaN | NaN |
# dropna中axis=0表示刪除含有缺失值的行,axis=1表示刪除含有缺失值的列
df.dropna(axis=0)
|   | c1  | c2  | c3  | c4  |
|---|-----|-----|-----|-----|
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 3 | 7.0 | 3.2 | 4.7 | 1.4 |
| 4 | 6.4 | 3.2 | 4.5 | 1.5 |
df.dropna(axis=1)
# 只保留至少有3個非缺失值的行(thresh=3)
df.dropna(thresh=3, axis=0)
|   | c1  | c2  | c3  | c4  |
|---|-----|-----|-----|-----|
| 0 | 5.1 | NaN | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | NaN | 0.2 |
| 3 | 7.0 | 3.2 | 4.7 | 1.4 |
| 4 | 6.4 | 3.2 | 4.5 | 1.5 |
| 5 | 6.9 | 3.1 | 4.9 | NaN |
# 只保留至少有5個非缺失值的列(thresh=5)
df.dropna(thresh=5, axis=1)
|   | c1  | c2  | c3  | c4  |
|---|-----|-----|-----|-----|
| 0 | 5.1 | NaN | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | NaN | 0.2 |
| 3 | 7.0 | 3.2 | 4.7 | 1.4 |
| 4 | 6.4 | 3.2 | 4.5 | 1.5 |
| 5 | 6.9 | 3.1 | 4.9 | NaN |
| 6 | NaN | NaN | NaN | NaN |
df.columns
Index(['c1', 'c2', 'c3', 'c4'], dtype='object')
# 刪除c2列爲缺失值的行,只保留c2列不缺失的行
df.dropna(subset=['c2'])
|   | c1  | c2  | c3  | c4  |
|---|-----|-----|-----|-----|
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | NaN | 0.2 |
| 3 | 7.0 | 3.2 | 4.7 | 1.4 |
| 4 | 6.4 | 3.2 | 4.5 | 1.5 |
| 5 | 6.9 | 3.1 | 4.9 | NaN |
# 打印列標題
df.columns
Index(['c1', 'c2', 'c3', 'c4'], dtype='object')
# 把缺失值全部填充爲0
df.fillna(value=0)
|   | c1  | c2  | c3  | c4  |
|---|-----|-----|-----|-----|
| 0 | 5.1 | 0.0 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 0.0 | 0.2 |
| 3 | 7.0 | 3.2 | 4.7 | 1.4 |
| 4 | 6.4 | 3.2 | 4.5 | 1.5 |
| 5 | 6.9 | 3.1 | 4.9 | 0.0 |
| 6 | 0.0 | 0.0 | 0.0 | 0.0 |
合併數據
df1 = pd.DataFrame(np.zeros((3, 4)))
df1
|   | 0   | 1   | 2   | 3   |
|---|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 |
df2 = pd.DataFrame(np.ones((3, 4)))
df2
|   | 0   | 1   | 2   | 3   |
|---|-----|-----|-----|-----|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | 1.0 | 1.0 | 1.0 | 1.0 |
# concat中axis=0是上下(按行)拼接,axis=1是左右(按列)拼接;記不住的話,用之前先去試一下就知道了
pd.concat((df1, df2), axis=0)
|   | 0   | 1   | 2   | 3   |
|---|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 |
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | 1.0 | 1.0 | 1.0 | 1.0 |
pd.concat((df1, df2), axis=1)
|   | 0   | 1   | 2   | 3   | 0   | 1   | 2   | 3   |
|---|-----|-----|-----|-----|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
取值
# 把以前存的excel取出來
df = pd.read_excel('test.xlsx', header=0, index_col=0)
df
|            | tesla    | transformer | chongqiwawa, | masaladi |
|------------|----------|-------------|--------------|----------|
| 2019-01-01 | 0.754105 | 0.096779    | 0.299980     | 0.327802 |
| 2019-01-02 | 0.816899 | 0.286751    | 0.513483     | 0.804952 |
| 2019-01-03 | 0.523050 | 0.410259    | 0.772978     | 0.086772 |
| 2019-01-04 | 0.419899 | 0.535284    | 0.946628     | 0.387901 |
| 2019-01-05 | 0.171370 | 0.921370    | 0.656765     | 0.346406 |
| 2019-01-06 | 0.810353 | 0.945966    | 0.048220     | 0.525464 |
| 2019-01-07 | 0.073864 | 0.951866    | 0.609959     | 0.338945 |
# 按照索引取值
df.loc['2019-01-01']
tesla 0.754105
transformer 0.096779
chongqiwawa, 0.299980
masaladi 0.327802
Name: 2019-01-01 00:00:00, dtype: float64
# 類似於numpy,iloc按整數位置取值
df.iloc[0, 0]
0.7541054007912974
df.iloc[0, :]
tesla 0.754105
transformer 0.096779
chongqiwawa, 0.299980
masaladi 0.327802
Name: 2019-01-01 00:00:00, dtype: float64
把表格傳入excel文件中
df.to_excel('test.xlsx')
把表格從excel中取出來
df = pd.read_excel('test.xlsx', header=0, index_col=0)
高級(瞭解)
where,apply