那年夏天抓住了蟬的尾巴
gitbookgit
前言
pandas 抓住 Series (排序的字典), DataFrame (row + 多個 Series) 對象 , 就如同 numpy 裏抓住 ndarray 多維數組同樣
但是人的精力始終是有限的,沒有過目不忘的本領,那就記住 API 以及經常使用參數, 其餘的交給字典吧
下面學習 示例 可能會用到的 兩個函數
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_0``,
    ``col_1``, ... and rows ``row_0``, ``row_1``, ...

    Args:
        rows: Number of rows.
        cols: Number of columns.

    Returns:
        The labelled DataFrame.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)


# Example call; the returned frame is not stored.
generate_df(3, 4)
修改 dataframe 中數據
from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
def pretty_print(text='', star_cnt=20):
    """Print ``text`` with a run of asterisks directly before and after it.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars}{text}{stars}'.format(text=text, stars=stars))
# Sample open/close/high/low values, one row per date in the range below.
data = {
    'open': [8.08, 7.93, 7.97, 8.00],
    'close': [7.93, 8.05, 7.97, 8.05],
    'high': [8.10, 8.12, 8.00, 8.09],
    'low': [7.88, 7.92, 7.91, 8.00],
}
date_index = pd.date_range('02/01/2016', '02/04/2016')
df = DataFrame(data, index=date_index)
print(df.head(10))
pretty_print('華麗的分隔符')
# Set every cell whose value is not 7.93 to 0.
is_target = df.isin([7.93])
df[~is_target] = 0
print(df)
"""
open close high low
2016-02-01 8.08 7.93 8.10 7.88
2016-02-02 7.93 8.05 8.12 7.92
2016-02-03 7.97 7.97 8.00 7.91
2016-02-04 8.00 8.05 8.09 8.00
********************華麗的分隔符********************
open close high low
2016-02-01 0.00 7.93 0.0 0.0
2016-02-02 7.93 0.00 0.0 0.0
2016-02-03 0.00 0.00 0.0 0.0
2016-02-04 0.00 0.00 0.0 0.0
"""
"""
df[布爾掩碼] 或 df[[列名列表]] 返回的是 DataFrame , df['col'] 返回的是 Series ; df.loc , df.iloc 選取單行或單列時返回 Series , 選取多行多列時返回 DataFrame (df.ix 已在 pandas 1.0 中移除)
"""
apply + map == applymap
from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
from functools import reduce
def pretty_print(text='', star_cnt=20):
    """Print ``text`` with a run of asterisks directly before and after it.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars}{text}{stars}'.format(text=text, stars=stars))
# Sample open/close/high/low values, one row per date in the range below.
data = {
    'open': [8.08, 7.93, 7.97, 8.00],
    'close': [7.93, 8.05, 7.97, 8.05],
    'high': [8.10, 8.12, 8.00, 8.09],
    'low': [7.88, 7.92, 7.91, 8.00],
}
df = DataFrame(data, index=pd.date_range('02/01/2016', '02/04/2016'))
print(df.head(10))
pretty_print('華麗的分隔線')
# Add a row-total column in two equivalent ways. Both totals are taken over
# the original columns only: summing the whole frame for the second column
# would also add the freshly created 'new' column and double the result.
price_cols = list(df.columns)
df['new'] = df[price_cols].apply(
    lambda cols: reduce(lambda x, y: x + y, cols), axis=1)
df['new_2'] = df[price_cols].apply(sum, axis=1)
print(df.head(10))
"""
            open  close  high   low
2016-02-01  8.08   7.93  8.10  7.88
2016-02-02  7.93   8.05  8.12  7.92
2016-02-03  7.97   7.97  8.00  7.91
2016-02-04  8.00   8.05  8.09  8.00
********************華麗的分隔線********************
            open  close  high   low    new  new_2
2016-02-01  8.08   7.93  8.10  7.88  31.99  31.99
2016-02-02  7.93   8.05  8.12  7.92  32.02  32.02
2016-02-03  7.97   7.97  8.00  7.91  31.85  31.85
2016-02-04  8.00   8.05  8.09  8.00  32.14  32.14
"""
分組後 分組col 會被做爲 key 索引
from pandas import Series,DataFrame
# Score records, one [name, sex, course, score] list per row.
a = [
    ['Li', '男', 'PE', 98.],
    ['Li', '男', 'MATH', 60.],
    ['liu', '男', 'MATH', 60.],
    ['yu', '男', 'PE', 100.],
]
af = DataFrame(a, columns=['name', 'sex', 'course', 'score'])
divider = '*' * 50
print(af.head(10))
print(divider)
# The grouping columns become the index levels of the summed result.
score_totals = af.groupby(['name', 'course'])['score'].sum()
print(score_totals)
print(divider)
# Selecting the outer key 'Li' leaves a Series indexed by course.
print(score_totals['Li'])
"""
name sex course score
0 Li 男 PE 98.0
1 Li 男 MATH 60.0
2 liu 男 MATH 60.0
3 yu 男 PE 100.0
**************************************************
name course
Li MATH 60.0
PE 98.0
liu MATH 60.0
yu PE 100.0
Name: score, dtype: float64
**************************************************
course
MATH 60.0
PE 98.0
Name: score, dtype: float64
"""
騷操做
# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import pdb
addr = pd.Series([
    'Washington, D.C. 20003',
    'Brooklyn, NY 11211-1755',
    'Omaha, NE 68154',
    'Pittsburgh, PA 15211',
])
# Series accessors include .str, .cat and .dt.
addr.str.upper()  # upper-cases each entry; the result is not stored here
print(addr.str.count(r'\d'))  # number of digits in each entry
regex = (r'(?P<city>[A-Za-z ]+), '      # one or more letters or spaces
         r'(?P<state>[A-Z]{2}) '        # two upper-case letters
         r'(?P<zip>\d{5}(?:-\d{4})?)')  # five digits, optional 4-digit extension
# regex=False makes '.' a literal dot on every pandas version; as a regular
# expression it would match (and delete) every character.
print(addr.str.replace('.', '', regex=False).str.extract(regex))
print([i for i in dir(pd.Series.str) if not i.startswith('_')])
"""
0 5
1 9
2 5
3 5
dtype: int64
city state zip
0 Washington DC 20003
1 Brooklyn NY 11211-1755
2 Omaha NE 68154
3 Pittsburgh PA 15211
['capitalize', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace', 'split', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'wrap', 'zfill']
"""
# The quarter-end alias 'Q' is deprecated since pandas 2.2 in favour of 'QE'.
# Older releases reject 'QE' with ValueError, so fall back to the legacy
# alias there.
try:
    quarter_ends = pd.date_range('2017', periods=9, freq='QE')
except ValueError:
    quarter_ends = pd.date_range('2017', periods=9, freq='Q')
daterng = pd.Series(quarter_ends)
print(daterng)
print(daterng.dt.day_name())
# Second half of the year only (quarters 3 and 4).
print(daterng[daterng.dt.quarter > 2])
print(daterng[daterng.dt.is_year_end])
"""
Series.dt.day_name():從日期判斷出是星期幾;
Series.dt.quarter:從日期判斷所處季度;
Series.dt.is_year_end:從日期判斷是否處在年末
"""
import sys

colors = pd.Series([
    'periwinkle', 'mint green', 'burnt orange', 'periwinkle', 'burnt orange',
    'rose', 'rose', 'mint green', 'rose', 'navy',
])
# Size reported by sys.getsizeof for each string element.
print(colors.apply(sys.getsizeof))
# Give each distinct color an integer code in order of first appearance.
mapper = {}
for code, color_name in enumerate(colors.unique()):
    mapper[color_name] = code
as_int = colors.map(mapper)
print(as_int)
print(as_int.apply(sys.getsizeof))
# Compare memory use of the plain Series with its categorical form.
as_category = colors.astype('category')
primary_usage = colors.memory_usage(index=False, deep=True)
category_usage = as_category.memory_usage(index=False, deep=True)
usage_report = 'primary: {}\ncategory_usage: {}'.format(primary_usage, category_usage)
print(usage_report)
"""
primary: 370
category_usage: 291
這樣看起來彷佛並無什麼很大區別
可是咱們能夠 repeat 屢次試試
"""
# Repeat every entry ten times, then compare the memory use of the plain
# Series with its categorical form again.
manycolors = colors.repeat(10)
rows_per_distinct = len(manycolors) / manycolors.nunique()
print(rows_per_distinct)
plain_bytes = manycolors.memory_usage(index=False, deep=True)
print(plain_bytes)
category_bytes = manycolors.astype('category').memory_usage(index=False, deep=True)
print(category_bytes)
數據清洗與準備
# 處理缺失數據
API
dropna
fillna
isnull
notnull
# 過濾缺失值
data.dropna() 等價於 data[data.notnull()]
對於 DataFrame 還有 axis= 0 or 1 , how = 'all' 等可選 ,當某行(或列)全爲 NaN 時候才刪除 , thresh=2 表示只保留至少含有 2 個非 NA 值的行(或列), 不足 2 個則刪除
df.dropna(axis=1,how='all')
df.fillna({1: 0.5, 2: 0}, inplace=True) 爲 1 列 2 列 分別填充不一樣的默認值 ; 前向填充則單獨寫成 df.fillna(method='ffill', limit=2) (新版 pandas 建議用 df.ffill(limit=2)) —— value 與 method 不能同時指定
fillna 參數 有:
value 標量值或字典型對象用於填充確實值
method 插值方法, 默認爲 None (不插值), 可選 'ffill' / 'bfill' ; value 與 method 必須指定其一
axis 須要填充的軸, 默認 axis = 0
inplace
limit 用於前向或後向填充時最大的填充範圍
### 查詢與刪除重複值
data.duplicated()
data.drop_duplicates(['col1','col2'], keep='last' or 'first')
### 使用函數或映射進行數據轉換
data = {
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 5, 12, 6, 7.5, 8, 3, 5, 6],
}
df = DataFrame(data)
print(df.head(10))
print('*' * 50)
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}


def _animal_for(food):
    """Return the animal mapped to ``food`` (case-insensitive), else 'unknown'."""
    return meat_to_animal.get(food.lower(), 'unknown')


df['meat_to_animall'] = df['food'].map(_animal_for)
print(df.head(10))
### 替代值
data.replace(-999, np.nan)
### 重命名軸索引
data.index.map(lambda x: x[:4].upper())
data.rename(index=str.title, columns=str.upper)
data.rename(index={'old_idx':'new_idx'}, columns={'old_col':'new_col'}, inplace=True)
### 離散化和分箱 cut , qcut
pd.cut(ages, bins)
from pandas import Series,DataFrame
import pandas as pd
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 35, 50, 70, 90, 120]
# Bin the ages and show the integer code of the bin each age fell into.
cats = pd.cut(ages, bins)
print(cats.codes)
df = DataFrame({'ages': ages})
# Same bin edges with right=False, stored as integer bin codes.
left_closed = pd.cut(ages, bins, right=False)
df['ages_dicretes'] = left_closed.codes
print(df.head(10))
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.random.randn(1000, 4))
# Cap every value at magnitude 3 while keeping its sign.
beyond_limit = np.abs(data) > 3
data[beyond_limit] = np.sign(data) * 3
print(data.head(10))
# 置換和隨機抽樣
numpy.random.permutation
df.sample , series.sample
grid = np.arange(20).reshape((5, 4))
df = DataFrame(grid)
print(df)
print('*' * 50)
# A random ordering of the five row positions, used to reorder the rows.
sampler = np.random.permutation(5)
shuffled_rows = df.take(sampler)
print(shuffled_rows)
# 組合使用
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
# Fixed seed so the ten random values are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
print(values)
pretty_print('離散分箱')
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Bin the values, then turn the bins into one indicator column each.
binned_values = pd.cut(values, bins)
df = pd.get_dummies(binned_values)
print(df.head(10))
"""
[0.92961609 0.31637555 0.18391881 0.20456028 0.56772503 0.5955447
0.96451452 0.6531771 0.74890664 0.65356987]
******************** 離散分箱 ********************
(0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
0 0 0 0 0 1
1 0 1 0 0 0
2 1 0 0 0 0
3 0 1 0 0 0
4 0 0 1 0 0
5 0 0 1 0 0
6 0 0 0 0 1
7 0 0 0 1 0
8 0 0 0 1 0
9 0 0 0 1 0
"""
### 向量化 字符串函數
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
# Single-row frame whose three columns all hold the same 12-digit string.
data = {'col1':['001100110111'],
'col2': ['001100110111'],
'col3': ['001100110111']}
df = DataFrame(data)
print(df.head(10))
pretty_print('華麗的分割線')
# For each row (axis=1), run str.extract with four named 3-digit groups
# (nums_1 .. nums_4) over the row's strings, then print the collected values.
df2 = df.apply(lambda s:s.str.extract(r'(?P<nums_1>\d{3})(?P<nums_2>\d{3})(?P<nums_3>\d{3})(?P<nums_4>\d{3})') ,axis=1)
print(df2.values)
"""
col1 col2 col3
0 001100110111 001100110111 001100110111
******************** 華麗的分割線 ********************
[ nums_1 nums_2 nums_3 nums_4
col1 001 100 110 111
col2 001 100 110 111
col3 001 100 110 111]
"""
# 部分向量化字符串方法列表
cat 根據可選的分隔符按元素黏合字符串
contains 返回是否含有某個模式 / 正則表達式的 布爾值數組
count 模式出現次數的計數
extract 使用正則表達式從字符串Series 中分組抽取一個 或多個字符串, 返回的結果是 每一個分組造成一列的 DataFrame
endswith 等價於對每一個元素使用 x.endswith
startswith 等價於 對每一個元素使用 x.startswith
findall 找出字符串中全部的 模式 / 正則表達式 匹配項 ,以列表返回
get 對每一個元素進行索引 (得到第 i 個元素)
isalnum
isalpha
isdecimal
isdigit
islower
isnumeric
isupper
join
len
lower, upper
match
pad
center
repeat
replace
slice
split
strip
rstrip
lstrip
第八章 數據規整 , 鏈接,聯合 重塑
# 分層索引 部分索引
分層索引容許你在一個軸向上擁有多個 (兩個或兩個以上) 索引層級, 籠統地說, 分層索引提供了一種在更低維度的形式中處理更高維度數據的方法。
# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
# Two parallel label lists give the Series a two-level index.
outer_labels = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(9), index=[outer_labels, inner_labels])
print(data.head(10))
pretty_print('華麗的分割線')
# Partial indexing: select everything under the outer label 'b'.
print(data['b'])
pretty_print('華麗的分割線')
# unstack moves the inner level into columns; stack reverses it.
wide = data.unstack()
print(wide)
pretty_print('華麗的分割線')
print(wide.stack())
# 重排序 和 層級排序
swaplevel sort_index
# 按 層級進行彙總統計
# 使用 DataFrame 的列 進行索引
set_index() 提出 層級索引
reset_index() 反操做 分層索引會被移動到 列中
# 聯合與合併數據集
相似於 SQL 表關聯操做 merge
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
left_rows = {'key1': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)}
right_rows = {'key1': ['a', 'b', 'd'], 'data2': range(3)}
df1 = pd.DataFrame(left_rows)
df2 = pd.DataFrame(right_rows)
# how may be inner, left, right or outer. To join on several columns pass
# on=['key1', 'key2']. When both sides share other column names, suffixes
# renames them. left_index=True / right_index=True joins on the index.
df3 = pd.merge(
    df1,
    df2,
    left_on='key1',
    right_on='key1',
    how='inner',
    suffixes=('_left', '_right'),
)
print(df3.head(10))
# join combines the two frames on their index.
df4 = df1.join(df2, how='inner', lsuffix='_left', rsuffix='_right')
pretty_print('華麗的分割線')
print(df4.head(10))
left.join([right1, right2,right3], how='outer') how 默認爲 'left'
concat 相似於 union all 其實又不單單是
arr = np.arange(12).reshape((3, 4))
print(arr)
pretty_print('華麗的分割線')
# Join two copies of the array along each axis in turn.
both = [arr, arr]
result = np.concatenate(both, axis=1)    # side by side
result_2 = np.concatenate(both, axis=0)  # one above the other
print(result)
pretty_print('華麗的分割線')
print(result_2)
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_0``,
    ``col_1``, ... and rows ``row_0``, ``row_1``, ...

    Args:
        rows: Number of rows.
        cols: Number of columns.

    Returns:
        The labelled DataFrame.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
arr = generate_df(3, 4)
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
pieces = [s1, s2, s3]
# axis=0 chains the Series end to end.
stacked = pd.concat(pieces, axis=0, sort=False)
print(stacked)
pretty_print('華麗的分割線')
# axis=1 places each Series in its own column.
side_by_side = pd.concat(pieces, axis=1, sort=False)
print(side_by_side)
"""
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
******************** 華麗的分割線 ********************
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
"""
s4 = pd.concat([s1, s3])
# join may be 'outer' or 'inner'; 'inner' is requested here.
shared_labels = pd.concat([s1, s4], axis=1, join='inner', sort=False)
print(shared_labels)
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_0``,
    ``col_1``, ... and rows ``row_0``, ``row_1``, ...

    Args:
        rows: Number of rows.
        cols: Number of columns.

    Returns:
        The labelled DataFrame.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
arr = generate_df(3, 4)
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
# keys= adds an outer index level that labels which input Series each
# value came from.
s5 = pd.concat([s1, s2, s3], axis=0, keys=['one', 'two', 'three'], sort=False)
print(s5)
pretty_print('華麗的分割線')
# The sample output that follows shows the wide (unstacked) table after the
# separator, so print it; the original stopped at the separator.
print(s5.unstack())
"""
one a 0
b 1
two c 2
d 3
e 4
three f 5
g 6
dtype: int64
******************** 華麗的分割線 ********************
a b c d e f g
one 0.0 1.0 NaN NaN NaN NaN NaN
two NaN NaN 2.0 3.0 4.0 NaN NaN
three NaN NaN NaN NaN NaN 5.0 6.0
"""
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_0``,
    ``col_1``, ... and rows ``row_0``, ``row_1``, ...

    Args:
        rows: Number of rows.
        cols: Number of columns.

    Returns:
        The labelled DataFrame.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
df1 = generate_df(3, 2)
# Add 5 to every cell with vectorised arithmetic. DataFrame.applymap, used
# here originally, is deprecated since pandas 2.1 and is not needed for a
# plain element-wise addition.
df2 = generate_df(2, 2) + 5
print(df1)
pretty_print('華麗的分割線')
print(df2)
pretty_print('華麗的分割線')
# keys= labels the columns of each input frame with an outer column level.
print(pd.concat([df1, df2], axis=1, sort=False, keys=('lvl1', 'lvl2')))
"""
col_0 col_1
row_0 0 1
row_1 2 3
row_2 4 5
******************** 華麗的分割線 ********************
col_0 col_1
row_0 5 6
row_1 7 8
******************** 華麗的分割線 ********************
lvl1 lvl2
col_0 col_1 col_0 col_1
row_0 0 1 5.0 6.0
row_1 2 3 7.0 8.0
row_2 4 5 NaN NaN
"""
聯合重疊數據
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_0``,
    ``col_1``, ... and rows ``row_0``, ``row_1``, ...

    Args:
        rows: Number of rows.
        cols: Number of columns.

    Returns:
        The labelled DataFrame.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
labels = ['f', 'e', 'd', 'c', 'b', 'a']
# a has gaps (NaN); b is a gap-free 0..5 Series with the same labels.
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=labels)
b = pd.Series(np.arange(len(a), dtype=np.float64), index=labels)
print(a)
pretty_print('華麗的分割線')
print(b)
pretty_print('華麗的分割線')
# Take b's value wherever a is missing, otherwise keep a's value.
a_is_missing = pd.isnull(a)
patched = np.where(a_is_missing, b, a)
print(patched)
"""
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
******************** 華麗的分割線 ********************
f 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a 5.0
dtype: float64
******************** 華麗的分割線 ********************
[0. 2.5 2. 3.5 4.5 5. ]
"""
# combine_first fills the caller's missing values from the argument. b is
# built from np.arange and so has no missing values for a to fill here.
# NOTE(review): a.combine_first(b) looks like the label-aligned counterpart
# of the np.where call above - confirm which one was intended.
print(b.combine_first(a))
8.3 重中之重 重塑或透視
stack
unstack
每每結合 分層索引來作
pivot <==> 等價於 set_index 建立分層索引, 而後調用 unstack 拆堆
pivot 反過來操做 就是 pd.melt
import pandas as pd
import numpy as np
"""
對比 某一行 第二三列的 差值 == 下一行 第一列的值,找出這樣的行
"""
data = np.zeros((20, 3))
df = pd.DataFrame(
    data,
    columns=['col_' + str(i) for i in range(3)],
    index=['row_' + str(i) for i in range(20)],
)
# Every column except the first is set to 1.
df.iloc[:, 1:] = 1
print(df.head(10))
print('{stars} {text} {stars}'.format(stars='*' * 20, text='華麗的分割線'))


def func(row):
    """Return the absolute difference between a row's 2nd and 3rd values.

    Args:
        row: A Series with at least three entries.

    Returns:
        ``abs(second - third)``.
    """
    # .iloc makes the positional lookup explicit. Plain integer keys on a
    # label-indexed Series are a deprecated positional fallback.
    return abs(row.iloc[1] - row.iloc[2])


df['col_3'] = df.apply(func, axis=1)
# shift(-1) lines each row up with the NEXT row's col_0, as the task
# statement requires; shift(1) would compare against the previous row.
# The last row has no successor, so it compares against NaN and yields False.
df['col_4'] = df['col_3'] == df['col_0'].shift(-1)
df = df[df['col_4']]
print(df.head(20))
判斷相似 gene_1|gene_2 ==> gene_1 = 1 , gene_2 = 1 gene_3 = 0....
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re
def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by a run of asterisks on each side.

    Args:
        text: Label placed between the two runs of asterisks.
        star_cnt: Number of ``*`` characters printed on each side.
    """
    stars = '*' * star_cnt
    print('{stars} {text} {stars}'.format(text=text, stars=stars))
def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Columns are labelled ``col_0``,
    ``col_1``, ... and rows ``row_0``, ``row_1``, ...

    Args:
        rows: Number of rows.
        cols: Number of columns.

    Returns:
        The labelled DataFrame.
    """
    data = np.arange(rows * cols).reshape((rows, cols))
    columns = ['col_' + str(i) for i in range(cols)]
    indices = ['row_' + str(j) for j in range(rows)]
    return DataFrame(data, columns=columns, index=indices)
df = generate_df(3, 6)
print(df.head(10))
# col_1 holds a '|'-separated list of the column names to flag for each row.
df['col_1'] = 'col_2|col_4'
# DataFrame.ix was removed in pandas 1.0. On this string-labelled index the
# integer slice 1:2 was positional, so iloc selects the same single row.
df.iloc[1:2, df.columns.get_loc('col_1')] = 'col_3|col_5'
cols = [col for col in df.columns if col != 'col_0']


def do_apply(row):
    """Set to 1 every column named in this row's ``col_1`` list.

    The rows to update are those of the module-level ``df`` whose ``col_0``
    equals this row's ``col_0``. One line of asterisks is printed per
    candidate column.

    Args:
        row: One row of ``df`` as a Series, with ``col_0`` and ``col_1``.
    """
    flagged = row['col_1'].split('|')
    for col in cols:
        print('*' * 100)
        if col in flagged:
            # NOTE: this writes to df while df.apply is iterating over it.
            df.loc[df['col_0'] == row['col_0'], col] = 1


df.apply(do_apply, axis=1)
print('df shape: {}'.format(df.shape))
print('df 前十行: ', df.head(10))