http://www.cnblogs.com/batteryhp/p/5046450.html
對數據進行分組並對各組應用一個函數,是數據分析的重要環節。數據準備好以後,一般的任務就是計算分組統計或生成透視表。groupby函數能高效處理數據,對數據進行切片、切塊、摘要等操做。能夠看出這跟SQL關係密切,可是可用的函數有不少。在本章中,能夠學到:
對時間數據的聚合也稱重採樣(resampling),在第十章介紹。
一、GroupBy技術
不少數據處理過程都經歷「拆分-應用-合併」的過程。即根據一個或多個鍵進行分組、每個應用函數、再進行合併。
分組鍵有多種形式:
下面開始寫例子。
#-*- encoding: utf-8 –*-
#分組實例 import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import Series,DataFrame df =DataFrame({'key1':list('aabba'),'key2':['one','two','one','two','one'], 'data1':np.random.randn(5),'data2':np.random.randn(5)}) print df,'\n' #根據key1進行分組,並計算data1的均值。 #注意下面的方式,取出來進行分組,而不是在DataFrame中分組,這種方式很靈活 #能夠看到這是一個GroupBy對象,具有了應用函數的基礎 #這個過程是將Seri進行聚合,產生了新的Series grouped = df['data1'].groupby(df['key1']) print grouped,'\n' print grouped.mean(),'\n' means = df['data1'].groupby([df['key1'],df['key2']]).mean() print means,'\n' #獲得一個層次化索引的DataFrame print means.unstack(),'\n' #上面的分組鍵均爲Series,實際上,分組鍵能夠是任何長度適當的數組,很靈活 states = np.array(['Ohio','California','California','Ohio','Ohio']) years = np.array([2005,2005,2006,2005,2006]) print df['data1'].groupby([states,years]).mean(),'\n' #還能夠用列名(能夠是字符串、數字或其餘python對象)用做分組鍵 print df.groupby('key1').mean(),'\n' #這裏將數值型的列都進行了mean,非數值型的忽略 print df.groupby(['key1','key2']).mean(),'\n' #groupby之後能夠應用一個頗有用的size方法 print df.groupby(['key1','key2']).size(),'\n' #截止翻譯版爲止,分組鍵中的缺失值被排除在外
>>>
data1 data2 key1 key2
0 1.489789 -1.548474 a one
1 -1.000447 -0.187066 a two
2 0.254255 -0.960017 b one
3 1.279892 1.124993 b two
4 -0.366753 0.139047 a one
<pandas.core.groupby.SeriesGroupBy object at 0x03A895B0>
key1
a 0.040863
b 0.767073
key1 key2
a one 0.561518
two -1.000447
b one 0.254255
two 1.279892
key2 one two
key1
a 0.561518 -1.000447
b 0.254255 1.279892
California 2005 -1.000447
2006 0.254255
Ohio 2005 1.384841
2006 -0.366753
data1 data2
key1
a 0.040863 -0.532165
b 0.767073 0.082488
data1 data2
key1 key2
a one 0.561518 -0.704714
two -1.000447 -0.187066
b one 0.254255 -0.960017
two 1.279892 1.124993
key1 key2
a one 2
two 1
b one 1
two 1
[Finished in 0.7s]
#-*- encoding: utf-8 -*- import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import Series,DataFrame df =DataFrame({'key1':list('aabba'),'key2':['one','two','one','two','one'], 'data1':np.random.randn(5),'data2':np.random.randn(5)}) #對分組進行迭代,groupby對象支持迭代 #下面是迭代?……不過就是將分組的結果分別賦值給兩個量?不是這樣的,下面的循環會打印兩個one print df.groupby('key1') for name,group in df.groupby('key1'): print 'one' print name print group,'\n' #多重鍵的狀況,元組的第一個元素將會是由鍵值組成的元組,下面會打印四個two #也就是說,下面的三個print是一個組合,打印key值這一點挺好 for (k1,k2),group in df.groupby(['key1','key2']): print 'two' print k1,k2 print group,'\n' #固然,能夠對數據片斷進行操做 #轉換爲字典,應該是比較有用的一個轉換方式 print list(df.groupby('key1')),'\n' pieces = dict(list(df.groupby('key1'))) #注意下面的字典中的每一個值仍然是一個「含有名稱的DataFrame」,可能不嚴謹,可是就是這意思 print pieces['a'],'\n' print type(pieces['a']) print pieces['a'][['data1','data2']],'\n' #groupby默認在axis = 0上進行分組,能夠設置在任何軸上分組 #下面用dtype對列進行分組 print df.dtypes,'\n' grouped = df.groupby(df.dtypes,axis = 1) print grouped,'\n' print dict(list(grouped)) #有點像把不一樣數值類型的列選出來
>>>
<pandas.core.groupby.DataFrameGroupBy object at 0x0333CEB0>
one
a
data1 data2 key1 key2
0 -0.984933 0.392220 a one
1 -2.104506 4.120798 a two
4 -0.267432 -1.825800 a one
one
b
data1 data2 key1 key2
2 0.476850 -1.738739 b one
3 -0.863738 -0.458431 b two
two
a one
data1 data2 key1 key2
0 -0.984933 0.39222 a one
4 -0.267432 -1.82580 a one
two
a two
data1 data2 key1 key2
1 -2.104506 4.120798 a two
two
b one
data1 data2 key1 key2
2 0.47685 -1.738739 b one
two
b two
data1 data2 key1 key2
3 -0.863738 -0.458431 b two
[('a', data1 data2 key1 key2
0 -0.984933 0.392220 a one
1 -2.104506 4.120798 a two
4 -0.267432 -1.825800 a one), ('b', data1 data2 key1 key2
2 0.476850 -1.738739 b one
3 -0.863738 -0.458431 b two)]
data1 data2 key1 key2
0 -0.984933 0.392220 a one
1 -2.104506 4.120798 a two
4 -0.267432 -1.825800 a one
<class 'pandas.core.frame.DataFrame'>
data1 data2
0 -0.984933 0.392220
1 -2.104506 4.120798
4 -0.267432 -1.825800
data1 float64
data2 float64
key1 object
key2 object
<pandas.core.groupby.DataFrameGroupBy object at 0x033F0190>
{dtype('object'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one, dtype('float64'): data1 data2
0 -0.984933 0.392220
1 -2.104506 4.120798
2 0.476850 -1.738739
3 -0.863738 -0.458431
4 -0.267432 -1.825800}
[Finished in 0.7s]
#-*- encoding: utf-8 -*- import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import Series,DataFrame df =DataFrame({'key1':list('aabba'),'key2':['one','two','one','two','one'], 'data1':np.random.randn(5),'data2':np.random.randn(5)}) print df,'\n' #對於由DataFrame產生的GroupBy對象,若是用一個或一組列名進行索引,就能實現選取部分列進行聚合的目的,即 #下面語法效果相同 print df.groupby('key1')['data1'] #又一次選取方式的區分,這條語句返回Series,下一條返回DataFrame print df.groupby('key1')[['data1']] #下面的 print df['data1'].groupby(df['key1']) print df[['data1']].groupby(df['key1']),'\n' #尤爲對於大數據集,可能只是對部分列進行聚合。好比,想計算data2的均值並返回DataFrame print df.groupby(['key1','key2'])[['data2']].mean(),'\n'
>>>
data1 data2 key1 key2
0 -1.381889 0.919518 a one
1 -0.186802 1.265642 a two
2 -0.173303 0.866173 b one
3 0.015841 -0.601375 b two
4 -0.281338 -0.319804 a one
<pandas.core.groupby.SeriesGroupBy object at 0x039EB970>
<pandas.core.groupby.DataFrameGroupBy object at 0x039EB930>
<pandas.core.groupby.SeriesGroupBy object at 0x039EB930>
<pandas.core.groupby.DataFrameGroupBy object at 0x039EB950>
data2
key1 key2
a one 0.299857
two 1.265642
b one 0.866173
two -0.601375
[Finished in 0.7s]
#-*- encoding: utf-8 -*- import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import Series,DataFrame people = DataFrame(np.random.randn(5,5),columns = ['a','b','c','d','e'],index = ['Joe','Steve','Wes','Jim','Travis']) people.ix[2:3,['b','c']] = np.nan #加點NaN print people,'\n' #假設已經知道列的分組方式,如今須要利用這個信息進行分組統計: mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'} #下面爲groupby傳入一個已知信息的字典 by_column = people.groupby(mapping,axis = 1) print by_column.sum(),'\n' #注意獲得的名字是 red 和 blue #Series也有這樣的功能,被看做一個固定大小的映射,能夠用Series做爲分組鍵,pandas會自動檢查對齊 map_series = Series(mapping) print map_series,'\n' print people.groupby(map_series,axis = 1).count()
>>>
a b c d e
Joe -0.344808 0.716334 1.092892 0.824548 0.206477
Steve 0.457156 -0.207056 -0.447555 -0.378811 -0.581657
Wes -0.739237 NaN NaN -1.168591 0.876174
Jim 0.116797 -1.888764 2.072722 0.029644 0.919705
Travis -0.482019 1.479823 0.706617 0.697408 -0.914512
blue red
Joe 1.917440 0.578003
Steve -0.826367 -0.331557
Wes -1.168591 0.136937
Jim 2.102366 -0.852261
Travis 1.404025 0.083292
a red
b red
c blue
d blue
e red
f orange
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3
[Finished in 0.6s]
#-*- encoding: utf-8 -*- import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import Series,DataFrame #相較於字典或Series,python函數在定義分組映射關係時能夠更有創意且更爲抽象。 #函數會在各個索引值上調用一次,並根據結果進行分組。 people = DataFrame(np.random.randn(5,5),columns = ['a','b','c','d','e'],index = ['Joe','Steve','Wes','Jim','Travis']) print people.groupby(len).sum() #名字長度相同的人進行加和 #將函數、數組、字典、Series混用也ok,由於最終都會轉換爲數組 key_list = ['one','one','one','two','two'] print people.groupby([len,key_list]).min()
>>>
a b c d e
3 0.528550 0.245731 1.187483 -1.086821 0.042086
5 -2.579143 0.152800 -0.911028 0.328152 0.627507
6 2.328199 -1.091351 -1.198069 0.571550 0.794774
a b c d e
3 one -0.444315 0.559996 -1.486260 0.090243 -1.131864
two -0.601314 -1.389457 1.616836 -1.366003 1.495320
5 one -2.579143 0.152800 -0.911028 0.328152 0.627507
6 two 2.328199 -1.091351 -1.198069 0.571550 0.794774
[Finished in 1.5s]
#-*- encoding: utf-8 -*- import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import Series,DataFrame #層次化索引數據集最方便的地方就在於它可以根據索引級別進行聚合。要實現該目的,只要經過level關鍵字傳入級別編號或名稱便可 columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names = ['cty','tenor']) hier_df = DataFrame(np.random.randn(4,5),columns = columns) print hier_df,'\n' print hier_df.groupby(level = 'cty',axis = 1).count(),'\n' print hier_df.groupby(level = 'tenor',axis = 1).count(),'\n' print hier_df.groupby(level = ['cty','tenor'],axis = 1).count()
>>>
cty US JP
tenor 1 3 5 1 3
0 0.211478 0.076928 -1.225755 0.080232 1.472201
1 0.159280 0.504315 0.741466 2.263926 0.771153
2 -0.759615 0.550016 -1.476229 1.838213 -0.509156
3 0.987656 0.238239 0.537588 -0.126640 0.252719
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3
tenor 1 3 5
0 2 2 1
1 2 2 1
2 2 2 1
3 2 2 1
cty JP US
tenor 1 3 1 3 5
0 1 1 1 1 1
1 1 1 1 1 1
2 1 1 1 1 1
3 1 1 1 1 1
[Finished in 1.2s]
二、數據聚合
這裏的數據聚合是說任何可以從數組產生標量值的過程。以前的例子已經用到了一些,好比mean()、count()、min()、max()等。常見的聚合運算都有就地計算數據集統計信息的優化實現。固然並不止這些,能夠用本身定義的運算,還能夠調用分組對象上已經定義好的任何方法。例如,quantile能夠計算Series或DataFrame列的樣本分位數。
#-*- encoding:utf-8 -*- import numpy as np import pandas as pd from pandas import Series,DataFrame import matplotlib.pyplot as plt df =DataFrame({'key1':list('aabba'),'key2':['one','two','one','two','one'], 'data1':np.random.randn(5),'data2':np.random.randn(5)}) #print df grouped = df.groupby('key1') #注意下面的quantile並無直接實現於GroupBy,它是一個Series方法,故而能用, #就是說,此過程其實是groupby對df進行高效切片,而後對每一個切片應用quantile print grouped['data1'].quantile(0.9),'\n' #對於本身定義的聚合函數,只需將其傳入aggregate或agg便可 #注意下面是對每列都應用 def peak_to_peak(arr): return arr.max() - arr.min() print grouped.agg(peak_to_peak),'\n' #有些方法(describe)也是能夠應用的。 print grouped.describe() #自定義函數比通過優化的函數要慢得多,這是由於在構造中間分組數據塊時存在很是大的開銷(函數調用、數據重排等) #下面說明更高級的聚合功能,用的是R語言reshape2包中的數據集tips,這數據是從R中本身導出來的 tips = pd.read_csv('E:\\tips.csv') #增長小費佔比一列 tips['tip_pct'] = tips['tip'] / tips['total_bill'] print tips.head()
>>>
key1
a 0.970028
b 0.642314
data1 data2
key1
a 1.502016 1.583056
b 0.495911 0.384405
data1 data2
key1
a count 3.000000 3.000000
mean 0.304136 0.822614
std 0.802148 0.792578
min -0.284158 0.007541
25% -0.152726 0.438623
50% -0.021293 0.869705
75% 0.598283 1.230151
max 1.217858 1.590597
b count 2.000000 2.000000
mean 0.443950 0.425535
std 0.350662 0.271816
min 0.195994 0.233332
25% 0.319972 0.329433
50% 0.443950 0.425535
75% 0.567927 0.521636
max 0.691905 0.617737
total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female False Sun Dinner 2 0.059447
1 10.34 1.66 Male False Sun Dinner 3 0.160542
2 21.01 3.50 Male False Sun Dinner 3 0.166587
3 23.68 3.31 Male False Sun Dinner 2 0.139780
4 24.59 3.61 Female False Sun Dinner 4 0.146808
[Finished in 0.7s]
通過優化的GroupBy的方法,按做者的意思,這些函數是快的。
有時候須要對不一樣的列應用不一樣的函數,或者對一列應用不一樣的函數。下面是例子。
#-*- encoding:utf-8 -*- import numpy as np import pandas as pd from pandas import Series,DataFrame import matplotlib.pyplot as plt #下面說明更高級的聚合功能,用的是R語言reshape2包中的數據集tips,這數據是從R中本身導出來的 tips = pd.read_csv('E:\\tips.csv') #增長小費佔比一列 tips['tip_pct'] = tips['tip'] / tips['total_bill'] print tips.head(),'\n' grouped = tips.groupby(['sex','smoker']) grouped_pct = grouped['tip_pct'] print grouped_pct.agg('mean'),'\n' #若傳入一組函數或函數名,獲得的DataFrame的列就會以相應的函數命名 def peak_to_peak(arr): return arr.max() - arr.min() #對比例這一列應用三個函數 print grouped_pct.agg(['mean','std',peak_to_peak]),'\n' #上面有個問題就是列名是自動給出的,以函數名爲列名,若傳入元組 #(name,function)組成的列表,就會自動將第一個元素做爲列名 print grouped_pct.agg([('foo','mean'),('bar',np.std)]),'\n' #注意np.std不能加引號 #還能夠對多列應用同一函數 functions = ['count','mean','max'] result = grouped['tip_pct','total_bill'].agg(functions) #對兩列都應用functions print result,'\n' #獲得的結果的列名是層次化索引,能夠直接用外層索引選取數據 print result['tip_pct'],'\n' ftuples = [('DDD','mean'),('AAA',np.var)] print grouped['tip_pct','total_bill'].agg(ftuples),'\n' #若是想對不一樣的列應用不一樣的函數,具體的辦法是向agg傳入一個從列映射到函數的字典 print grouped.agg({'tip':np.max,'size':sum}),'\n' #sum這樣的函數能夠加引號或者不加 print grouped.agg({'tip':['min','max','mean','std'],'size':sum})
>>>
total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female False Sun Dinner 2 0.059447
1 10.34 1.66 Male False Sun Dinner 3 0.160542
2 21.01 3.50 Male False Sun Dinner 3 0.166587
3 23.68 3.31 Male False Sun Dinner 2 0.139780
4 24.59 3.61 Female False Sun Dinner 4 0.146808
sex smoker
Female False 0.156921
True 0.182150
Male False 0.160669
True 0.152771
Name: tip_pct
mean std peak_to_peak
sex smoker
Female False 0.156921 0.036421 0.195876
True 0.182150 0.071595 0.360233
Male False 0.160669 0.041849 0.220186
True 0.152771 0.090588 0.674707
foo bar
sex smoker
Female False 0.156921 0.036421
True 0.182150 0.071595
Male False 0.160669 0.041849
True 0.152771 0.090588
tip_pct total_bill
count mean max count mean max
sex smoker
Female False 54 0.156921 0.252672 54 18.105185 35.83
True 33 0.182150 0.416667 33 17.977879 44.30
Male False 97 0.160669 0.291990 97 19.791237 48.33
True 60 0.152771 0.710345 60 22.284500 50.81
count mean max
sex smoker
Female False 54 0.156921 0.252672
True 33 0.182150 0.416667
Male False 97 0.160669 0.291990
True 60 0.152771 0.710345
tip_pct total_bill
DDD AAA DDD AAA
sex smoker
Female False 0.156921 0.001327 18.105185 53.092422
True 0.182150 0.005126 17.977879 84.451517
Male False 0.160669 0.001751 19.791237 76.152961
True 0.152771 0.008206 22.284500 98.244673
size tip
sex smoker
Female False 140 5.2
True 74 6.5
Male False 263 9.0
True 150 10.0
tip size
min max mean std sum
sex smoker
Female False 1.00 5.2 2.773519 1.128425 140
True 1.00 6.5 2.931515 1.219916 74
Male False 1.25 9.0 3.113402 1.489559 263
True 1.00 10.0 3.051167 1.500120 150
[Finished in 0.7s]
到目前爲止,示例中的聚合數據都是由惟一的分組鍵組成的索引(可能仍是層次化的)。因爲並非總須要如此,能夠向groupby傳入as_index = False禁用該功能。
#-*- encoding:utf-8 -*- import numpy as np import pandas as pd from pandas import Series,DataFrame import matplotlib.pyplot as plt tips = pd.read_csv('E:\\tips.csv') #增長小費佔比一列 tips['tip_pct'] = tips['tip'] / tips['total_bill'] print tips.head(),'\n'
print tips.groupby(['sex','smoker'],as_index = False).mean() #這裏的形式可能有時候更好用
>>>
total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female False Sun Dinner 2 0.059447
1 10.34 1.66 Male False Sun Dinner 3 0.160542
2 21.01 3.50 Male False Sun Dinner 3 0.166587
3 23.68 3.31 Male False Sun Dinner 2 0.139780
4 24.59 3.61 Female False Sun Dinner 4 0.146808
sex smoker total_bill tip size tip_pct
0 Female False 18.105185 2.773519 2.592593 0.156921
1 Female True 17.977879 2.931515 2.242424 0.182150
2 Male False 19.791237 3.113402 2.711340 0.160669
3 Male True 22.284500 3.051167 2.500000 0.152771
[Finished in 0.6s]
三、分組級運算和轉換
聚合只是分組運算中的一種,它是數據轉換的一個特例。也就是說,它只是接受可以將一維數組簡化爲標量值的函數。本節將介紹transform和apply方法,可以執行更多其餘的分組運算。
#-*- encoding:utf-8 -*- import numpy as np import pandas as pd from pandas import Series,DataFrame import matplotlib.pyplot as plt #下面爲DataFrame添加一個用於存放各索引分組平均值的列。一個辦法是先聚合在合併: df =DataFrame({'key1':list('aabba'),'key2':['one','two','one','two','one'], 'data1':np.random.randn(5),'data2':np.random.randn(5)}) #print df,'\n' k1_means = df.groupby('key1').mean().add_prefix('mean_') print k1_means,'\n' #下面用左邊的key1做爲鏈接鍵,right_index是將右邊的行索引做爲鏈接鍵 print pd.merge(df,k1_means,left_on = 'key1',right_index = True) #上面的方法雖然也行,可是不靈活。能夠看做利用mean函數對數據的兩列進行轉換。 people = DataFrame(np.random.randn(5,5),columns = ['a','b','c','d','e'],index = ['Joe','Steve','Wes','Jim','Travis']) print people,'\n' key = ['one','two','one','two','one'] print people.groupby(key).mean(),'\n' #看下面神奇的事情 print people.groupby(key).transform(np.mean),'\n' #不難看出,transform會將一個函數應用到各個分組並將結果放置到適當的位置, #若是各分組產生的是一個標量值,則改值就會被廣播出去 #下面的例子很說明問題,很靈活 def demean(arr): return arr - arr.mean() demeaned = people.groupby(key).transform(demean) print demeaned,'\n' #下面檢查一下demeaned各組均值是否爲0 print demeaned.groupby(key).mean() >>> mean_data1 mean_data2 key1 a -0.729610 -0.141770 b -0.174505 0.484952 data1 data2 key1 key2 mean_data1 mean_data2 0 -2.082417 0.752055 a one -0.729610 -0.141770 1 -0.563339 -0.915167 a two -0.729610 -0.141770 4 0.456927 -0.262198 a one -0.729610 -0.141770 2 -0.173514 1.695344 b one -0.174505 0.484952 3 -0.175496 -0.725440 b two -0.174505 0.484952 a b c d e Joe -1.109408 -0.379178 -0.666847 2.003109 -1.331988 Steve 0.316630 -1.801337 -0.479510 0.305003 1.641795 Wes 0.338475 -0.613742 -0.623375 -0.423722 -0.529741 Jim 0.206591 -0.876095 0.297528 -0.177179 0.208701 Travis -1.307377 0.144524 0.236289 0.382082 0.497277 a b c d e one -0.69277 -0.282799 -0.351311 0.653823 -0.454817 two 0.26161 -1.338716 -0.090991 0.063912 0.925248 a b c d e Joe -0.69277 -0.282799 -0.351311 0.653823 -0.454817 Steve 0.26161 -1.338716 -0.090991 0.063912 0.925248 Wes -0.69277 -0.282799 -0.351311 0.653823 -0.454817 Jim 0.26161 -1.338716 
-0.090991 0.063912 0.925248 Travis -0.69277 -0.282799 -0.351311 0.653823 -0.454817 a b c d e Joe -0.416638 -0.096379 -0.315536 1.349286 -0.877171 Steve 0.055020 -0.462621 -0.388519 0.241091 0.716547 Wes 1.031245 -0.330943 -0.272064 -1.077544 -0.074924 Jim -0.055020 0.462621 0.388519 -0.241091 -0.716547 Travis -0.614607 0.427322 0.587599 -0.271741 0.952094 a b c d e one 0 -1.850372e-17 -3.700743e-17 1.850372e-17 0 two 0 -5.551115e-17 0.000000e+00 0.000000e+00 0 [Finished in 0.7s]
本節就是說apply函數很重要,是最通常化的GroupBy方法。跟aggregate同樣,transform也是一個有着嚴格條件的特殊函數:傳入的函數只能產生兩種結果,要麼是能夠廣播的標量,要麼是產生一個相同大小的結果數組。apply函數將對象拆分爲多個片斷,對各個片斷調用傳入的函數,並嘗試將各片斷合到一塊兒。
#-*- encoding:utf-8 -*- import numpy as np import pandas as pd from pandas import Series,DataFrame import matplotlib.pyplot as plt tips = pd.read_csv('E:\\tips.csv') #增長小費佔比一列 tips['tip_pct'] = tips['tip'] / tips['total_bill'] #print tips.head(),'\n' #下面找出指定列的最大的幾個值,而後將所在行選出來 def top(df,n = 5,column = 'tip_pct'): return df.sort_index(by = column)[-n:] print top(tips,n = 6),'\n' #若是對smoker分組並用該函數調用apply print tips.groupby('smoker').apply(top),'\n' #上面其實是在各個片斷上調用了top,而後用pd.concat進行了鏈接,並以分組名稱進行了標記,因而就造成了層次化索引 #固然能夠向top函數傳入參數 print tips.groupby(['smoker','day']).apply(top,n = 1,column = 'total_bill') #須要說明的是:apply很強大,須要發揮想象力,它只需返回一個pandas對象或者標量值便可 #以前曾經這麼作過: result = tips.groupby('smoker')['tip_pct'].describe() print result,'\n' print result.unstack('smoker'),'\n' #下面的方式,效果同樣 f = lambda x : x.describe() print tips.groupby('smoker')['tip_pct'].apply(f),'\n' #對全部列都行 print tips.groupby('smoker').apply(f),'\n' #看的出,上面自動生成了層次化索引,能夠將分組鍵去掉 print tips.groupby('smoker',group_keys = False).apply(top),'\n' #下面看得出,從新設置索引會去掉原來全部索引,並重置索引 print tips.groupby('smoker').apply(top).reset_index(drop = True),'\n' #下面看的出來,as_index在這裏並無論用 print tips.groupby('smoker',as_index = False).apply(top),'\n' #下面看的出來,as_index在這裏並無論用 print tips.groupby(['sex','smoker'],as_index = False).apply(top),'\n'
第七章中的cut和qcut函數能夠對數據進行拆分,如今將其與groupby集合起來會輕鬆實現對數據集的桶(bucket)(嗯,注意名字)或分位數(quantile)分析了。
#-*- encoding:utf-8 -*- import numpy as np import pandas as pd from pandas import Series,DataFrame import matplotlib.pyplot as plt frame = DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)}) print frame.head(),'\n' factor = pd.cut(frame.data1,4) print factor[:10],'\n' #cut返回的對象可直接用於groupby(很合理) def get_stats(group): return {'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()} grouped = frame.data2.groupby(factor) print grouped.apply(get_stats),'\n' print grouped.apply(get_stats).unstack(),'\n' #上面的桶是區間大小相等的桶,要想獲得數據量相等的桶,用qcut便可。 grouping = pd.qcut(frame.data1,10) #print grouping #labels = False 標明只返回各個值所在的分組編號,而不是所在的各個分組,感受這樣更好 grouping = pd.qcut(frame.data1,10,labels = False) #print grouping,'\n' grouped = frame.data2.groupby(grouping) print grouped.apply(get_stats).unstack()