pandas 使用總結

時間 2019-11-11

標籤 pandas 使用總結简体版

原文原文鏈接

import pandas as pd
import numpy as np

## 從字典初始化df
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
         'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
         'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
         'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
         'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)
print(df)

Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
2   Devils     2  2014     863
3   Devils     3  2015     673
4    Kings     3  2014     741
5    Kings     4  2015     812
6    Kings     1  2016     756
7    Kings     1  2017     788
8   Riders     2  2016     694
9   Royals     4  2014     701
10  Royals     1  2015     804
11  Riders     2  2017     690

print(df.groupby('Team')) ##　groupby 返回的對象

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7fcbff80a240>

print(df.groupby('Team').groups) ##用groups屬性來進行查看每一個分組

{'Devils': Int64Index([2, 3], dtype='int64'), 'Kings': Int64Index([4, 5, 6, 7], dtype='int64'), 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'), 'Royals': Int64Index([9, 10], dtype='int64')}

## 對groupby 後的結果進行遍歷
grouped = df.groupby('Year')
for name,group in grouped:
    print(name)
    print(group)

2014
     Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701
2015
      Team  Rank  Year  Points
1   Riders     2  2015     789
3   Devils     3  2015     673
5    Kings     4  2015     812
10  Royals     1  2015     804
2016
     Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694
2017
      Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690

## 從多個groups中獲取單個group
grouped = df.groupby('Year')
print(grouped.get_group(2014))

Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701

## 使用agg聚合函數計算均值
grouped = df.groupby('Year')
print(grouped['Points'].agg('mean'))

Year
2014    795.25
2015    769.50
2016    725.00
2017    739.00
Name: Points, dtype: float64

## 使用agg聚合函數計算數據條數
grouped = df.groupby('Team')
print(grouped.agg(np.size))

Rank  Year  Points
Team                      
Devils     2     2       2
Kings      4     4       4
Riders     4     4       4
Royals     2     2       2

## 使用多個agg聚合函數進行計算
grouped = df.groupby('Team')
print(grouped.agg([np.sum, np.mean, np.std]))
print(grouped['Points'].agg([np.sum, np.mean, np.std]))
print(grouped['Points'].agg({'Points':[np.sum, np.mean, np.std],'Rank':[np.mean]}))  ## 分別指定不一樣的聚合函數

Rank                  Year                   Points                    
        sum  mean       std   sum    mean       std    sum    mean         std
Team                                                                          
Devils    5  2.50  0.707107  4029  2014.5  0.707107   1536  768.00  134.350288
Kings     9  2.25  1.500000  8062  2015.5  1.290994   3097  774.25   31.899582
Riders    7  1.75  0.500000  8062  2015.5  1.290994   3049  762.25   88.567771
Royals    5  2.50  2.121320  4029  2014.5  0.707107   1505  752.50   72.831998
         sum    mean         std
Team                            
Devils  1536  768.00  134.350288
Kings   3097  774.25   31.899582
Riders  3049  762.25   88.567771
Royals  1505  752.50   72.831998
       Points                        Rank
          sum    mean         std    mean
Team                                     
Devils   1536  768.00  134.350288  768.00
Kings    3097  774.25   31.899582  774.25
Riders   3049  762.25   88.567771  762.25
Royals   1505  752.50   72.831998  752.50


/home/disk1/data/tangshengyu_dxm/tools/env_py36/lib/python3.6/site-packages/ipykernel_launcher.py:5: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  """

## grouped數據從新生成dataframe
print(df.groupby('Year')['Team'].apply(len).reset_index())   ## 一級列名
print(df.groupby('Year')['Team'].apply(len).to_frame())      ## 多級列名，列變爲索引

Year  Team
0  2014     4
1  2015     4
2  2016     2
3  2017     2
      Team
Year      
2014     4
2015     4
2016     2
2017     2

## 更改聚合後的列名
grouped_df = grouped.agg({'Points':['min','max','mean']})
print(grouped_df.columns)
print(grouped_df.columns.values)
grouped_df.columns = ['_'.join(col_tuple) for col_tuple in grouped_df.columns.values]
grouped_df.reset_index()

MultiIndex(levels=[['Points'], ['min', 'max', 'mean']],
           labels=[[0, 0, 0], [0, 1, 2]])
[('Points', 'min') ('Points', 'max') ('Points', 'mean')]

	Team	Points_min	Points_max	Points_mean
0	Devils	673	863	768.00
1	Kings	741	812	774.25
2	Riders	690	876	762.25
3	Royals	701	804	752.50

## group 後的數據進行transform
grouped = df.groupby('Team')
score = lambda x: (x - x.mean()) 
print(grouped.transform(score))

Rank  Year  Points
0  -0.75  -1.5  113.75
1   0.25  -0.5   26.75
2  -0.50  -0.5   95.00
3   0.50   0.5  -95.00
4   0.75  -1.5  -33.25
5   1.75  -0.5   37.75
6  -1.25   0.5  -18.25
7  -1.25   1.5   13.75
8   0.25   0.5  -68.25
9   1.50  -0.5  -51.50
10 -1.50   0.5   51.50
11  0.25   1.5  -72.25

## filter 過濾 (返回知足條件的)
grouped = df.groupby('Team')
print(grouped.filter(lambda x: len(x)>3))

Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
4    Kings     3  2014     741
5    Kings     4  2015     812
6    Kings     1  2016     756
7    Kings     1  2017     788
8   Riders     2  2016     694
11  Riders     2  2017     690

## 每一個分組的數據量
grouped = df.groupby('Team')
print(grouped.apply(lambda x: len(x)))
print(type(grouped.apply(lambda x: len(x))))

Team
Devils    2
Kings     4
Riders    4
Royals    2
dtype: int64
<class 'pandas.core.series.Series'>

## 多行字符串組合成一行
print(df)
df_grouped = df.groupby(['Year'])['Team'].apply(';'.join).reset_index()
print(df_grouped)

Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
2   Devils     2  2014     863
3   Devils     3  2015     673
4    Kings     3  2014     741
5    Kings     4  2015     812
6    Kings     1  2016     756
7    Kings     1  2017     788
8   Riders     2  2016     694
9   Royals     4  2014     701
10  Royals     1  2015     804
11  Riders     2  2017     690
   Year                        Team
0  2014  Riders;Devils;Kings;Royals
1  2015  Riders;Devils;Kings;Royals
2  2016                Kings;Riders
3  2017                Kings;Riders

## 一行變多行
def explode(df,tar_col_name):
    tar_col_list = [tar_col_name]
    rem_col_list = df.columns.difference(tar_col_list)
    rem_col_list = list(rem_col_list)
    df_new = df.set_index(rem_col_list)
    df_explode = pd.DataFrame(df_new[tar_col_name].tolist(),index=df_new.index)
    df_explode = df_explode.stack().to_frame()
    df_explode.columns = tar_col_list
    df_explode = df_explode.reset_index(level= rem_col_list)
    return df_explode

df_grouped['Team'] = df_grouped['Team'].apply(lambda s:s.split(';')) ## 先split獲得list
print(df_grouped)
explode(df_grouped,'Team')

Year                             Team
0  2014  [Riders, Devils, Kings, Royals]
1  2015  [Riders, Devils, Kings, Royals]
2  2016                  [Kings, Riders]
3  2017                  [Kings, Riders]

	Year	Team
0	2014	Riders
1	2014	Devils
2	2014	Kings
3	2014	Royals
0	2015	Riders
1	2015	Devils
2	2015	Kings
3	2015	Royals
0	2016	Kings
1	2016	Riders
0	2017	Kings
1	2017	Riders

# 將多列合併成一列
data = [['Alex', 10, 150], ['Bob',  12, 153], ['Clarke', 13, 160], ['Tom', 12, 160]]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Stature'])
print(df)
df_new = df['Age'].astype(str) +'-'+ df['Stature'].astype(str)
print(df_new)

Name  Age  Stature
0    Alex   10      150
1     Bob   12      153
2  Clarke   13      160
3     Tom   12      160
0    10-150
1    12-153
2    13-160
3    12-160
dtype: object

## 一列拆分紅多列
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
         'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
         'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
         'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
         'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)
df_grouped = df.groupby(['Year'])['Team'].apply(';'.join).reset_index()
print(df_grouped)
df_grouped['Team'].str.split(';', expand=True)

Year                        Team
0  2014  Riders;Devils;Kings;Royals
1  2015  Riders;Devils;Kings;Royals
2  2016                Kings;Riders
3  2017                Kings;Riders

	0	1	2	3
0	Riders	Devils	Kings	Royals
1	Riders	Devils	Kings	Royals
2	Kings	Riders	None	None
3	Kings	Riders	None	None

def df2libsvm(df,missing_value='-9999'):
    re_list=[]
    length=len(df)
    for i in range(length):
        row_i=df.iloc[i]
        row_dict=row_i.to_dict()
        row_list=[]
        for key in row_dict:
            if row_dict[key]==missing_value:
                continue
            row_list.append('%s:%s'%(key,str(row_dict[key])))
        re_list.append(row_list)
    return re_list

def libsvm2df():
    """
    mydict = [{'b': 2, 'c': 3, 'd': 4},
...           {'a': 100, 'c': 300, 'd': 400},
...           {'a': 1000, 'b': 2000, 'c': 3000}]
    df=pd.DataFrame(mydict)
    """

def calcu_iv(df,feat_col,label_col,good,bad):
    import numpy as np
    def f(x,label_col,good,bad):
        d = {}
        d['bin_bad_cnt'] = (x[label_col]==bad).sum()
        d['bin_good_cnt'] = (x[label_col]==good).sum()
        return pd.Series(d, index=['bin_good_cnt', 'bin_bad_cnt'])

    df_woe = df.groupby(feat_col).apply(f,label_col=label_col,good=good,bad=bad).reset_index()
    all_good_cnt = df_woe.bin_good_cnt.sum()
    all_bad_cnt = df_woe.bin_bad_cnt.sum()
    if all_bad_cnt==0:
        all_bad_cnt=1
    if all_good_cnt==0:
        all_good_cnt=1
    df_woe = df_woe.replace({'bin_bad_cnt': {0: 0.1}})
    df_woe = df_woe.replace({'bin_good_cnt': {0: 0.1}})
    df_woe['distribution_good'] = df_woe['bin_good_cnt']/float(all_good_cnt)
    df_woe['distribution_bad'] = df_woe['bin_bad_cnt']/float(all_bad_cnt)
    df_woe['WoE'] = np.log(df_woe['distribution_good']/df_woe['distribution_bad'])
    df_woe['IV'] = df_woe['WoE'] * (df_woe['distribution_good'] - df_woe['distribution_bad'])
    df_woe_inf = df_woe[df_woe['WoE']==np.inf]
    iv = df_woe['IV'].sum()
    return iv,df_woe

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。