Pandas學習筆記系列:
原文:https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/3-7-pd-merge/ (本文有刪減)
pandas 中的 merge 和 concat 相似,但主要用於兩組有 key column 的數據,依據 key 統一索引後合併數據。一般也被用在 Database 的處理當中。
import pandas as pd

# NOTE: the expected outputs shown in the string blocks below are reproduced
# from the original article; newer pandas versions may order the columns
# differently (e.g. with `key` first).

# Define the two datasets and print them.
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

print(left)
"""
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3
"""

print(right)
"""
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3
"""

# Merge the two frames on the shared `key` column and print the result.
res = pd.merge(left, right, on='key')

print(res)
"""
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3
"""
合併時有 4 種方法:how 可取 ['left', 'right', 'outer', 'inner'],預設值為 how='inner'。
import pandas as pd

# NOTE: the expected outputs shown in the string blocks below are reproduced
# from the original article; newer pandas versions may order the columns
# differently (e.g. with the key columns first).

# Define the two datasets (each has two key columns) and print them.
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

print(left)
"""
    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
"""

print(right)
"""
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
"""
依據 key1 與 key2 兩個 columns 進行合併,並打印出四種結果 ['left', 'right', 'outer', 'inner']。
inner 表示如果兩個 keys 對應的 value 值相等,就交叉合併,否則丟棄,也就是求交集 ∩。
例如 left 和 right 都有 (K0,K0) 和 (K1,K0)。其中 right 有兩個 (K1,K0),left 只有一個;從下面代碼中的結果可以看到,具有相同 key 的值會交叉合併,因此此時會生成兩筆新的 (K1,K0) 記錄。
# This snippet continues from the two-key example: it uses the `left` and
# `right` frames (with `key1`/`key2` columns) defined in the previous snippet.
# NOTE: the expected outputs shown in the string blocks below are reproduced
# from the original article; newer pandas versions may order the columns
# (and, for some joins, the rows) differently.

# inner: keep only key pairs present in both frames (intersection).
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
"""
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
"""

# outer: keep key pairs present in either frame (union); the side that has
# no matching row is filled with NaN.
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
"""
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3
"""

# left: keep every key pair of the left frame; columns from the right frame
# are attached where the keys match and are NaN otherwise.
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
"""
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
"""

# right: keep every key pair of the right frame; columns from the left frame
# are attached where the keys match and are NaN otherwise.
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
"""
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3
"""
indicator=True 會將每一行記錄的合併來源(left_only、right_only 或 both)放在新的一列 _merge 中。
import pandas as pd

# NOTE: the expected outputs shown in the string blocks below are reproduced
# from the original article; newer pandas versions may format them
# differently (e.g. the dtype shown for `col1`).

# Define the two datasets and print them.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})

print(df1)
"""
   col1 col_left
0     0        a
1     1        b
"""

print(df2)
"""
   col1  col_right
0     1          2
1     2          2
2     2          2
"""

# Merge on `col1` with indicator=True: an extra `_merge` column records
# whether each row came from the left frame, the right frame, or both.
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
"""
   col1 col_left  col_right      _merge
0   0.0        a        NaN   left_only
1   1.0        b        2.0        both
2   2.0      NaN        2.0  right_only
3   2.0      NaN        2.0  right_only
"""

# Passing a string instead of True sets the name of the indicator column.
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
"""
   col1 col_left  col_right indicator_column
0   0.0        a        NaN        left_only
1   1.0        b        2.0             both
2   2.0      NaN        2.0       right_only
3   2.0      NaN        2.0       right_only
"""
import pandas as pd

# NOTE: the expected outputs shown in the string blocks below are reproduced
# from the original article.

# Define the two datasets, each with an explicit string index, and print them.
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])

print(left)
"""
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
"""

print(right)
"""
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
"""

# Merge on the index of both frames with how='outer' (union of index labels).
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
"""
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
"""

# Merge on the index of both frames with how='inner' (shared labels only).
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
"""
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
"""
import pandas as pd

# NOTE: the expected output shown in the string block below is reproduced
# from the original article; newer pandas versions may order the columns
# differently (e.g. with `k` first).

# Define the two datasets; both have an `age` column besides the key `k`.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

# Use `suffixes` to disambiguate the overlapping `age` column: the left
# frame's column gets `_boy`, the right frame's column gets `_girl`.
res = pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner')
print(res)
"""
   age_boy   k  age_girl
0        1  K0         4
1        1  K0         5
"""