通过key合并
import pandas as pd
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3
C D key
0 C0 D0 K0
1 C1 D1 K1
2 C2 D2 K2
3 C3 D3 K3
# 通过key合并
res = pd.merge(left,right,on = 'key')
print(res)
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
通过多个keys合并
left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K2'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
# 通过多个keys合并
res = pd.merge(left,right,on = ['key1','key2']) # merge默认how='inner'
print(res) #right中K1、KO对应有不同的两行,与left合并时生成两行
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
how = ['left','right','outer','inner']
# how = ’left‘
res = pd.merge(left,right,on = ['key1','key2'],how = 'left')
print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
# how = ’right‘
res = pd.merge(left,right,on = ['key1','key2'],how = 'right')
print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
# how = ’outer‘
res = pd.merge(left,right,on = ['key1','key2'],how = 'outer')
print(res)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
indicator:显示merge的方式
df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
print(df2)
col1 col_left
0 0 a
1 1 b
col1 col_right
0 1 2
1 2 2
2 2 2
# indicator=True
res = pd.merge(df1,df2,on='col1',how='outer',indicator=True)
print(res)
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
# 自定义indicator名称 give the indicator a custom name
res = pd.merge(df1,df2,on='col1',how='outer',indicator='indicator_column')
print(res)
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
通过index合并
left = pd.DataFrame({'A':['A0','A1','A2'],
'B':['B0','B1','B2']},
index=['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C2','C3'],
'D':['D0','D2','D3']},
index=['K0','K2','K3'])
print(left)
print(right)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
# left_index & right_index
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
print(res)
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
处理重合数据
boys = pd.DataFrame({'K':['K0','K1','K2'],
'age':[1,2,3]})
girls = pd.DataFrame({'K':['K0','K0','K3'],
'age':[4,5,6]})
print(boys)
print(girls)
K age
0 K0 1
1 K1 2
2 K2 3
K age
0 K0 4
1 K0 5
2 K3 6
# how='inner'
res = pd.merge(boys,girls,on='K',suffixes=['_boys','_girls'],how='inner')
print(res)
K age_boys age_girls
0 K0 1 4
1 K0 1 5
#how='outer'
res = pd.merge(boys,girls,on='K',suffixes=['_boys','_girls'],how='outer')
print(res)
K age_boys age_girls
0 K0 1.0 4.0
1 K0 1.0 5.0
2 K1 2.0 NaN
3 K2 3.0 NaN
4 K3 NaN 6.0