| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- # View more python tutorials on my Youtube and Youku channel!!!
- # Youtube video tutorial: https://www.youtube.com/channel/UCdyjiB5H8Pu7aDTNVXTTpcg
- # Youku video tutorial: http://i.youku.com/pythontutorial
- """
- Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
- """
- from __future__ import print_function
- import pandas as pd
- # merging two df by key/keys. (may be used in database)
- # simple example
- left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
- 'A': ['A0', 'A1', 'A2', 'A3'],
- 'B': ['B0', 'B1', 'B2', 'B3']})
- right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
- 'C': ['C0', 'C1', 'C2', 'C3'],
- 'D': ['D0', 'D1', 'D2', 'D3']})
- print(left)
- print(right)
- res = pd.merge(left, right, on='key')
- print(res)
- # consider two keys
- left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
- 'key2': ['K0', 'K1', 'K0', 'K1'],
- 'A': ['A0', 'A1', 'A2', 'A3'],
- 'B': ['B0', 'B1', 'B2', 'B3']})
- right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
- 'key2': ['K0', 'K0', 'K0', 'K0'],
- 'C': ['C0', 'C1', 'C2', 'C3'],
- 'D': ['D0', 'D1', 'D2', 'D3']})
- print(left)
- print(right)
- res = pd.merge(left, right, on=['key1', 'key2'], how='inner') # default for how='inner'
- # how = ['left', 'right', 'outer', 'inner']
- res = pd.merge(left, right, on=['key1', 'key2'], how='left')
- print(res)
- # indicator
- df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
- df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
- print(df1)
- print(df2)
- res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
- # give the indicator a custom name
- res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
- # merged by index
- left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
- 'B': ['B0', 'B1', 'B2']},
- index=['K0', 'K1', 'K2'])
- right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
- 'D': ['D0', 'D2', 'D3']},
- index=['K0', 'K2', 'K3'])
- print(left)
- print(right)
- # left_index and right_index
- res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
- res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
- # handle overlapping
- boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
- girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
- res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
- print(res)
- # join function in pandas is similar with merge. If know merge, you will understand join
|