Compare two dataframes for missing rows based on multiple columns python

Question

I want to compare two dataframes that have similar columns(not all) and print a new dataframe that shows the missing rows of df1 compare to df2 and a second dataframe that shows this time the missing values of df2 compare to df1 based on given columns.

Here the "key_columns" are named key_column1 and key_column2

import pandas as pd


data1 = {'first_column':  ['4', '2', '7', '2', '2'],
        'second_column': ['1', '2', '2', '2', '2'],
       'key_column1':['1', '3', '2', '6', '4'],
      'key_column2':['1', '2', '2', '1', '1'],
       'fourth_column':['1', '2', '2', '2', '2'],
         'other':['1', '2', '3', '2', '2'],
        }
df1 = pd.DataFrame(data1)

data2 = {'first':  ['1', '2', '2', '2', '2'],
        'second_column': ['1', '2', '2', '2', '2'],
       'key_column1':['1', '3', '2', '6', '4'],
      'key_column2':['1', '5', '2', '2', '2'],
       'fourth_column':['1', '2', '2', '2', '2'],
          'other2':['1', '4', '3', '2', '2'],
         'other3':['6', '8', '1', '4', '2'],
        }

df2 = pd.DataFrame(data2)

The column names of df1 and df2 are not the same, should 'first_column' in df1 be 'first'? Which columns should 'match' to say a row exists in both df1 and df2? — Emi OB
– Emi OB, Commented Sep 26, 2022 at 9:28
Yes first_column is the same as first I will edit the post thank you! key_column1 and key_column2 should "match". — yoopiyo
– yoopiyo, Commented Sep 26, 2022 at 9:55

score 1 · Accepted Answer · 2022-09-26 13:44:51Z

I have modified the data1 and data2 dictionaries so that the resulting dataframes have only same columns to demonstrate that the solution provided in the answer by Emi OB relies on existence of columns in one dataframe which are not in the other one ( in case a common column is used the code fails with KeyError on the column chosen to collect NaNs). Below an improved version which does not suffer from that limitation creating own columns for the purpose of collecting NaNs:

df1['df1_NaNs'] = '' # create additional column to collect NaNs
df2['df2_NaNs'] = '' # create additional column to collect NaNs

df1_s = df1.merge(df2[['key_column1', 'key_column2', 'df2_NaNs']], on=['key_column1', 'key_column2'], how='outer')
df2 = df2.drop(columns=["df2_NaNs"])   # clean up df2 
df1_s = df1_s.loc[df1_s['df2_NaNs'].isna(), df1.columns]
df1_s = df1_s.drop(columns=["df1_NaNs"]) # clean up df1_s 
print(df1_s)

print('--------------------------------------------')

df2_s = df2.merge(df1[['key_column1', 'key_column2', 'df1_NaNs']], on=['key_column1', 'key_column2'], how='outer')
df1 = df1.drop(columns=["df1_NaNs"])   # clean up df1 
df2_s = df2_s.loc[df2_s['df1_NaNs'].isna(), df2.columns]
df2_s = df2_s.drop(columns=["df2_NaNs"]) # clean up df2_s 
print(df2_s)

gives:

  first second_column key_column1 key_column2 fourth_column
1     2             2           3           2             2
3     2             2           6           1             2
4     2             2           4           1             2
--------------------------------------------
  first second_column key_column1 key_column2 fourth_column
1     2             2           3           5             3
3     2             2           6           2             5
4     2             2           4           2             6

Also the code below works in case the columns of both dataframes are the same and in addition saves memory and computation time by not creating temporary full-sized dataframes required to achieve the final result:

""" I want to compare two dataframes that have similar columns(not all) 
and print a new dataframe that shows the missing rows of df1 compare to 
df2 and a second dataframe that shows this time the missing values of 
df2 compare to df1 based on given columns. Here the "key_columns"
"""
import pandas as pd
#data1 ={  'first_column':['4', '2', '7', '2', '2'],
data1 = {         'first':['4', '2', '7', '2', '2'],
          'second_column':['1', '2', '2', '2', '2'],
            'key_column1':['1', '3', '2', '6', '4'],
            'key_column2':['1', '2', '2', '1', '1'],
          'fourth_column':['1', '2', '2', '2', '2'],
#                 'other':['1', '2', '3', '2', '2'],
        }
df1 = pd.DataFrame(data1)
#print(df1)

data2 = {  'first':['1', '2', '2', '2', '2'],
   'second_column':['1', '2', '2', '2', '2'],
     'key_column1':['1', '3', '2', '6', '4'],
     'key_column2':['1', '5', '2', '2', '2'],
#  'fourth_column':['1', '2', '2', '2', '2'],
   'fourth_column':['2', '3', '4', '5', '6'],
#         'other2':['1', '4', '3', '2', '2'],
#         'other3':['6', '8', '1', '4', '2'],
        }
df2 = pd.DataFrame(data2)
#print(df2)

data1_key_cols = dict.fromkeys( zip(data1['key_column1'], data1['key_column2']) )
data2_key_cols = dict.fromkeys( zip(data2['key_column1'], data2['key_column2']) )
# for Python versions < 3.7 (dictionaries are not ordered):
#data1_key_cols = list(zip(data1['key_column1'], data1['key_column2']))
#data2_key_cols = list(zip(data2['key_column1'], data2['key_column2']))
from collections import defaultdict
missing_data2_in_data1 = defaultdict(list)
missing_data1_in_data2 = defaultdict(list)

for indx, val in enumerate(data1_key_cols.keys()):
#for indx, val in enumerate(data1_key_cols): # for Python version < 3.7
    if val not in data2_key_cols:
        for key, val in data1.items():
            missing_data1_in_data2[key].append(data1[key][indx])
for indx, val in enumerate(data2_key_cols.keys()):
#for indx, val in enumerate(data2_key_cols): # for Python version < 3.7
    if val not in data1_key_cols:
        for key, val in data2.items():
            missing_data2_in_data1[key].append(data2[key][indx])
df1_s = pd.DataFrame(missing_data1_in_data2)
df2_s = pd.DataFrame(missing_data2_in_data1)
print(df1_s)
print('--------------------------------------------')
print(df2_s)

prints

  first second_column key_column1 key_column2 fourth_column
0     2             2           3           2             2
1     2             2           6           1             2
2     2             2           4           1             2
--------------------------------------------
  first second_column key_column1 key_column2 fourth_column
0     2             2           3           5             3
1     2             2           6           2             5
2     2             2           4           2             6

Thank you! The thing is that I work with the dataframes and they are not dictionaries. I created data1 and data2 for the example. So when I try this code on my dataframes it returns an error "KeyError: 15 --> 19 missing_data2_in_data1[key].append(df_actual_dep[key][indx])"
Can you provide a minimal example of the dataframes at which the above solution fails so that I can check if it can be improved to cover the case of missing data? Have you exported your dataframes to dictionaries for use with the code above?
See my updated answer for code working directly on dataframes without the disadvantage of depending on different columns in both dataframes.

Emi OB · Accepted Answer · 2022-09-26 10:21:02Z

If you outer merge on the 2 key columns, with an additional unique column in the second dataframe, that unique column will show Nan where the row is in the first dataframe but not the second. For example:

df2.merge(df1[['key_column1', 'key_column2', 'first_column']], on=['key_column1', 'key_column2'], how='outer')

gives:

  first second_column key_column1  ... other2 other3 first_column
0     1             1           1  ...      1      6            4
1     2             2           3  ...      4      8          NaN
2     2             2           2  ...      3      1            7
3     2             2           6  ...      2      4          NaN
4     2             2           4  ...      2      2          NaN
5   NaN           NaN           3  ...    NaN    NaN            2
6   NaN           NaN           6  ...    NaN    NaN            2
7   NaN           NaN           4  ...    NaN    NaN            2

Here the Nans in 'first_column' correspond to the rows in df2 that are not in df1. You can then use this fact with .loc[] to filter on those Nan rows, and only the columns in df2 like so:

df2_outer.loc[df2_outer['first_column'].isna(), df2.columns]

Output:

  first second_column key_column1 key_column2 fourth_column other2 other3
1     2             2           3           5             2      4      8
3     2             2           6           2             2      2      4
4     2             2           4           2             2      2      2

Full code for both tables is:

df2_outer = df2.merge(df1[['key_column1', 'key_column2', 'first_column']], on=['key_column1', 'key_column2'], how='outer')
print('missing values of df1 compare df2')
df2_output = df2_outer.loc[df2_outer['first_column'].isna(), df2.columns]
print(df2_output)

df1_outer = df1.merge(df2[['key_column1', 'key_column2', 'first']], on=['key_column1', 'key_column2'], how='outer')
print('missing values of df2 compare df1')
df1_output = df1_outer.loc[df1_outer['first'].isna(), df1.columns]
print(df1_output)

Which outputs:

missing values of df1 compare df2
  first second_column key_column1 key_column2 fourth_column other2 other3
1     2             2           3           5             2      4      8
3     2             2           6           2             2      2      4
4     2             2           4           2             2      2      2
missing values of df2 compare df1
  first_column second_column key_column1 key_column2 fourth_column other
1            2             2           3           2             2     2
3            2             2           6           1             2     2
4            2             2           4           1             2     2

Thank you very much but the added column is not a unique one I added it for the example. I have multiple columns in df1 and df2 that are not the same. I just want to be sure to compare them with the two "key columns"

Collectives™ on Stack Overflow

Compare two dataframes for missing rows based on multiple columns python

2 Answers 2

3 Comments

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

3 Comments

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related