I'm looking for a way to merge the columns of duplicate rows into a new list-valued column with pandas. Here is the code I have so far:
import pandas as pd

# Sample data: two key groups on (col1, col2, col3) — one with col3 == 3,
# one with col3 == 10 — each containing three duplicate rows.
df = pd.DataFrame({
    "col1": [1, 1, 1, 1, 1, 1],
    "col2": [2, 2, 2, 2, 2, 2],
    "col3": [3, 3, 3, 10, 10, 10],
    "col4": [4, 6, 8, 100, 100, 100],
    "col5": [5, 7, 8, 101, 102, 100],
})
def f(x, y, z):
    # De-duplicate the three values and hand them back as a list.
    # NOTE: ordering follows set iteration order, not the argument order.
    unique_values = {x, y, z}
    return list(unique_values)
# Merge all rows that are duplicated on (col1, col2, col3) into one row per
# group, with a new field holding the unique values of col3/col4/col5.
#
# Fixes over the original approach:
#  - sorted() makes new_col deterministic (the original relied on set
#    iteration order, which is an implementation detail of CPython);
#  - a single ravel() over the value columns replaces the nested
#    comprehension routed through f();
#  - iterating the groupby directly replaces groups.keys() + get_group().
keys = ["col1", "col2", "col3"]
value_cols = ["col3", "col4", "col5"]

# Keep only rows that have at least one duplicate on the key columns.
duplicated = df[df.duplicated(keys, keep=False)]

records = []
for name, group in duplicated.groupby(keys):
    print(name)
    print(group)
    # Flatten the value columns of this group into one sorted, unique list.
    merged = sorted(set(group[value_cols].to_numpy().ravel()))
    records.append({**dict(zip(keys, name)), "new_col": merged})

new_df = pd.DataFrame(records)
print(new_df.to_string())
Expected results:
col1 col2 col3 new_col
0 1 2 3 [3, 4, 5, 6, 7, 8]
1 1 2 10 [10, 100, 101, 102]
Is there a more concise and/or faster way to achieve this with pandas?