What I'm trying to do is find and correct similar names in my database, like 'Patrick Maxwell' and 'Patrick Maxwel.' However, the issue I'm facing is that the best match for each name is often itself, even if it's spelled slightly differently, like 'Patrick Maxwel.' This doesn't help me consolidate the names into a single correct version.
def create_corrected_dict(names_list, threshold=90):
# Filters valid names and removes unwanted characters
filtered_names = [name for name in names_list if name.strip() and not any(char in name for char in ['/','\\','[',']','~'])]
# Pre-calculates the fuzzy correspondences
fuzzy_matches = {}
for name in filtered_names:
match = process.extractOne(name, filtered_names, scorer=fuzz.token_sort_ratio)
if match and match[1] > threshold and match[0] != name:
fuzzy_matches[name] = match[0]
corrected_dict = {}
for name in names_list:
cleaned_name = name.strip()
if not cleaned_name or any(char in cleaned_name for char in ['/','\\','[',']','~']):
corrected_dict[name] = name
elif cleaned_name in fuzzy_matches:
corrected_dict[name] = fuzzy_matches[cleaned_name]
else:
corrected_dict[name] = cleaned_name
return corrected_dict
# Create a correction dictionary
unique_names = df_resultante['names'].unique()
dicionario_corrigido = create_corrected_dict(unique_names)
# Applying name correction
corrected_names = create_corrected_dict(df_resultante['names'].tolist())
df_resultante['Colaborador'] = df_resultante['Colaborador'].map(corrected_names)