I am trying to map column names in a dataset to names in a standard, the standard names are in a yaml file so you can change them if you need to. I was able to do an initial automated mapping, but I also need for the user to be able to choose incase the mapping was not done right, the columns they want to manually map themselves. so I was able to make the input after the initial mapping to let the user decide which columns to manually map and which options they have for the columns they selected.
The problem is that the the choice of the user is not overwriting the initial mapping so the columns are not changing to what the user wants it to be.
So I am using fuzzywuzzy to do the mapping, everything works correctly the only problem is in that the user input does not change the initial mapping.
import pandas as pd
import fuzzywuzzy.process as fuzz
import re
import yaml
try:
df = pd.read_excel('Mock Data_typos (2).xlsx')
except Exception as e:
print(f"Error loading data: {str(e)}")
df = None
if df is not None:
last_name_pattern = re.compile(r'\b[lL][aA][sS][tT]\s?[nN][aA][mM][eE]\b|\bLST\s?NM\b')
# Function to find and match columns
def match_columns(df, reference_columns, initial_score_threshold=70):
matched_columns = {} # Initialize the matched_columns dictionary
input_columns = df.columns.tolist()
for column in input_columns:
matches = fuzz.extractBests(column, reference_columns, score_cutoff=initial_score_threshold)
if matches:
matched_columns[column] = matches
return matched_columns
with open('standard_columns.yml', 'r') as yaml_file:
try:
reference_columns = yaml.safe_load(yaml_file)
except Exception as e:
print(f"Error loading reference columns: {str(e)}")
reference_columns = []
if reference_columns:
matched_columns = match_columns(df, reference_columns)
# Initial automated mapping
for index, (column, matches) in enumerate(matched_columns.items()):
if len(matches) > 0:
best_match = matches[0][0] # Get the full matched column name
df.rename(columns={column: best_match}, inplace=True)
print(f"{index}. Column '{column}' mapped to '{best_match}'")
print("Initial mapping finished.")
# Remove columns that are not in reference_columns
columns_to_remove = [col for col in df.columns if col not in reference_columns]
df.drop(columns=columns_to_remove, inplace=True)
# Check if "Last Name" doesn't exist and create it
if "Last Name" not in df.columns:
df["Last Name"] = ""
# Allow the user to specify columns for modification
change_columns_input = input("Enter a list of columns to modify (e.g., '0, 5, 7') or 'none' to skip: ")
if change_columns_input.lower() != 'none':
change_columns_list = [int(col.strip()) for col in change_columns_input.split(',')]
for column_index in change_columns_list:
if 0 <= column_index < len(matched_columns):
selected_column = list(matched_columns.keys())[column_index]
print(f"Mapping options for column {column_index}: '{selected_column}':")
for j, (match, score) in enumerate(matched_columns[selected_column]):
print(f" {j}. Map to '{match}' (Score: {score})") # Display the full match
while True:
match_choice = input("Enter the number for the mapping, or 'skip' to keep as is: ")
if match_choice.lower() == 'skip':
break
elif match_choice.isdigit():
match_index = int(match_choice)
if 0 <= match_index < len(matched_columns[selected_column]):
chosen_mapping = matched_columns[selected_column][match_index][0]
df.rename(columns={selected_column: chosen_mapping}, inplace=True)
print(f"Column {column_index}: '{selected_column}' has been mapped to '{chosen_mapping}'.")
break
else:
print("Invalid input. Please enter a valid number.")
else:
print("Invalid input. Please enter a valid number or 'skip'.")
print("Mapping and cleanup finished. Updated DataFrame:")
print(df)
else:
print("No reference columns loaded. Please check the reference columns file.")