Overwrite Initial automated mapping

30 Views Asked by At

I am trying to map column names in a dataset to names in a standard, the standard names are in a yaml file so you can change them if you need to. I was able to do an initial automated mapping, but I also need for the user to be able to choose incase the mapping was not done right, the columns they want to manually map themselves. so I was able to make the input after the initial mapping to let the user decide which columns to manually map and which options they have for the columns they selected.

The problem is that the the choice of the user is not overwriting the initial mapping so the columns are not changing to what the user wants it to be.

So I am using fuzzywuzzy to do the mapping, everything works correctly the only problem is in that the user input does not change the initial mapping.

import pandas as pd
import fuzzywuzzy.process as fuzz
import re
import yaml

try:
    df = pd.read_excel('Mock Data_typos (2).xlsx')
except Exception as e:
    print(f"Error loading data: {str(e)}")
    df = None

if df is not None:
    last_name_pattern = re.compile(r'\b[lL][aA][sS][tT]\s?[nN][aA][mM][eE]\b|\bLST\s?NM\b')

    # Function to find and match columns
    def match_columns(df, reference_columns, initial_score_threshold=70):
        matched_columns = {}  # Initialize the matched_columns dictionary

        input_columns = df.columns.tolist()

        for column in input_columns:
            matches = fuzz.extractBests(column, reference_columns, score_cutoff=initial_score_threshold)
            if matches:
                matched_columns[column] = matches

        return matched_columns

    with open('standard_columns.yml', 'r') as yaml_file:
        try:
            reference_columns = yaml.safe_load(yaml_file)
        except Exception as e:
            print(f"Error loading reference columns: {str(e)}")
            reference_columns = []

    if reference_columns:
        matched_columns = match_columns(df, reference_columns)

        # Initial automated mapping
        for index, (column, matches) in enumerate(matched_columns.items()):
            if len(matches) > 0:
                best_match = matches[0][0]  # Get the full matched column name
                df.rename(columns={column: best_match}, inplace=True)
                print(f"{index}. Column '{column}' mapped to '{best_match}'")

        print("Initial mapping finished.")

        # Remove columns that are not in reference_columns
        columns_to_remove = [col for col in df.columns if col not in reference_columns]
        df.drop(columns=columns_to_remove, inplace=True)

        # Check if "Last Name" doesn't exist and create it
        if "Last Name" not in df.columns:
            df["Last Name"] = ""

        # Allow the user to specify columns for modification
        change_columns_input = input("Enter a list of columns to modify (e.g., '0, 5, 7') or 'none' to skip: ")

        if change_columns_input.lower() != 'none':
            change_columns_list = [int(col.strip()) for col in change_columns_input.split(',')]

            for column_index in change_columns_list:
                if 0 <= column_index < len(matched_columns):
                    selected_column = list(matched_columns.keys())[column_index]

                    print(f"Mapping options for column {column_index}: '{selected_column}':")
                    for j, (match, score) in enumerate(matched_columns[selected_column]):
                        print(f"  {j}. Map to '{match}' (Score: {score})")  # Display the full match

                    while True:
                        match_choice = input("Enter the number for the mapping, or 'skip' to keep as is: ")
                        if match_choice.lower() == 'skip':
                            break
                        elif match_choice.isdigit():
                            match_index = int(match_choice)
                            if 0 <= match_index < len(matched_columns[selected_column]):
                                chosen_mapping = matched_columns[selected_column][match_index][0]
                                df.rename(columns={selected_column: chosen_mapping}, inplace=True)
                                print(f"Column {column_index}: '{selected_column}' has been mapped to '{chosen_mapping}'.")
                                break
                            else:
                                print("Invalid input. Please enter a valid number.")
                        else:
                            print("Invalid input. Please enter a valid number or 'skip'.")

        print("Mapping and cleanup finished. Updated DataFrame:")
        print(df)

    else:
        print("No reference columns loaded. Please check the reference columns file.")
0

There are 0 best solutions below