How to apply the sklearn OneHotEncoder to a subset of rows in a Pandas Dataframe?


I have a pandas dataframe with both numerical and categorical columns. For any input row (to keep things simple, assume it is a row taken from the original dataframe), I want to find the N most similar rows to it. However, instead of comparing the input row against the entire dataset, I want to compare it only against a subset of the dataframe.
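For illustration, a rough sketch of the setup (apart from PRODUCT_CATEGORY, the columns here are just placeholders, not my real schema):

import pandas as pd

# Toy data just to illustrate the shape of the problem
dataset = pd.DataFrame({
    "PRODUCT_CATEGORY": ["A", "A", "B", "B"],
    "PRICE": [10.0, 12.5, 99.0, 101.0],        # placeholder numeric column
    "COLOR": ["red", "blue", "red", "green"],  # placeholder categorical column
})

input_row = dataset.iloc[0]                                 # row to find matches for
query_df = dataset.query("PRODUCT_CATEGORY == 'A'").copy()  # only compare within this subset

My actual implementation is below.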

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def find_similar_rows(dataset, query_df, input_row, top_n=10):
    # Identify common columns
    common_columns = list(set(dataset.columns) & set(input_row.index))

    numeric_cols = dataset.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = dataset.select_dtypes(include=['object', 'category']).columns

    _, num_imputer, cat_imputer, scaler, encoder = preprocess(dataset.copy(), numeric_cols, categorical_cols)
    processed_query_df, _, _, _, _ = preprocess(query_df, numeric_cols, categorical_cols, num_imputer, cat_imputer, scaler, encoder)
    processed_input_row, _, _, _, _ = preprocess(pd.DataFrame([input_row[common_columns]]), numeric_cols, categorical_cols, num_imputer, cat_imputer, scaler, encoder)

    # Calculate cosine similarities
    cosine_similarities = cosine_similarity(processed_input_row, processed_query_df)[0]

    # Combine cosine similarity with DOMAIN_WEIGHTED_SUM and normalized RECHNUNGS_MENGE
    combined_scores = cosine_similarities + processed_query_df['DOMAIN_WEIGHTED_SUM'].values / 100 + processed_query_df['RECHNUNGS_MENGE'].values / RECHNUNGS_MENGE_NORMALIZATION_FACTOR

    # Get indices of top similar rows based on combined_scores
    top_indices = np.argsort(combined_scores)[::-1][:top_n]

    # Return both indices and scores
    return top_indices, combined_scores[top_indices]


def preprocess(df, numeric_cols, categorical_cols, num_imputer=None, cat_imputer=None, scaler=None, encoder=None):
    # Check if the imputers and scaler are provided, if not, create new ones
    if num_imputer is None:
        num_imputer = SimpleImputer(strategy='mean')
        df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])
    else:
        df[numeric_cols] = num_imputer.transform(df[numeric_cols])

    if cat_imputer is None:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
    else:
        df[categorical_cols] = cat_imputer.transform(df[categorical_cols])

    if scaler is None:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    else:
        df[numeric_cols] = scaler.transform(df[numeric_cols])

    # One-hot encode categorical columns
    if encoder is None:
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_cats = encoder.fit_transform(df[categorical_cols]).toarray()
    else:
        encoded_cats = encoder.transform(df[categorical_cols]).toarray()

    encoded_cat_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

    # Combine numeric and encoded categorical columns
    combined_df = pd.concat([df[numeric_cols], encoded_cat_df], axis=1)

    return combined_df, num_imputer, cat_imputer, scaler, encoder


top_indices, scores = find_similar_rows(dataset, query_df, row, top_n=NUM_SUGGESTED_DEVICES)
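For context, this is roughly how I consume the result: top_indices are positional indices into query_df, so I look the rows up with iloc (the variable names here are just for illustration):

suggested = query_df.iloc[top_indices].copy()  # rows of the subset, most similar first
suggested["similarity_score"] = scores         # attach the combined scores
print(suggested)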

This works fine if query_df == dataset. However, if I take a subset of the dataset (query_df = dataset.copy().query("PRODUCT_CATEGORY == @input_product_category")), the above code throws a "ValueError: Input contains NaN" at

cosine_similarities = cosine_similarity(processed_input_row, processed_query_df)[0]
 

Upon debugging, I can see that the OneHotEncoder isn't transforming query_df properly:

processed_query_df, _, _, _, _ = preprocess(query_df, numeric_cols, categorical_cols, num_imputer, cat_imputer, scaler, encoder)

produces this:

[Screenshot: query_df after applying OneHotEncoding]

Notice how the transformed categorical columns contain NaN values instead of 0s or 1s.
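For reference, this is the kind of quick check I used to see where the NaNs show up (just a debugging snippet on the processed frame, not part of the pipeline):

# Which columns of the processed query frame contain at least one NaN?
print(processed_query_df.columns[processed_query_df.isna().any()].tolist())

# How many rows are affected?
print(processed_query_df.isna().any(axis=1).sum())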

The input row doesn't contain any categorical values that the OneHotEncoder hasn't seen before (it was fit on the entire dataset), and the encoder is created with handle_unknown='ignore' in any case.
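Unseen categories can also be ruled out with a check along these lines (a sketch, assuming access to the fitted encoder returned by preprocess, with categorical_cols in the same order it was fit on):

# Compare the values present in query_df with the categories the encoder learned during fit
for col, known in zip(categorical_cols, encoder.categories_):
    unseen = set(query_df[col].dropna()) - set(known)
    if unseen:
        print(f"{col}: values not seen during fit -> {unseen}")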

Could anyone please help me figure out what might be causing this? Any help would be appreciated!
