I have a pandas dataframe with numerical as well as categorical columns. For any input row (to keep things simple we take any row from the orginal dataframe), I want to find the N most similar rows to it. However, instead of comparing the input row against the entire dataset I want to use a subset of the dataframe.
def find_similar_rows(dataset, query_df, input_row, top_n=10):
# Identify common columns
common_columns = list(set(dataset.columns) & set(input_row.index))
numeric_cols = dataset.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = dataset.select_dtypes(include=['object', 'category']).columns
_, num_imputer, cat_imputer, scaler, encoder = preprocess(dataset.copy(), numeric_cols, categorical_cols)
processed_query_df, _, _, _, _ = preprocess(query_df, numeric_cols, categorical_cols, num_imputer, cat_imputer, scaler, encoder)
processed_input_row, _, _, _, _ = preprocess(pd.DataFrame([input_row[common_columns]]), numeric_cols, categorical_cols, num_imputer, cat_imputer, scaler, encoder)
# Calculate cosine similarities
cosine_similarities = cosine_similarity(processed_input_row, processed_query_df)[0]
# Combine cosine similarity with DOMAIN_WEIGHTED_SUM and normalized RECHNUNGS_MENGE
combined_scores = cosine_similarities + processed_query_df['DOMAIN_WEIGHTED_SUM'].values / 100 + processed_query_df['RECHNUNGS_MENGE'].values / RECHNUNGS_MENGE_NORMALIZATION_FACTOR
# Get indices of top similar rows based on combined_scores
top_indices = np.argsort(combined_scores)[::-1][:top_n]
# Return both indices and scores
return top_indices, combined_scores[top_indices]
def preprocess(df, numeric_cols, categorical_cols, num_imputer=None, cat_imputer=None, scaler=None, encoder=None):
# Check if the imputers and scaler are provided, if not, create new ones
if num_imputer is None:
num_imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])
else:
df[numeric_cols] = num_imputer.transform(df[numeric_cols])
if cat_imputer is None:
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
else:
df[categorical_cols] = cat_imputer.transform(df[categorical_cols])
if scaler is None:
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
else:
df[numeric_cols] = scaler.transform(df[numeric_cols])
# One-hot encode categorical columns
if encoder is None:
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df[categorical_cols]).toarray()
else:
encoded_cats = encoder.transform(df[categorical_cols]).toarray()
encoded_cat_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))
# Combine numeric and encoded categorical columns
combined_df = pd.concat([df[numeric_cols], encoded_cat_df], axis=1)
return combined_df, num_imputer, cat_imputer, scaler, encoder
top_indices, scores = find_similar_rows(dataset, query_df, row, top_n=NUM_SUGGESTED_DEVICES)
This works fine if query_df == dataset.
However, if I take a subset of the dataset (query_df = dataset.copy().query("PRODUCT_CATEGORY == @input_product_category")). The above code throws a "ValueError: Input contains NaN" at
cosine_similarities = cosine_similarity(processed_input_row, processed_query_df)[0]
Upon debugging I can see that the OneHotEncoder isn't transforming the query_df properly:
processed_query_df, _, _, _, _ = preprocess(query_df, numeric_cols, categorical_cols, num_imputer, cat_imputer, scaler, encoder)
produces this:
query_df after applying OneHotEncoding
Notice how the transformed categorical columns contain NaN values instead of 0s or 1s.
The input row doesn't contain any categorical values which the OneHotEncoder hasn't seen before (as it was fit on the entire dataset)
Could anyone please help me figure out what might be causing this? Any help would be appreciated!