I ran NMF with cross-validation and iterative optimization over solvers, initializations, and numbers of components, and got a reconstruction error of 0.36 (36% relative error) against the original data. My question is: how good is this reconstruction? Is there an "acceptable" range for the relative error (or any other error metric) in the context of Non-negative Matrix Factorization (NMF), or for machine learning tasks in general?
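To be precise about the metric: by "reconstruction error" I mean the relative Frobenius error, i.e. ||X - WH||_F / ||X||_F. A minimal, self-contained sketch on random placeholder data (not my real dataset) of how that number is obtained:

```
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
X = rng.random((100, 20))  # placeholder non-negative data, only for illustration

model = NMF(n_components=5, init='nndsvda', max_iter=1000, random_state=0)
W = model.fit_transform(X)
H = model.components_

# Relative Frobenius error: the fraction of the data's norm not captured by W @ H
rel_err = np.linalg.norm(X - W @ H, 'fro') / np.linalg.norm(X, 'fro')
print(rel_err)
```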
I tried the following code and was expecting a reconstruction error below 0.1 (10%):

```
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
# Define the working directory
os.chdir(os.path.expanduser("~/Desktop"))
# Load the CSV data
data = pd.read_csv("abc")
def fit_and_evaluate_nmf(train_data, test_data, model):
    # Fit NMF on the training fold, then project the test fold onto the learned basis
    W_train = model.fit_transform(train_data)
    H_train = model.components_
    W_test = model.transform(test_data)
    reconstructed_test = np.dot(W_test, H_train)
    # Absolute Frobenius norm of the test-fold residual
    error = np.linalg.norm(test_data - reconstructed_test, 'fro')
    return W_train, error
def extract_top_variables(H_consensus, num_top_variables, data_columns):
    num_original_variables = len(data_columns)
    rows = []
    for component_idx, component_scores in enumerate(H_consensus):
        # Sort scores in descending order and keep the top-scoring variables
        sorted_indices = np.argsort(component_scores)[::-1]
        top_variable_indices = sorted_indices[:num_top_variables]
        for variable_idx in top_variable_indices:
            original_variable_idx = variable_idx % num_original_variables
            rows.append({
                'Component': component_idx + 1,
                'Variable': data_columns[original_variable_idx],
                'Score': component_scores[variable_idx]
            })
    # DataFrame.append was removed in pandas 2.0; build the frame from the collected rows
    top_variables_df = pd.DataFrame(rows, columns=['Component', 'Variable', 'Score'])
    return top_variables_df
# Configuration & Initialization
n_splits = 10
R = 100
seed = 0
K_range = list(range(2, 31))
num_top_variables_range = list(range(1, 25))
solvers = ['cd', 'mu']
init_methods = ['random', 'nndsvd', 'nndsvda', 'nndsvdar']
W_list = []
errors = []
# Cross-validation and NMF fitting
for run in range(R):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=run)
    W_fold_list = []
    for train_index, test_index in kf.split(data):
        for solver in solvers:
            for init_method in init_methods:
                # `alpha` was split into `alpha_W`/`alpha_H` in scikit-learn 1.2
                nmf_model = NMF(n_components=max(K_range), solver=solver, init=init_method,
                                random_state=seed, max_iter=1000, tol=1e-4,
                                alpha_W=0.1, l1_ratio=0.5)
                W_train, error = fit_and_evaluate_nmf(data.iloc[train_index],
                                                      data.iloc[test_index], nmf_model)
                errors.append(error)
                W_fold_list.append(W_train)
    W_list.append(np.concatenate(W_fold_list, axis=0))

# Mean of the absolute Frobenius errors over held-out folds (not a relative error)
mean_error = np.mean(errors)
print(f"Mean Reconstruction Error (Frobenius norm) over all runs and folds: {mean_error:.4f}")
W_aggregate = np.concatenate(W_list, axis=1)
# NMF optimization over number of components and top variables per component
best_nmf_model = None
best_variability_explained = 0.0
best_num_components = 0
best_num_variables_per_component = 0
for K in K_range:
    for num_top_variables in num_top_variables_range:
        for solver in solvers:
            for init_method in init_methods:
                final_nmf_model = NMF(n_components=K, solver=solver, init=init_method,
                                      random_state=seed, max_iter=500)
                W_consensus = final_nmf_model.fit_transform(W_aggregate)
                H_consensus = final_nmf_model.components_
                variability_explained = np.sum(H_consensus[:, :num_top_variables])
                if variability_explained > best_variability_explained:
                    best_nmf_model = final_nmf_model
                    best_variability_explained = variability_explained
                    best_num_components = K
                    best_num_variables_per_component = num_top_variables
# Evaluate the best model on original data
W_consensus = best_nmf_model.fit_transform(data)
H_consensus = best_nmf_model.components_
reconstructed = np.dot(W_consensus, H_consensus)
# Relative Frobenius error over the full data set
relative_error = np.linalg.norm(data - reconstructed, 'fro') / np.linalg.norm(data, 'fro')
print("Relative error:", relative_error)
```
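To put the 0.36 into context, one thing I considered is sweeping the number of components and looking at how the relative error falls off. A rough sketch that reuses `data`, `K_range`, `seed`, and the imports from the code above (the specific solver and init choices here are only for illustration):

```
# Sketch: relative reconstruction error as a function of the number of components
rel_errors = []
for k in K_range:
    model = NMF(n_components=k, init='nndsvda', solver='cd',
                random_state=seed, max_iter=1000)
    W = model.fit_transform(data)
    H = model.components_
    rel_errors.append(np.linalg.norm(data - W @ H, 'fro') / np.linalg.norm(data, 'fro'))

plt.plot(K_range, rel_errors, marker='o')
plt.xlabel('Number of components (K)')
plt.ylabel('Relative Frobenius error')
plt.title('Reconstruction error vs. number of components')
plt.show()
```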