Is there any acceptable range for NMF reconstruction error?


I ran NMF with the model configuration and iterative optimization shown below and got a relative reconstruction error of 0.36 (36% error) against the original data. How good is this reconstruction? Is there an "acceptable" range for relative error (or any other error metric) in the context of Non-negative Matrix Factorization (NMF), or for machine learning tasks in general?
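
For reference, by "relative error" I mean the Frobenius-norm ratio, as computed in the last lines of the code below. A minimal sketch, where X stands for the data matrix and W, H for the fitted NMF factors:

import numpy as np

# Relative reconstruction error: ||X - W @ H||_F / ||X||_F
relative_error = np.linalg.norm(X - W @ H, 'fro') / np.linalg.norm(X, 'fro')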

I tried the following code and was expecting a reconstruction error below 0.1 (10% error):

import os
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

 
# Define the working directory
os.chdir(os.path.expanduser("~/Desktop"))


# Load the CSV data (filename is a placeholder)
data = pd.read_csv("abc")


def fit_and_evaluate_nmf(train_data, test_data, model):
    # Fit NMF on the training fold and keep the learned basis H
    W_train = model.fit_transform(train_data)
    H_train = model.components_

    # Project the held-out fold onto the learned components
    W_test = model.transform(test_data)
    reconstructed_test = np.dot(W_test, H_train)

    # Absolute reconstruction error (Frobenius norm) on the test fold
    error = np.linalg.norm(test_data.to_numpy() - reconstructed_test, 'fro')

    return W_train, error
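
# Note: the error above is an absolute Frobenius norm, so its magnitude
# depends on the scale of the data. A hypothetical helper (not in the
# original script) for a scale-free version that is comparable across folds:
def relative_frobenius_error(X, X_hat):
    X = np.asarray(X)
    # ||X - X_hat||_F / ||X||_F
    return np.linalg.norm(X - X_hat, 'fro') / np.linalg.norm(X, 'fro')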


def extract_top_variables(H_consensus, num_top_variables, data_columns):
    num_original_variables = len(data_columns)
    rows = []

    for component_idx, component_scores in enumerate(H_consensus):
        # Rank variables by their loading on this component, highest first
        sorted_indices = np.argsort(component_scores)[::-1]
        top_variable_indices = sorted_indices[:num_top_variables]

        for variable_idx in top_variable_indices:
            # Map an aggregated column index back to an original variable
            original_variable_idx = variable_idx % num_original_variables
            rows.append({
                'Component': component_idx + 1,
                'Variable': data_columns[original_variable_idx],
                'Score': component_scores[variable_idx]
            })

    # DataFrame.append was removed in pandas 2.0; build from a list of dicts instead
    return pd.DataFrame(rows, columns=['Component', 'Variable', 'Score'])


# Configuration & Initialization
n_splits = 10
R = 100
seed = 0
K_range = list(range(2, 31))
num_top_variables_range = list(range(1, 25))
solvers = ['cd', 'mu']
init_methods = ['random', 'nndsvd', 'nndsvda', 'nndsvdar']

W_list = []
errors = []

# Cross-validation and NMF fitting
for run in range(R):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=run)
    W_fold_list = []

    for train_index, test_index in kf.split(data):
        for solver in solvers:
            for init_method in init_methods:
                nmf_model = NMF(
                    n_components=max(K_range), solver=solver, init=init_method,
                    random_state=seed, max_iter=1000, tol=1e-4,
                    # `alpha` was split into alpha_W / alpha_H in scikit-learn 1.2
                    alpha_W=0.1, l1_ratio=0.5
                )
                
                W_train, error = fit_and_evaluate_nmf(data.iloc[train_index], data.iloc[test_index], nmf_model)
                errors.append(error)
                W_fold_list.append(W_train)
        
    W_list.append(np.concatenate(W_fold_list, axis=0))

mean_error = np.mean(errors)
print(f"Mean Reconstruction Error (Frobenius norm) over all runs and folds: {mean_error:.4f}")

W_aggregate = np.concatenate(W_list, axis=1)

# NMF Optimization over components and variables
best_nmf_model = None
best_variability_explained = 0.0
best_num_components = 0
best_num_variables_per_component = 0

for K in K_range:
    for num_top_variables in num_top_variables_range:
        for solver in solvers:
            for init_method in init_methods:
                final_nmf_model = NMF(n_components=K, solver=solver, init=init_method, random_state=seed, max_iter=500)
                W_consensus = final_nmf_model.fit_transform(W_aggregate)
                H_consensus = final_nmf_model.components_

                variability_explained = np.sum(H_consensus[:, :num_top_variables])
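                # Note (added caveat): this sum over the first num_top_variables
                # columns of H_consensus is a heuristic score, not "variance
                # explained" (which would compare ||X - WH||_F**2 with ||X||_F**2).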
                if variability_explained > best_variability_explained:
                    best_nmf_model = final_nmf_model
                    best_variability_explained = variability_explained
                    best_num_components = K
                    best_num_variables_per_component = num_top_variables

# Evaluate the best model on original data
W_consensus = best_nmf_model.fit_transform(data)
H_consensus = best_nmf_model.components_
reconstructed = np.dot(W_consensus, H_consensus)
relative_error = np.linalg.norm(data.to_numpy() - reconstructed, 'fro') / np.linalg.norm(data.to_numpy(), 'fro')
print("Relative error:", relative_error)