Finding the optimal number of latent factors for Spark ALS algorithm and dealing with errors

21 Views Asked by At

I'm having trouble finding the optimal number of latent factors for Spark ALS: the code below fails when I try to execute it. I have a dataframe with 3 columns: user_id, item_id, and click_count. It is 34,000,000 records long. The number of unique user_ids is 3,000,000 and the number of unique item_ids is 1,700. Here's the code I'm using:

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np
import matplotlib.pyplot as plt

# SparkSession is instantiated
# Data is loaded to df

# Elbow Method for optimal number of latent factors
def elbow_method(data, max_factors=20, eval_data=None):
    """Fit one ALS model per rank and return the RMSE obtained for each.

    Args:
        data: DataFrame with ``user_id``, ``item_id`` and ``click_count``
            columns; every model is fitted on it.
        max_factors: Largest rank to try. Ranks ``1..max_factors`` are
            fitted in ascending order.
        eval_data: Optional DataFrame with the same columns on which the
            RMSE is computed. When omitted, the RMSE is computed on
            ``data`` itself, as before.

    Returns:
        A list of RMSE values; element ``i`` belongs to rank ``i + 1``.
    """
    # The label column has to be the same column ALS was trained on
    # ("click_count"); the frame has no "rating" column to score against.
    # The evaluator is independent of the rank, so it is built only once.
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="click_count", predictionCol="prediction")

    # NOTE(review): scoring on the training rows is expected to favour the
    # largest rank; pass a held-out split as eval_data (e.g. from
    # data.randomSplit) to get a meaningful curve.
    scoring_data = data if eval_data is None else eval_data

    rmse_values = []
    for num_factors in range(1, max_factors + 1):
        als = ALS(userCol="user_id", itemCol="item_id", ratingCol="click_count", coldStartStrategy="drop", nonnegative=True, rank=num_factors)
        model = als.fit(data)
        predictions = model.transform(scoring_data)
        rmse_values.append(evaluator.evaluate(predictions))

    return rmse_values

# Compute one RMSE per candidate rank, from 1 up to max_factors
max_factors = 20
rmse_values = elbow_method(df, max_factors)

# Draw the RMSE-versus-rank curve
factor_counts = range(1, max_factors + 1)
plt.figure(figsize=(10, 6))
plt.plot(factor_counts, rmse_values, marker='o', linestyle='-')
plt.xlabel('Number of Latent Factors')
plt.ylabel('RMSE')
plt.title('Elbow Method for Optimal Number of Latent Factors')
plt.grid(True)
plt.show()

# argmin yields a zero-based position while ranks start at 1, hence the offset
optimal_num_factors = 1 + np.argmin(rmse_values)
print("Optimal number of latent factors:", optimal_num_factors)

And I'm getting this error:

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-2cb920dac39c>", line 31, in <module>
    rmse_values = elbow_method(df, max_factors)
  File "<ipython-input-11-2cb920dac39c>", line 20, in elbow_method
    model = als.fit(data)
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/ml/base.py", line 205, in fit
    return self._fit(dataset)
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 381, in _fit
    java_model = self._fit_java(dataset)
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 378, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2077, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'Py4JJavaError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
0

There are 0 best solutions below