I'm having trouble finding the optimal number of latent factors for Spark ALS: the code below fails when I execute it. I have a dataframe with 3 columns: user_id, item_id and click_count. It is 34,000,000 records long. The number of unique user_ids is 3,000,000 and the number of unique item_ids is 1,700. Here's the code I'm using:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np
import matplotlib.pyplot as plt
# SparkSession is instantiated
# Data is loaded to df
# Elbow Method for optimal number of latent factors
def elbow_method(data, max_factors=20):
    """Fit one ALS model per rank and return the RMSE obtained for each.

    Args:
        data: DataFrame with the "user_id", "item_id" and "click_count"
            columns that ALS is configured to read below.
        max_factors: Largest rank to try; ranks 1..max_factors are fitted.

    Returns:
        A list of RMSE values; index ``i`` holds the RMSE for rank ``i + 1``.
    """
    # The evaluator does not depend on the rank, so it is built once.
    # labelCol must name the column ALS is trained on ("click_count");
    # the data has no "rating" column.
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="click_count",
        predictionCol="prediction",
    )

    rmse_values = []
    for num_factors in range(1, max_factors + 1):
        als = ALS(
            userCol="user_id",
            itemCol="item_id",
            ratingCol="click_count",
            coldStartStrategy="drop",
            nonnegative=True,
            rank=num_factors,
        )
        model = als.fit(data)
        # NOTE(review): predictions are scored on the same rows the model was
        # fitted on, so this is training error -- consider a held-out split.
        predictions = model.transform(data)
        rmse_values.append(evaluator.evaluate(predictions))
    return rmse_values
# Sweep ranks 1..max_factors and collect one RMSE per rank
max_factors = 20
rmse_values = elbow_method(data=df, max_factors=max_factors)

# Draw the RMSE-versus-rank curve
factor_counts = range(1, max_factors + 1)
plt.figure(figsize=(10, 6))
plt.plot(factor_counts, rmse_values, linestyle='-', marker='o')
plt.xlabel('Number of Latent Factors')
plt.ylabel('RMSE')
plt.title('Elbow Method for Optimal Number of Latent Factors')
plt.grid(True)
plt.show()

# Pick the rank with the smallest RMSE (argmin is 0-based, ranks start at 1)
optimal_num_factors = 1 + np.argmin(rmse_values)
print("Optimal number of latent factors:", optimal_num_factors)
And I'm getting this error:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/opt/python/envs/minimal/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-11-2cb920dac39c>", line 31, in <module>
rmse_values = elbow_method(df, max_factors)
File "<ipython-input-11-2cb920dac39c>", line 20, in elbow_method
model = als.fit(data)
File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/ml/base.py", line 205, in fit
return self._fit(dataset)
File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 381, in _fit
java_model = self._fit_java(dataset)
File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 378, in _fit_java
return self._java_obj.fit(dataset._jdf)
File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/java_gateway.py", line 1322, in __call__
return_value = get_return_value(
File "/opt/python/envs/minimal/lib/python3.8/site-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
return f(*a, **kw)
File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/python/envs/minimal/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2077, in showtraceback
stb = value._render_traceback_()
AttributeError: 'Py4JJavaError' object has no attribute '_render_traceback_'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/clientserver.py", line 516, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/opt/python/envs/minimal/lib/python3.8/site-packages/py4j/clientserver.py", line 539, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving