I've got a dataset on an excel file and I'm trying to train a Bayesian Model in order to make some predictions on the data. I'm using pandas for the dataframe, and pgmpy for the Bayesian Model. I'm able to make the model make predictions on data from the dataset, dropping the columns I want to predict. But as soon as I try to make predictions on a new manually created dataframe, I get this stacktrace:
C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\pgmpy\factors\discrete\DiscreteFactor.py:518: UserWarning: Found unknown state name. Trying to switch to using all state names as state numbers
warn(
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\process_executor.py", line 431, in _process_worker
r = call_item()
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\pgmpy\inference\ExactInference.py", line 367, in map_query
final_distribution = self._variable_elimination(
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\pgmpy\inference\ExactInference.py", line 160, in _variable_elimination
working_factors = self._get_working_factors(evidence)
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\pgmpy\inference\ExactInference.py", line 46, in _get_working_factors
factor_reduced = factor.reduce(
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\pgmpy\factors\discrete\DiscreteFactor.py", line 537, in reduce
phi.values = phi.values[tuple(slice_)]
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\viviana\Documents\GitHub\biblionetAI\src\main\java\it\unisa\c07\biblionet\moduloIntelligenzaArtificiale\questionarioNaiveBayesian.py", line 95, in <module>
predicted = model.predict(to_predict)
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\pgmpy\models\BayesianModel.py", line 579, in predict
pred_values = Parallel(n_jobs=n_jobs)(
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 1054, in __call__
self.retrieve()
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 933, in retrieve
self._output.extend(job.get(timeout=self.timeout))
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\_parallel_backends.py", line 542, in wrap_future_result
return future.result(timeout=timeout)
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py", line 433, in result
return self.__get_result()
File "C:\Users\viviana\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py", line 389, in __get_result
raise self._exception
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
This is the Python code:
dataset = pd.DataFrame(excel.values, columns=['user_answer', 'user_answer1', 'user_answer2',
'user_answer3', 'user_answer4', 'car_genre', 'car_genre1',
'car_genre2', 'genre'])
model = BayesianModel([('user_answer', 'car_genre'), ('user_answer', 'car_genre1'), ('user_answer', 'car_genre2'),
('user_answer1', 'car_genre'), ('user_answer1', 'car_genre1'), ('user_answer1', 'car_genre2'),
('user_answer2', 'car_genre'), ('user_answer2', 'car_genre1'), ('user_answer2', 'car_genre2'),
('user_answer3', 'car_genre'), ('user_answer3', 'car_genre1'), ('user_answer3', 'car_genre2'),
('user_answer4', 'car_genre'), ('user_answer4', 'car_genre1'), ('user_answer4', 'car_genre2'),
('car_genre', 'genre'), ('car_genre1', 'genre'), ('car_genre2', 'genre')])
train_number = int(math.ceil((len(dataset) / 100) * 90))-1
train_data = dataset[:train_number]
predict_data = dataset[train_number:]
model.fit(train_data, BayesianEstimator)
predict_data.pop('genre')
predict_data.pop('car_genre')
predict_data.pop('car_genre1')
predict_data.pop('car_genre2')
#This succeded
predicted = model.predict(predict_data)
answ1 = "Amaro"
answ2 = "Riflessivo, logico, osservatore"
answ3 = "Cinema, teatro, SerieTV"
answ4 = "Non li guardo perché non mi interessa il genere"
pt = pd.DataFrame({"user_answer": answ1, "user_answer1": answ2,
"user_answer2": answ3, "user_answer3": answ4}, index=[0])
#This fails and gives me the error
print(model.predict(pt))
What caused the error? Is there a specific way to format data? How can I fix?