I am trying to classify data points from this dataset: https://sharon.srworkspace.com/ml/datasets/hw1/wine.data.csv. I am utilizing a Gaussian Bayes and Gaussian Naive Bayes classifiers almost from scratch in Python. So, after the train test split for the model, I implemented these functions to classify the data points:

import numpy as np
from scipy.stats import multivariate_normal

def classify_point_gaussian_bayes(x):
    classes = np.unique(y)
    likelihoods = []
    
    for c in classes:
        class_data = data[y == c]
        prior = len(class_data) / len(data)
        mean = np.mean(class_data, axis=0)
        cov = np.cov(class_data.T)
                
        likelihood = multivariate_normal.pdf(x_reshaped, mean=mean, cov=cov, allow_singular=True)
        likelihoods.append(prior * likelihood)
    
    return classes[np.argmax(likelihoods)]

def classify_point_gaussian_naive_bayes(x):
    classes = np.unique(y)
    likelihoods = []
    
    for c in classes:
        class_data = data[y == c]
        prior = len(class_data) / len(data)
        mean = np.mean(class_data, axis=0)
        var = np.var(class_data, axis=0)
                
        likelihood = multivariate_normal.pdf(x_reshaped, mean=mean, cov=np.diag(var), allow_singular=True)
        likelihoods.append(prior * likelihood)
    
    return classes[np.argmax(likelihoods)]

And then I have to look at the test accuracies for both methods, which I did in this form:

res = []
for idx, test_point in enumerate(X_test.values):
    res.append(classify_point_gaussian_bayes(test_point) == y_test[idx])
print(f'Test accuracy for gaussian bayes is {res.count(True)/len(res)}')

res = []
for idx, test_point in enumerate(X_test.values):
    res.append(classify_point_gaussian_naive_bayes(test_point) == y_test[idx])
print(f'Test accuracy for gaussian naive bayes is {res.count(True)/len(res)}')

But I continue to have the same error: ValueError: operands could not be broadcast together with shapes (1,13) (14,).


More specifically:

ValueError                                Traceback (most recent call last)
Cell In[42], line 3
      1 res = []
      2 for idx, test_point in enumerate(X_test.values):
----> 3   res.append(classify_point_gaussian_bayes(test_point) == y_test[idx])
      4 print(f'Test accuracy for gaussian bayes is {res.count(True)/len(res)}')
      6 res = []

Cell In[41], line 21
     18     # Reshape x to have the same number of features as the mean
     19     x_reshaped = x.reshape(1, -1)
---> 21     likelihood = multivariate_normal.pdf(x_reshaped, mean=mean, cov=cov, allow_singular=True)
     22     likelihoods.append(prior * likelihood)
     24 return classes[np.argmax(likelihoods)]

File c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\stats\_multivariate.py:583, in multivariate_normal_gen.pdf(self, x, mean, cov, allow_singular)
    581 dim, mean, cov_object = params
    582 x = self._process_quantiles(x, dim)
--> 583 out = np.exp(self._logpdf(x, mean, cov_object))
    584 if np.any((cov_object.rank < dim)):
    585     out_of_bounds = ~cov_object._support_mask(x-mean)

File c:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\stats\_multivariate.py:526, in multivariate_normal_gen._logpdf(self, x, mean, cov_object)
    507 """Log of the multivariate normal probability density function.
    508 
    509 Parameters
   (...)
    523 
    524 """
    525 log_det_cov, rank = cov_object.log_pdet, cov_object.rank
--> 526 dev = x - mean
    527 if dev.ndim > 1:
    528     log_det_cov = log_det_cov[..., np.newaxis]

ValueError: operands could not be broadcast together with shapes (1,13) (14,) 

Since it is a problem about dimensions, I tried to resize the x data points that the functions take as argument with this line in both functions: x_reshaped = x.reshape(1, -1) and even: x_reshaped = x.reshape(-1). But it did not work and still gives me the same error as above.

0

There are 0 best solutions below