I have around 5 different cases, and from each case I extract 13 or 14 statistical features. I want to build an anomaly detector: first I reduce the feature matrix using Principal Component Analysis (PCA), then I use a Self-Organising Map (SOM) to organise the data into clusters so that the structure becomes much clearer, and finally I apply one of the following techniques to do the anomaly detection (I got them from this link: Machine learning for anomaly detection and condition monitoring):
- The Mahalanobis distance metric
- Autoencoder model
My questions are:
- Is my approach correct? (See the code below)
- How can I get the data points out of the SOM so that I can apply the Mahalanobis distance metric to those new data points?
- How can I find the right parameters to use in SOM?
- What should I do if the matrix from the SOM has negative values?
- Can you explain what the quantization error is, and is it true that the smaller it is, the better? I keep getting an error in this range:
quantization error: 0.8791745577185559
The code:
def cov_matrix(data, verbose=False):
    """Return the covariance matrix of ``data`` together with its inverse.

    The covariance is computed with ``rowvar=False``, i.e. each column of
    ``data`` is a variable and each row an observation. Both the covariance
    matrix and its inverse must pass ``is_pos_def``.

    Args:
        data: 2-D array-like of observations accepted by ``np.cov``.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(covariance_matrix, inv_covariance_matrix)``.

    Raises:
        ValueError: If the covariance matrix or its inverse is not positive
            definite. Previously the function printed a message and fell
            through, returning ``None``, which made the caller's tuple
            unpacking fail with an unrelated ``TypeError``.
    """
    covariance_matrix = np.cov(data, rowvar=False)
    if not is_pos_def(covariance_matrix):
        raise ValueError("Error: Covariance Matrix is not positive definite!")

    inv_covariance_matrix = np.linalg.inv(covariance_matrix)
    if not is_pos_def(inv_covariance_matrix):
        raise ValueError(
            "Error: Inverse of Covariance Matrix is not positive definite!"
        )

    return covariance_matrix, inv_covariance_matrix
def MahalanobisDist(inv_cov_matrix, mean_distr, data, verbose=False):
    """Compute the Mahalanobis distance of every row of ``data``.

    For each observation ``x`` the distance is
    ``sqrt((x - mean)^T . inv_cov . (x - mean))``.

    Args:
        inv_cov_matrix: Inverse covariance matrix, shape ``(d, d)``.
        mean_distr: Mean vector of the reference distribution, length ``d``.
        data: 2-D array-like with one observation per row, shape ``(n, d)``.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        A list with one distance per row of ``data``.
    """
    # Coerce to float arrays so lists / DataFrames work as well as ndarrays.
    inv_covariance_matrix = np.asarray(inv_cov_matrix, dtype=float)
    diff = np.asarray(data, dtype=float) - np.asarray(mean_distr, dtype=float)

    # Row-wise quadratic form diff_i . inv_cov . diff_i for all rows at once,
    # replacing the per-row Python loop.
    squared = np.sum((diff @ inv_covariance_matrix) * diff, axis=1)
    return list(np.sqrt(squared))
def MD_detectOutliers(dist, extreme=False, verbose=False):
    """Return the indices of distances at or above the outlier threshold.

    The threshold is the mean of ``dist`` multiplied by 3 when ``extreme`` is
    truthy and by 2 otherwise (the same rule ``MD_threshold`` uses).

    Args:
        dist: 1-D sequence of distances.
        extreme: Use the stricter multiplier of 3 instead of 2.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        An integer ``np.ndarray`` of positions ``i`` with
        ``dist[i] >= threshold``. The array has an integer dtype even when it
        is empty, so it can always be used to index another array.
    """
    k = 3. if extreme else 2.
    distances = np.asarray(dist, dtype=float)
    if distances.size == 0:
        # Nothing to flag; also avoids taking the mean of an empty array.
        return np.array([], dtype=np.intp)

    threshold = np.mean(distances) * k
    return np.flatnonzero(distances >= threshold)
def MD_threshold(dist, extreme=False, verbose=False):
    """Return the outlier cut-off for a collection of distances.

    The cut-off is the mean of ``dist`` scaled by 3 when ``extreme`` is
    truthy and by 2 otherwise. ``verbose`` is accepted but not used.
    """
    if extreme:
        multiplier = 3.
    else:
        multiplier = 2.
    return multiplier * np.mean(dist)
def is_pos_def(A):
    """Report whether matrix ``A`` is symmetric and positive definite.

    ``A`` must first compare close to its own transpose; after that, a
    Cholesky factorisation is attempted and its success decides the answer.
    """
    # Guard: reject anything that is not (numerically) symmetric.
    if not np.allclose(A, A.T):
        return False

    try:
        np.linalg.cholesky(A)
    except np.linalg.LinAlgError:
        # The factorisation failed, so A is not positive definite.
        return False
    return True
## Get the statistical features
## Form the feature matrix
## Obtain the principal components
## Run a SOM on the principal components (I am using MiniSom)

# Set up the SOM on a 1 x 5 grid and train it on the PCA output.
som_shape = (1, 5)
full_PCA_dataframe_np = full_pca_dataframe.to_numpy()
som = MiniSom(
    som_shape[0],
    som_shape[1],
    full_PCA_dataframe_np.shape[1],
    sigma=.4,
    learning_rate=.15,
    neighborhood_function='gaussian',
)
som.train_batch(full_PCA_dataframe_np, 8000, verbose=True)

# Each neuron is treated as one cluster: look up the winning neuron of every
# sample, then flatten its 2-D grid coordinates into a single cluster id.
winner_coordinates = np.array([som.winner(x) for x in full_PCA_dataframe_np]).T
cluster_index = np.ravel_multi_index(winner_coordinates, som_shape)

# Scatter the samples of each cluster using the first two data dimensions.
for c in np.unique(cluster_index):
    members = full_PCA_dataframe_np[cluster_index == c]
    plt.scatter(members[:, 0], members[:, 1], label=f'cluster={c}', alpha=.5)

# Overlay the neuron weights (the cluster centroids) as black crosses.
for centroid in som.get_weights():
    plt.scatter(
        centroid[:, 0],
        centroid[:, 1],
        marker='x',
        s=25,
        linewidths=5,
        color='k',
        label='centroid',
    )

plt.legend()
plt.show()
## Get the data points and apply the Mahalanobis distance metric to each case:
data_train = np.array(X_train_PCA.values)  # Say Case 1
data_test = np.array(X_test_PCA.values)  # Say Case 3

# Obtain the covariance matrix and compute the Mahalanobis distances.
# The result is stored as `covariance_matrix` rather than `cov_matrix` so the
# `cov_matrix` function is not overwritten and this code can be run again.
covariance_matrix, inv_cov_matrix = cov_matrix(data_train)
mean_distr = data_train.mean(axis=0)

dist_test = MahalanobisDist(inv_cov_matrix, mean_distr, data_test, verbose=False)
dist_train = MahalanobisDist(inv_cov_matrix, mean_distr, data_train, verbose=False)
# The threshold comes from the training distances only and is reused for test.
threshold = MD_threshold(dist_train, extreme=True)

# Form a table with an anomaly column for the training data:
anomaly_train = pd.DataFrame()
anomaly_train['Mob dist'] = dist_train
anomaly_train['Thresh'] = threshold
# If the distance is above the threshold: flag as anomaly
anomaly_train['Anomaly'] = anomaly_train['Mob dist'] > anomaly_train['Thresh']
anomaly_train.index = X_train_PCA.index

# Same table for the test data, using the training threshold:
anomaly = pd.DataFrame()
anomaly['Mob dist'] = dist_test
anomaly['Thresh'] = threshold
# If the distance is above the threshold: flag as anomaly
anomaly['Anomaly'] = anomaly['Mob dist'] > anomaly['Thresh']
anomaly.index = X_test_PCA.index
anomaly.head()
Another question regarding the SOM: my input to the SOM (the PCA output) has 50 rows and 2 columns, and I have 5 clusters. What do I need to pass to the SOM in this case?
Here is my code using MiniSom:
# Set up a 7 x 7 SOM and train it on the PCA output.
# NOTE(review): a 7 x 7 grid has 49 neurons; with the 50-row input described
# above that is close to one neuron per sample -- confirm this is intended.
som_shape = (7, 7)
full_PCA_dataframe_np = full_pca_dataframe.to_numpy()
som = MiniSom(
    som_shape[0],
    som_shape[1],
    full_PCA_dataframe_np.shape[1],
    sigma=.5,
    learning_rate=.5,
    neighborhood_function='gaussian',
)
som.train_batch(full_PCA_dataframe_np, 8000, verbose=True)