Following this post, I experimented with the KNN algorithm from the sklearn and PyOD packages as an unsupervised approach on a benchmark dataset for an anomaly detection task, and I got different results!
*****************************************************************KNN from PYOD lib
/usr/local/lib/python3.7/dist-packages/pyod/models/base.py:413: UserWarning: y should not be presented in unsupervised learning.
  "y should not be presented in unsupervised learning.")
Training time: 3.3526198863983154s
              precision    recall  f1-score   support

           0       0.96      0.90      0.93     16955
           1       0.01      0.04      0.02       663

    accuracy                           0.86     17618
   macro avg       0.49      0.47      0.47     17618
weighted avg       0.92      0.86      0.89     17618
*****************************************************************KNN from sklearn lib
Training time: 0.6735050678253174s
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16955
           1       1.00      1.00      1.00       663

    accuracy                           1.00     17618
   macro avg       1.00      1.00      1.00     17618
weighted avg       1.00      1.00      1.00     17618
I tried different arguments for the second package, setting contamination and n_neighbors, but could not reach the results of the sklearn package. Can someone explain this, or suggest a fix so that the two results are comparable? A sketch of the sweep I tried is below.
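For reference, this is roughly the kind of sweep I ran over those two arguments (a minimal sketch; the grids are illustrative, and X_train / X_test come from the split in the code further down):

import itertools
from pyod.models.knn import KNN

# illustrative grids; PyOD requires contamination in (0, 0.5]
for k, c in itertools.product((5, 10, 20), (0.01, 0.05, 0.1)):
    det = KNN(n_neighbors=k, contamination=c)
    det.fit(X_train)                         # unsupervised: no labels passed
    print(k, c, det.predict(X_test).sum())   # number of test points flagged as outliers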
The full pre-processing code (no normalization, since the data is clean, following this answer to avoid bias):
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
from sklearn import metrics
dataset_name = 'http'
from sklearn.datasets import fetch_kddcup99
http = fetch_kddcup99(subset='http', data_home=None, shuffle=False, random_state=None, percent10=True,
download_if_missing=True, return_X_y=False, as_frame=True)
df = http.frame
name_target = 'labels'
#change types of feature columns
df['duration']=df['duration'].astype(float)
df['src_bytes']=df['src_bytes'].astype(float)
df['dst_bytes']=df['dst_bytes'].astype(float)
num_rows, num_cols = df.shape
#calculate number of classes
classes = df[name_target].unique()
num_class = len(classes)
print(df[name_target].value_counts())
#determine which class is normal (is not anomaly)
label = np.array(df[name_target])
a,b = np.unique(label , return_counts=True)
#print("a is:",a)
#print("b is:",b)
for i in range(len(b)):
    if b[i] == b.max():
        normal = a[i]
        #print('normal:', normal)
    elif b[i] == b.min():
        unnormal = a[i]
        #print('unnorm:', unnormal)
# show anomaly classes
anomaly_class = []
for f in range(len(a)):
    if a[f] != normal:
        anomaly_class.append(a[f])
# convert the dataset classes to two classes: normal and unnormal
label = np.where(label != normal, unnormal, label)
df[name_target]=label
# count column types: numerical or categorical
numeric =0
categoric = 0
for i in range(df.shape[1]):
    df_col = df.iloc[:, i]
    if df_col.dtype == int and df.columns[i] != name_target:
        numeric += 1
    elif df_col.dtype == float and df.columns[i] != name_target:
        numeric += 1
    elif df.columns[i] != name_target:
        categoric += 1
#replace labels with 0 and 1
label = np.where(label == normal, 0 ,1)
df[name_target]=label
# null check: if more than half of a column is null, drop that column;
# otherwise, replace the nulls with the mean of that column
test = []
for i in range(df.shape[1]):
    if df.iloc[:, i].isnull().sum() > df.shape[0] // 2:
        test.append(i)
    elif df.iloc[:, i].isnull().sum() < df.shape[0] // 2 and df.iloc[:, i].isnull().sum() != 0:
        m = df.iloc[:, i].mean()
        df.iloc[:, i] = df.iloc[:, i].replace(to_replace=np.nan, value=m)
df = df.drop(columns=df.columns[test])
#calculate anomaly rate
b = df[name_target].value_counts()
Anomaly_rate= b[1] / (b[0]+b[1])
print(Anomaly_rate)
contamination= float("{:.4f}".format(Anomaly_rate))
print(contamination)
#rename labels column
df = df.rename(columns = {'labels' : 'binary_target'})
#df.to_csv(f'/content/{dataset_name}.csv', index = False)
The full code implementing the KNN models:
!pip install pyod
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import time
from sklearn import metrics
import os
import seaborn as sns
if contamination > 0.5:
    contamination = 0.5
X, y = df.loc[:, df.columns != 'binary_target'], df['binary_target']
seed = 120
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed,stratify=y)
#X_train, X_test, y_train, y_test= train_test_split(df.drop(['binary_target'], axis=1), df['binary_target'], test_size=test_size, random_state=seed)
#create a dataframe
df_all = pd.DataFrame(columns =["method",'TP', 'FP','TN','FN','Accuracy', 'Precision', 'Recall', 'F1_score','Training Time(s)'])
index = df_all.index
index.name = dataset_name
numb = len(df_all)+1
#**********************************************************************KNN
print('*****************************************************************KNN from PYOD lib')
from pyod.models.knn import KNN
model_name_2 = 'KNN'
# train kNN detector
clf_name = 'KNN'
clf = KNN()
start = time.time()
clf.fit(X_train, y_train)  # PyOD's fit() ignores y; passing it triggers the UserWarning above
# get the prediction on the test data
y_test_pred = clf.predict(X_test) # outlier labels (0 or 1)
y_test_scores_knn = clf.decision_function(X_test) # outlier scores
stop = time.time()
train_time_knn = stop - start
print(f"Training time: {stop - start}s")
predictions = [round(value) for value in y_test_pred]
accuracy = accuracy_score(y_test, predictions)
accuracy_2 = accuracy * 100.0
# threshold to hard 0/1 labels (a no-op here: predict() already returns 0/1)
for i in range(len(predictions)):
    if predictions[i] > 0.5:
        predictions[i] = 1
    else:
        predictions[i] = 0
predictions_2 = predictions
# calculate precision, recall, f1-score
from sklearn.metrics import f1_score,recall_score,precision_score
precision = precision_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
recall = recall_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score = f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score_2 = np.mean(f1_score)
precision_2 = np.mean(precision)
recall_2 = np.mean(recall)
# evaluate the classification_report
print(classification_report(y_test, predictions_2))
# evaluate the confusion_matrix
cf_matrix =confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
#**********************************************************************KNN_sklearn
print('*****************************************************************KNN from sklearn lib')
from sklearn.neighbors import KNeighborsClassifier
model_name_6 = 'KNN_sklearn'
# train knn detector
neigh = KNeighborsClassifier()
start = time.time()
neigh.fit(X_train,y_train)
# get the prediction on the test data
y_test_pred_6 = neigh.predict(X_test)
stop = time.time()
train_time_knn_sklearn = stop - start
print(f"Training time: {stop - start}s")
#*****************************************************
predictions = [round(value) for value in y_test_pred_6]
accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))
accuracy_6 = accuracy * 100.0
# threshold to hard 0/1 labels (again a no-op: the classifier already predicts 0/1)
for i in range(len(predictions)):
    if predictions[i] > 0.5:
        predictions[i] = 1
    else:
        predictions[i] = 0
predictions_6 = predictions
# calculate precision, recall, f1-score
from sklearn.metrics import f1_score,recall_score,precision_score
precision = precision_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
recall = recall_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score = f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score_6 = np.mean(f1_score)
precision_6 = np.mean(precision)
recall_6 = np.mean(recall)
# evaluate the classification_report
print(classification_report(y_test, predictions_6))
# evaluate the confusion_matrix
cf_matrix =confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
kNN as a supervised classifier (as in sklearn's KNeighborsClassifier class) predicts based on the class of a point's k nearest neighbors. kNN as an unsupervised outlier detector (as in PyOD's KNN class) measures the distance to the k nearest neighbor(s) and predicts a point to be an outlier if that distance is "large". So there's no reason to expect them to produce the same predictions: for a given problem, the positive class needn't be "outliers" in the traditional sense.
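To see the mechanical difference on toy data, here is a minimal sketch (the two-cluster data and all parameter values are hypothetical; decision_scores_, threshold_ and labels_ are attributes PyOD sets after fit):

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from pyod.models.knn import KNN

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))            # one dense "normal" cluster
X[:5] += 6                               # five isolated points far from it
y = np.r_[np.ones(5), np.zeros(95)]      # positive class = the isolated points

# supervised: a point's label is voted on by its k nearest labeled neighbors
clf = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# unsupervised: a point is flagged when its k-th neighbor distance exceeds the
# cutoff implied by contamination (here 5/100); the labels y are never seen
det = KNN(n_neighbors=5, contamination=0.05).fit(X)
print(det.threshold_)     # distance cutoff learned from the training scores
print(det.labels_[:5])    # the isolated points get 1 only because they are far away

On toy data like this the two models will usually flag the same points, because the positive class really is geometrically isolated. On the http data the classifier additionally learns from y, which the detector never sees, so tuning contamination and n_neighbors only moves the distance cutoff; it cannot make the detector answer the same question as the classifier.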