Reason for the different results of the KNN algorithm from the PyOD & sklearn packages


Besides this post, I experimented with KNN algorithms from the sklearn and PyOD packages, using an unsupervised approach on a benchmark dataset for an anomaly detection task, and I get different results!

*****************************************************************KNN from PYOD lib
/usr/local/lib/python3.7/dist-packages/pyod/models/base.py:413: UserWarning: y should not be presented in unsupervised learning.
  "y should not be presented in unsupervised learning.")
Training time: 3.3526198863983154s
              precision    recall  f1-score   support

           0       0.96      0.90      0.93     16955
           1       0.01      0.04      0.02       663

    accuracy                           0.86     17618
   macro avg       0.49      0.47      0.47     17618
weighted avg       0.92      0.86      0.89     17618

*****************************************************************KNN from sklearn lib
Training time: 0.6735050678253174s
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16955
           1       1.00      1.00      1.00       663

    accuracy                           1.00     17618
   macro avg       1.00      1.00      1.00     17618
weighted avg       1.00      1.00      1.00     17618

I tried setting different arguments for the PyOD model, such as contamination and n_neighbors, but could not reach the results of the sklearn package. Can someone explain this or suggest a fix so that the results can be compared?
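For reference, here is a minimal sketch of what I mean by setting those arguments (contamination and n_neighbors are constructor parameters of pyod.models.knn.KNN; the values below are illustrative only, applied to the same X_train/X_test split as in the code further down):

from pyod.models.knn import KNN

# illustrative values: contamination is the assumed fraction of outliers used to set
# the decision threshold, and n_neighbors is k for the k-th-neighbour distance score
clf = KNN(contamination=0.04, n_neighbors=5)
clf.fit(X_train)                     # unsupervised fit: no y is passed
y_test_pred = clf.predict(X_test)    # 0 = inlier, 1 = outlier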

The full code for pre-processing, without normalization since the data is clean (following this answer, to avoid bias):


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
from sklearn import metrics


dataset_name = 'http'
from sklearn.datasets import fetch_kddcup99
http = fetch_kddcup99(subset='http', data_home=None, shuffle=False, random_state=None, percent10=True, 
                                download_if_missing=True, return_X_y=False, as_frame=True)
df = http.frame
name_target = 'labels'

#change types of feature columns
df['duration']=df['duration'].astype(float)
df['src_bytes']=df['src_bytes'].astype(float)
df['dst_bytes']=df['dst_bytes'].astype(float)

num_row, num_column = df.shape

#calculate number of classes
classes = df[name_target].unique()
num_class = len(classes)

print(df[name_target].value_counts())

#determine which class is normal (is not anomaly)
label = np.array(df[name_target])
a,b = np.unique(label , return_counts=True)
#print("a is:",a)
#print("b is:",b)
for i in range(len(b)):
  if b[i]== b.max():
    normal = a[i]
    #print('normal:', normal)
  elif b[i] == b.min():
    unnormal = a[i]
    #print('unnorm:' ,unnormal) 

# show anomaly classes
anomaly_class = []
for f in range(len(a)): 
  if a[f] != normal:
    anomaly_class.append(a[f])

# convert dataset labels to 2 classes: normal and unnormal
label = np.where(label != normal, unnormal ,label)
df[name_target]=label

# count column types: numerical or categorical
numeric =0
categoric = 0
for i in range(df.shape[1]):
  df_col = df.iloc[:,i]
  if df_col.dtype == int and df.columns[i] != name_target:
    numeric +=1
  elif df_col.dtype == float and df.columns[i] != name_target:
    numeric += 1
  elif df.columns[i] != name_target:
    categoric += 1

#replace labels with 0 and 1
label = np.where(label == normal, 0 ,1)
df[name_target]=label


# null check: if more than half of a column is null, that column will be dropped;
# otherwise, nulls are replaced with the mean of that column
test = []
for i in range(df.shape[1]):
  if df.iloc[:,i].isnull().sum() > df.shape[0]//2:
    test.append(i)
  elif df.iloc[:,i].isnull().sum() < df.shape[0]//2 and df.iloc[:,i].isnull().sum() != 0:
    m = df.iloc[:,i].mean()
    df.iloc[:,i] = df.iloc[:,i].replace(to_replace = np.nan, value = m)
df = df.drop(columns=df.columns[test])



#calculate anomaly rate 
b = df[name_target].value_counts()
Anomaly_rate= b[1] / (b[0]+b[1])
print(Anomaly_rate)
contamination= float("{:.4f}".format(Anomaly_rate))
print(contamination)
#rename labels column
df = df.rename(columns = {'labels' : 'binary_target'})   

#df.to_csv(f'/content/{dataset_name}.csv', index = False) 

The full code for the implementation of the KNN models:

!pip install pyod

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import time
from sklearn import metrics
import os
import seaborn as sns


if contamination > 0.5:
  contamination = 0.5

X, y = df.loc[:, df.columns != 'binary_target'], df['binary_target']
seed = 120
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed,stratify=y)
#X_train, X_test, y_train, y_test= train_test_split(df.drop(['binary_target'], axis=1), df['binary_target'], test_size=test_size, random_state=seed)

#create a dataframe
df_all = pd.DataFrame(columns =["method",'TP', 'FP','TN','FN','Accuracy', 'Precision', 'Recall', 'F1_score','Training Time(s)'])
index = df_all.index
index.name = dataset_name

numb = len(df_all)+1

#**********************************************************************KNN



print('*****************************************************************KNN from PYOD lib')

from pyod.models.knn import KNN 
model_name_2 = 'KNN'
# train kNN detector
clf_name = 'KNN'
clf = KNN()
start = time.time()



# passing y here triggers the UserWarning above; PyOD's KNN is unsupervised and ignores it
clf.fit(X_train, y_train)

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores_knn = clf.decision_function(X_test)  # outlier scores

stop = time.time()
train_time_knn = stop - start
print(f"Training time: {stop - start}s")

predictions = [round(value) for value in y_test_pred]
accuracy = accuracy_score(y_test, predictions)
accuracy_2 = accuracy * 100.0

for i in range(0,len(predictions)):
  if predictions[i] > 0.5:
    predictions[i]=1
  else:
    predictions[i]=0

predictions_2 = predictions

# calculate precision, recall, f1-score
from sklearn.metrics import f1_score,recall_score,precision_score
precision = precision_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
recall = recall_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score = f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score_2 = np.mean(f1_score)
precision_2 = np.mean(precision)
recall_2 = np.mean(recall)

# evaluate the classification_report
print(classification_report(y_test, predictions_2))

# evaluate the confusion_matrix
cf_matrix =confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()





#**********************************************************************KNN_sklearn


print('*****************************************************************KNN from sklearn lib')

from sklearn.neighbors import KNeighborsClassifier
model_name_6 = 'KNN_sklearn'
# train knn detector
neigh = KNeighborsClassifier()
start = time.time()
neigh.fit(X_train,y_train)

# get the prediction on the test data
y_test_pred_6 = neigh.predict(X_test)

stop = time.time()
train_time_knn_sklearn = stop - start
print(f"Training time: {stop - start}s")

#*****************************************************
predictions = [round(value) for value in y_test_pred_6]
accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))
accuracy_6 = accuracy * 100.0

for i in range(0,len(predictions)):
  if predictions[i] > 0.5:
    predictions[i]=1
  else:
    predictions[i]=0

predictions_6 = predictions

# calculate precision, recall, f1-score
from sklearn.metrics import f1_score,recall_score,precision_score
precision = precision_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
recall = recall_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score = f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))
f1_score_6 = np.mean(f1_score)
precision_6 = np.mean(precision)
recall_6 = np.mean(recall)

# evaluate the classification_report
print(classification_report(y_test, predictions_6))

# evaluate the confusion_matrix
cf_matrix =confusion_matrix(y_test, predictions)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

There is 1 answer below


kNN as a supervised classifier (as in sklearn's KNeighborsClassifier) predicts based on the majority class among a point's k nearest neighbors in the training set.

kNN as an unsupervised outlier detector (as in PyOD's KNN class) measures the distance to the k nearest neighbor(s) and predicts a point to be an outlier if that distance is "large".
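In PyOD this shows up directly in how predict works: the label for a test point comes from thresholding its distance-based score, and the threshold is derived from the training scores together with the contamination rate (a sketch of that relationship, using PyOD's decision_function method and threshold_ attribute):

# after clf = KNN(...).fit(X_train)
scores = clf.decision_function(X_test)           # k-th-neighbour distances
labels = (scores > clf.threshold_).astype(int)   # effectively what clf.predict(X_test) returns
# clf.threshold_ is set so that roughly a `contamination` fraction of the
# training points score above it; y_test is never consulted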

So there's no reason to expect them to produce the same predictions: for a given problem, the positive class needn't be "outliers" in the traditional sense.
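To make that concrete, here is a minimal sketch on synthetic 2-D data (the data and parameter values are made up for illustration): the supervised classifier recovers class 1 because it is labelled, while the unsupervised detector does not flag it because it is not a distance outlier.

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from pyod.models.knn import KNN

rng = np.random.RandomState(0)
# two equally dense clusters; class 1 is a different region, not a distance outlier
X0 = rng.normal(loc=0.0, scale=0.3, size=(200, 2))
X1 = rng.normal(loc=3.0, scale=0.3, size=(200, 2))
X_train = np.vstack([X0, X1])
y_train = np.r_[np.zeros(200, dtype=int), np.ones(200, dtype=int)]
X_test = np.array([[0.0, 0.0], [3.0, 3.0]])

# supervised kNN: votes on the labels of the nearest training points
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
print(clf.predict(X_test))            # expected: [0 1]

# unsupervised kNN: flags points whose k-th-neighbour distance is large;
# both test points sit inside dense clusters, so neither looks anomalous
det = KNN(n_neighbors=5, contamination=0.1).fit(X_train)   # y is never used
print(det.predict(X_test))            # expected: [0 0]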