Please tell me why DBSCAN returns only 1 cluster? How to fix this problem? I pass the previously calculated distances to the DBSCAN constructor. I also tried to change the input data. Changed the dataset and so on. Please tell me, what could be the problem? Maybe it's worth changing the algorithm?
My code:
import pandas as pd
import numpy as np
import ppdeep as pp
from sklearn.cluster import DBSCAN
init_data = pd.read_csv('./spam2.csv')
data = init_data[['Label', 'Body']]
data.dropna(inplace=True)
data = data.rename(columns={'Body': 'message', 'Label': 'class'})
data = data.reindex(columns=['class', 'message'])
data = data.drop_duplicates(subset='message', keep="first")
EMAIL_RE = re.compile("[\w.+-]+@[\w-]+\.[\w.-]+")
URLS_RE = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:\%[0-9a-fA-F][0-9a-fA-F]))+")
PUNCTUATION_RE = re.compile(r'[!"\#\$%\&\'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~]')
NOT_LETTERS_OR_SPACE_RE = re.compile("[^A-Za-z ]")
REPEATING_LETTERS_RE = re.compile(r'([a-z])\1{2,}')
def prepare_message(message):
# Convert to lower case
text = message.lower()
# Сonverts HTML codes into characters
text = html.unescape(text)
# Remove email
text = re.sub(EMAIL_RE, ' ', text)
# Remove urls
text = re.sub(URLS_RE, ' ', text)
# Remove all punctuation symbols
text = re.sub(PUNCTUATION_RE, ' ', text)
# Remove all except letters and space
text = re.sub(NOT_LETTERS_OR_SPACE_RE, '', text)
# Replace repeating letters
text = re.sub(REPEATING_LETTERS_RE, r'\1', text)
# Split by space and stemming with PorterStemmer
ps = PorterStemmer()
return ' '.join([ps.stem(word) for word in text.split()])
data['message'] = data['message'].apply(prepare_message)
data['hash'] = data['message'].apply(pp.hash)
x = data['hash']
y = data['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify=y)
import numba
@numba.jit(parallel=True, cache=True, fastmath=True)
def calc_distances(x_train, x):
count = 0
n = len(x_train)
max_count = (n**2 - n) // 2
for i in range(n):
for j in range(i):
x[i,j] = pp.compare(x_train[i], x_train[j])
x[j,i] = x[i,j]
count += 1
print(f"\r{count}/{max_count}", end='')
n = len(x_train)
distances = np.zeros((n, n))
calc_distances(list(x_train), distances)
np.fill_diagonal(distances, 100.0)
db = DBSCAN(eps=0.5, min_samples=2, min_metric='precomputed').fit(distances)
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
I tried all the options for DBSCAN parameters