DBSCAN returns only 1 cluster

Can someone tell me why DBSCAN returns only one cluster, and how to fix this? I pass previously calculated distances to the DBSCAN constructor. I have also tried changing the input data, swapping in a different dataset, and so on. What could the problem be? Should I perhaps switch to a different algorithm?
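
For context: as far as I understand, pp.compare takes two ppdeep fuzzy hashes and returns a similarity score between 0 and 100 (higher means more similar), so those are the values that end up in my matrix. A tiny illustration (the two strings are made up):

import ppdeep as pp

h1 = pp.hash('please confirm the meeting scheduled for tomorrow at ten in the morning')
h2 = pp.hash('please confirm the meeting scheduled for today at ten in the morning')
score = pp.compare(h1, h2)  # integer similarity score in [0, 100]
print(score)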

My code:

import re
import html

import pandas as pd
import numpy as np
import ppdeep as pp

from nltk.stem import PorterStemmer
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split

init_data = pd.read_csv('./spam2.csv')
data = init_data[['Label', 'Body']]
data.dropna(inplace=True)
data = data.rename(columns={'Body': 'message', 'Label': 'class'})
data = data.reindex(columns=['class', 'message'])
data = data.drop_duplicates(subset='message', keep="first")

EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
URLS_RE = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:\%[0-9a-fA-F][0-9a-fA-F]))+")
PUNCTUATION_RE = re.compile(r'[!"\#\$%\&\'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~]')
NOT_LETTERS_OR_SPACE_RE = re.compile(r"[^A-Za-z ]")
REPEATING_LETTERS_RE = re.compile(r'([a-z])\1{2,}')

def prepare_message(message):
    # Convert to lower case
    text = message.lower()

    # Convert HTML entities to characters
    text = html.unescape(text)

    # Remove email
    text = re.sub(EMAIL_RE, ' ', text)

    # Remove urls
    text = re.sub(URLS_RE, ' ', text)

    # Remove all punctuation symbols
    text = re.sub(PUNCTUATION_RE, ' ', text)

    # Remove all except letters and space
    text = re.sub(NOT_LETTERS_OR_SPACE_RE, '', text)

    # Replace repeating letters
    text = re.sub(REPEATING_LETTERS_RE, r'\1', text)

    # Split on whitespace and stem each word with PorterStemmer
    ps = PorterStemmer()
    return ' '.join([ps.stem(word) for word in text.split()])

data['message'] = data['message'].apply(prepare_message)

# Compute a ppdeep fuzzy hash for each preprocessed message
data['hash'] = data['message'].apply(pp.hash)

x = data['hash']
y = data['class']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify=y)

import numba

@numba.jit(parallel=True, cache=True, fastmath=True)
def calc_distances(x_train, x):
    """Fill the symmetric matrix x with pairwise ppdeep comparison scores."""
    count = 0
    n = len(x_train)
    max_count = (n**2 - n) // 2
    for i in range(n):
        for j in range(i):
            # pp.compare returns a similarity score between 0 and 100
            x[i, j] = pp.compare(x_train[i], x_train[j])
            x[j, i] = x[i, j]
            count += 1
            print(f"\r{count}/{max_count}", end='')
        
n = len(x_train)
distances = np.zeros((n, n))
calc_distances(list(x_train), distances)
np.fill_diagonal(distances, 100.0)

db = DBSCAN(eps=0.5, min_samples=2, metric='precomputed').fit(distances)

labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

I have already tried many different combinations of the DBSCAN parameters (eps and min_samples), with the same result.
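
For comparison, here is a minimal sanity check I put together (the 4x4 matrix is made up by hand). If I understand the precomputed-metric API correctly, DBSCAN should find two clusters here with the same eps and min_samples as above:

import numpy as np
from sklearn.cluster import DBSCAN

# Hand-made "distance" matrix: points 0/1 are close together, and so are points 2/3
toy = np.array([
    [0.0, 0.1, 5.0, 5.0],
    [0.1, 0.0, 5.0, 5.0],
    [5.0, 5.0, 0.0, 0.2],
    [5.0, 5.0, 0.2, 0.0],
])

toy_labels = DBSCAN(eps=0.5, min_samples=2, metric='precomputed').fit(toy).labels_
print(toy_labels)  # I expect something like [0 0 1 1]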
