Why is the SSE increasing at a specific number of clusters in Python? (k-means clustering)

303 Views Asked by At

I'm using the same dataset, with a distinct label array, to calculate the SSE in the code below:

import sklearn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np
import os
import pandas as pd
import xlrd
import pickle
import csv
from numpy import savetxt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Total number of clusters in the exported K-Means result being analysed.
n_cluster = 8

cluster_tables = []   # one 2-D array of values per cluster (one column per file read)
cluster_labels = []   # one 1-D label vector per cluster, aligned with cluster_tables

for cluster_idx in range(n_cluster):

    # Read the files numbered 2..6 for this cluster; each becomes one column.
    band_columns = []
    for band in range(2, 7):
        workbook = pd.ExcelFile("C:/Users/guilh/Desktop/SENSORIAMENTO/SENTINEL_2/{}/Cluster_{}/MEDIANA_BANDA_CLUSTER{}{}.xlsx".format(n_cluster, cluster_idx, band, cluster_idx))
        sheet = pd.read_excel(workbook, 'Sheet1', keep_default_na=False)

        # Flatten the sheet into a single sequence and keep only truthy cells
        # (this discards the empty strings produced by keep_default_na=False).
        cells = sheet.to_numpy().flatten(order="A")
        kept = [cell for cell in cells if cell]

        # Swap decimal commas for decimal points so the text can be cast to float later.
        band_columns.append(np.char.replace(kept, ',', '.'))

    # Stack the columns side by side: rows are samples, columns are the files read.
    cluster_tables.append(np.stack(band_columns, axis=-1))

    # Build an array holding the (1-based) number of this cluster, one entry per
    # sample; the length is taken from the last file read for the cluster.
    cluster_labels.append(np.full(len(kept), cluster_idx + 1, dtype=np.int64))

acabou_valor = np.concatenate(cluster_tables)
acabou_label = np.concatenate(cluster_labels)

# Computing SSE

# Number of columns in the value table (5 with the loader above); derived from
# the data instead of being hard-coded.
n_bands = acabou_valor.shape[1]

# One float vector per column. The builtin ``float`` is used because the
# ``np.float`` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24,
# where ``astype(np.float)`` raises AttributeError.
BANDA = [acabou_valor[:, j].astype(float) for j in range(n_bands)]

# SSE for each cluster in each band, stored flat in cluster-major order:
# soma[i * n_bands + j] is the sum of squared deviations from the mean for
# cluster i + 1 in column j.
soma = []
for i in range(n_cluster):
    # Labels are 1-based; the mask is the same for every column of this cluster.
    members = acabou_label == i + 1
    for j in range(n_bands):
        values = BANDA[j][members]
        deviations = values - values.mean()
        soma.append(np.square(deviations).sum())

# `soma` holds the per-band SSE terms in cluster-major order, so each cluster
# owns a run of `n_bands` consecutive entries.
n_bands = len(soma) // n_cluster

# SSE of each cluster: the sum of that cluster's per-band terms. Every entry is
# kept as a one-element list so the printed output has the same shape as the
# former cluster1..cluster8 variables. Driving this from n_cluster avoids the
# IndexError (fewer than 8 clusters) or silently dropped clusters (more than 8)
# that the hand-unrolled version produced.
cluster_sse = [
    [sum(soma[k * n_bands:(k + 1) * n_bands])]
    for k in range(n_cluster)
]

# Total SSE of the clustering: sum of the per-cluster SSE values, in cluster order.
SSE_KMEANS = [sum(entry[0] for entry in cluster_sse)]

print(SSE_KMEANS)
for entry in cluster_sse:
    print(entry)

Everything was fine and the SSE was decreasing as K goes up (as expected) until I tried to calculate it with K = 8, and it increased by nearly 50 relative to K = 7. I checked the code quite a few times to see if there's anything being calculated wrong but couldn't find anything. Could someone help me?

The first part of the code creates a dataset from rasters that were converted to .xlsx. I was even able to calculate the silhouette index for each cluster with it, so I think everything is fine there. The second part computes the SSE.

The k-means clustering was already done in GEE; I'm just working with the exported data in Python.

I'm working on clustering Sentinel-2 images, using Spyder through Anaconda with Python 3.8.3.

0

There are 0 best solutions below