Clustering around fixed vector of values

184 Views Asked by At

I have a dataset of brands with different features like calories, sugar content, fiber content, etc. for eg Example

Using dput():

structure(list(Row = 1:30, Brands = structure(c(1L, 112L, 223L,  242L,
253L, 264L, 275L, 286L, 297L, 2L, 13L, 24L, 35L, 46L, 57L,  68L, 79L,
90L, 101L, 113L, 124L, 135L, 146L, 157L, 168L, 179L,  190L, 201L,
212L, 224L), .Label = c("Brand 1", "Brand 10", "Brand 100",  "Brand
101", "Brand 102", "Brand 103", "Brand 104", "Brand 105",  "Brand
106", "Brand 107", "Brand 108", "Brand 109", "Brand 11",  "Brand 110",
"Brand 111", "Brand 112", "Brand 113", "Brand 114",  "Brand 115",
"Brand 116", "Brand 117", "Brand 118", "Brand 119",  "Brand 12",
"Brand 120", "Brand 121", "Brand 122", "Brand 123",  "Brand 124",
"Brand 125", "Brand 126", "Brand 127", "Brand 128",  "Brand 129",
"Brand 13", "Brand 130", "Brand 131", "Brand 132",  "Brand 133",
"Brand 134", "Brand 135", "Brand 136", "Brand 137",  "Brand 138",
"Brand 139", "Brand 14", "Brand 140", "Brand 141",  "Brand 142",
"Brand 143", "Brand 144", "Brand 145", "Brand 146",  "Brand 147",
"Brand 148", "Brand 149", "Brand 15", "Brand 150",  "Brand 151",
"Brand 152", "Brand 153", "Brand 154", "Brand 155",  "Brand 156",
"Brand 157", "Brand 158", "Brand 159", "Brand 16",  "Brand 160",
"Brand 161", "Brand 162", "Brand 163", "Brand 164",  "Brand 165",
"Brand 166", "Brand 167", "Brand 168", "Brand 169",  "Brand 17",
"Brand 170", "Brand 171", "Brand 172", "Brand 173",  "Brand 174",
"Brand 175", "Brand 176", "Brand 177", "Brand 178",  "Brand 179",
"Brand 18", "Brand 180", "Brand 181", "Brand 182",  "Brand 183",
"Brand 184", "Brand 185", "Brand 186", "Brand 187",  "Brand 188",
"Brand 189", "Brand 19", "Brand 190", "Brand 191",  "Brand 192",
"Brand 193", "Brand 194", "Brand 195", "Brand 196",  "Brand 197",
"Brand 198", "Brand 199", "Brand 2", "Brand 20",  "Brand 200", "Brand
201", "Brand 202", "Brand 203", "Brand 204",  "Brand 205", "Brand
206", "Brand 207", "Brand 208", "Brand 209",  "Brand 21", "Brand 210",
"Brand 211", "Brand 212", "Brand 213",  "Brand 214", "Brand 215",
"Brand 216", "Brand 217", "Brand 218",  "Brand 219", "Brand 22",
"Brand 220", "Brand 221", "Brand 222",  "Brand 223", "Brand 224",
"Brand 225", "Brand 226", "Brand 227",  "Brand 228", "Brand 229",
"Brand 23", "Brand 230", "Brand 231",  "Brand 232", "Brand 233",
"Brand 234", "Brand 235", "Brand 236",  "Brand 237", "Brand 238",
"Brand 239", "Brand 24", "Brand 240",  "Brand 241", "Brand 242",
"Brand 243", "Brand 244", "Brand 245",  "Brand 246", "Brand 247",
"Brand 248", "Brand 249", "Brand 25",  "Brand 250", "Brand 251",
"Brand 252", "Brand 253", "Brand 254",  "Brand 255", "Brand 256",
"Brand 257", "Brand 258", "Brand 259",  "Brand 26", "Brand 260",
"Brand 261", "Brand 262", "Brand 263",  "Brand 264", "Brand 265",
"Brand 266", "Brand 267", "Brand 268",  "Brand 269", "Brand 27",
"Brand 270", "Brand 271", "Brand 272",  "Brand 273", "Brand 274",
"Brand 275", "Brand 276", "Brand 277",  "Brand 278", "Brand 279",
"Brand 28", "Brand 280", "Brand 281",  "Brand 282", "Brand 283",
"Brand 284", "Brand 285", "Brand 286",  "Brand 287", "Brand 288",
"Brand 289", "Brand 29", "Brand 290",  "Brand 291", "Brand 292",
"Brand 293", "Brand 294", "Brand 295",  "Brand 296", "Brand 297",
"Brand 298", "Brand 299", "Brand 3",  "Brand 30", "Brand 300", "Brand
301", "Brand 302", "Brand 303",  "Brand 304", "Brand 305", "Brand
306", "Brand 307", "Brand 31",  "Brand 32", "Brand 33", "Brand 34",
"Brand 35", "Brand 36", "Brand 37",  "Brand 38", "Brand 39", "Brand
4", "Brand 40", "Brand 41", "Brand 42",  "Brand 43", "Brand 44",
"Brand 45", "Brand 46", "Brand 47", "Brand 48",  "Brand 49", "Brand
5", "Brand 50", "Brand 51", "Brand 52", "Brand 53",  "Brand 54",
"Brand 55", "Brand 56", "Brand 57", "Brand 58", "Brand 59",  "Brand
6", "Brand 60", "Brand 61", "Brand 62", "Brand 63", "Brand 64", 
"Brand 65", "Brand 66", "Brand 67", "Brand 68", "Brand 69", "Brand 7",
"Brand 70", "Brand 71", "Brand 72", "Brand 73", "Brand 74", "Brand
75",  "Brand 76", "Brand 77", "Brand 78", "Brand 79", "Brand 8",
"Brand 80",  "Brand 81", "Brand 82", "Brand 83", "Brand 84", "Brand
85", "Brand 86",  "Brand 87", "Brand 88", "Brand 89", "Brand 9",
"Brand 90", "Brand 91",  "Brand 92", "Brand 93", "Brand 94", "Brand
95", "Brand 96", "Brand 97",  "Brand 98", "Brand 99"), class =
"factor"), Fiber = c(82L, 36L,  51L, 86L, 26L, 98L, 91L, 28L, 1L, 88L,
35L, 84L, 27L, 58L, 9L,  43L, 49L, 56L, 66L, 43L, 62L, 73L, 20L, 33L,
17L, 88L, 57L, 45L,  89L, 16L), Sugar = c(77L, 87L, 40L, 69L, 9L, 1L,
54L, 64L, 24L,  52L, 29L, 14L, 76L, 24L, 39L, 54L, 18L, 72L, 54L, 9L,
45L, 65L,  43L, 90L, 40L, 93L, 75L, 50L, 1L, 44L), Calories = c(94L,
14L,  36L, 34L, 40L, 91L, 58L, 82L, 91L, 19L, 60L, 79L, 44L, 60L, 80L,
27L, 17L, 5L, 10L, 89L, 63L, 43L, 29L, 99L, 92L, 19L, 9L, 38L,  43L,
9L), Feature.4 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,  0L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,  0L, 1L, 0L,
1L), Feature.5 = c(1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,  1L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L,  1L, 0L, 1L, 1L, 1L,
0L), Feature.6 = c(7L, 11L, 45L, 45L, 35L,  28L, 56L, 52L, 1L, 49L,
28L, 68L, 99L, 70L, 62L, 73L, 97L, 2L,  41L, 14L, 68L, 84L, 76L, 2L,
53L, 38L, 3L, 52L, 12L, 70L), Feature.7 = c(54L,  22L, 11L, 67L, 22L,
67L, 69L, 67L, 89L, 24L, 32L, 25L, 90L, 62L,  82L, 100L, 53L, 50L,
75L, 79L, 53L, 4L, 31L, 96L, 55L, 35L, 69L,  74L, 88L, 9L)), row.names
= c(NA, 30L), class = "data.frame")

I want to pick 5 brands among them, say Brands 1, 2, 3, 4 and 5, and then form clusters (groups) of the brands that are similar in features to each of these 5 brands, keeping all the remaining brands that are not similar to any of them as a separate cluster. So, I will have 1 cluster for Brand 1, 1 for Brand 2, 1 for Brand 3, and similarly for Brands 4 and 5. And then there will be 1 cluster of those brands which are not similar to any of these 5 brands. The features may be dummy (0/1) or continuous.

I think this should be easy; however, I couldn't find any package for this in R.

1

There are 1 best solutions below

0
On

Here is a simple example using the data you included which I am calling dta. First we compute z-scores for the values except for the dichotomies:

library(fields)
# Work on a copy so the raw data in `dta` stays untouched.
dta.zscores <- dta
# Standardise only the continuous columns: 3, 4, 5 (Fiber, Sugar, Calories)
# and 8, 9 (Feature.6, Feature.7). Columns 6 and 7 (Feature.4, Feature.5)
# are 0/1 dichotomies and are deliberately left on their original scale.
dta.zscores[, c(3:5, 8:9)] <- scale(
  dta[, c(3:5, 8:9)],
  center = TRUE,
  scale = TRUE
)

Now dta.zscores contains the z-scores of the original data so that each variable will be weighted equally. Next we compute the distances from each row to rows 1 - 5 (Brands 1 through 5) using columns 3 through 9:

# Distances between the five reference rows (rows 1:5) and every row of the
# data, computed over the feature columns 3:9. The result has one row per
# reference brand and one column per observation.
dta.dist <- rdist(
  dta.zscores[1:5, 3:9],
  dta.zscores[, 3:9]
)
# For every observation (column): the distance to its closest reference
# brand, and the row index (1-5) of that closest reference brand.
dta.mindist <- apply(dta.dist, MARGIN = 2, FUN = min)
dta.brand <- apply(dta.dist, MARGIN = 2, FUN = which.min)
# Spread of the nearest-brand distances, leaving out the five reference rows
# themselves.
quantile(dta.mindist[-seq_len(5)])
#       0%      25%      50%      75%     100% 
# 1.131532 1.952891 2.383079 2.908602 3.475676 
# Number of observations whose nearest reference brand is 1, 2, ..., 5.
table(dta.brand)
# dta.brand
#  1  2  3  4  5 
#  4  2  7 11  6 

The matrix dta.dist has 5 rows (the distance to each of the 5 reference brands) and 30 columns (one per observation). The vector dta.mindist is the minimum distance for each observation; it is 0 for the first 5 observations, because each of them is one of the reference brands. The vector dta.brand indicates which of the 5 brands is the nearest for that observation. The quantile() function shows the range of distances to the nearest brand after excluding the first 5 brands, which were used to define the groups. Finally, the table shows how many observations are assigned to each brand.

You would still need to decide how far is too far to assign an observation to a brand, and move those observations to another cluster, e.g. cluster 6. Based on the quantiles, 25% of the distances are greater than about 2.9 (the 75% quantile). You can request other quantiles using the probs= argument in quantile(), e.g. .90 or .95, depending on how many observations you want to be in cluster 6. Once you have chosen a cutoff, the reassignment is a single line, e.g. dta.brand[dta.mindist > cutoff] <- 6.