I have a dataset and want to draw a PCA plot. In this plot, observations that share the same value in the `name` column should be drawn in the same colour (`habillage = a$name`). Additionally, I want each individual observation to show which `Age` group it belongs to. I found that `label = "none"` does not show it, but if I write `label = a$Age`, nothing changes. Finally, how can I avoid showing the black/white text entry in the legend, which duplicates `habillage = a$name`?
# Attach plyr before dplyr so that dplyr's verbs (summarise, arrange, ...)
# mask plyr's same-named functions rather than the other way round.
library(plyr)
library(dplyr)
library(factoextra)
# Example data for the question: what appears to be dput() output of a
# 40-row grouped tibble (class "grouped_df"). Its grouping metadata lists
# 7 groups keyed on effective_status and Age.
# Columns: effective_status, Age, name (factors), n_obs, Clicks, Impressions,
# Reach, Spend, Purchase, PurchaseValue, Date_minus_start_time.
# NOTE(review): the script below reads `helmes` and `a`, never `df`; this
# object has the same columns the aggregation pipeline produces, so it is
# presumably the dput() of `a` -- confirm.
df<-structure(list(effective_status = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("ACTIVE", "PAUSED"), class = "factor"),
Age = structure(c(3L, 8L, 6L, 4L, 4L, 5L, 4L, 2L, 4L, 8L,
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L
), .Label = c("13-17", "18-24", "25-34", "35-44", "45-54",
"55-64", "65+", "Unknown"), class = "factor"), name = structure(c(19L,
23L, 18L, 22L, 9L, 6L, 6L, 9L, 15L, 14L, 12L, 14L, 12L, 13L,
15L, 10L, 11L, 20L, 9L, 13L, 19L, 6L, 9L, 10L, 13L, 14L,
19L, 20L, 21L, 22L, 6L, 10L, 11L, 13L, 14L, 18L, 23L, 12L,
21L, 22L), .Label = c("Automated Boost", "Competitors January",
"Dynamic Ad", "Focus campaign", "Marketing 0-25", "Marketing April",
"Marketing August", "Marketing December", "Marketing February",
"Marketing January", "Marketing July", "Marketing June",
"Marketing March", "Marketing May", "Upsell April", "Upsell August",
"Upsell Boost", "Upsell February", "Upsell January", "Upsell July",
"Upsell June", "Upsell March", "Upsell May"), class = "factor"),
n_obs = c(1L, 1L, 1L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), Clicks = c(1364L,
0L, 4919L, 2597L, 2641L, 0L, 915L, 1104L, 63L, 0L, 242L,
206L, 3661L, 11L, 33L, 0L, 246L, 247L, 4L, 0L, 0L, 0L, 0L,0L, 0L, 2009L, 0L, 43L, 166L, 0L, 17L, 0L, 95L, 0L, 137L,
0L, 68L, 0L, 0L, 0L), Impressions = c(12409L, 0L, 58222L,
30115L, 47119L, 0L, 18817L, 17068L, 4175L, 0L, 4528L, 9842L,
98421L, 3L, 6042L, 0L, 7154L, 4253L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 17117L, 0L, 857L, 1821L, 0L, 1034L, 0L, 1258L,
0L, 948L, 0L, 2972L, 0L, 0L, 0L), Reach = c(12164L, 0L, 46142L,
25282L, 35142L, 0L, 14843L, 13533L, 3624L, 0L, 4528L, 8394L,
58401L, 3L, 5874L, 0L, 7013L, 3586L, 202L, 0L, 0L, 0L, 0L,
150L, 0L, 15349L, 0L, 819L, 1810L, 0L, 1014L, 0L, 938L, 0L,
948L, 0L, 2782L, 0L, 0L, 0L), Spend = c(1153.11, 0, 9663.16,
3202.1, 3393.49, 0, 1739.37, 1344.19, 501.88, 0, 299.22,
565.74, 11228.5, 0.15, 609.05, 0, 709.19, 478.98, 26.12,
0, 0, 0, 0, 22.25, 0, 2485.04, 0, 232.14, 256.1, 0, 129.6,
0, 157.25, 0, 122.62, 0, 717.32, 0, 0, 0.05), Purchase = c(140L,
163L, 104L, 33L, 22L, 17L, 11L, 13L, 2L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), PurchaseValue = c(221595.22,
173029.62, 101894.91, 38974.63, 27336.71, 13247.8, 12461.66,
6186.55, 3754.31, 971.11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Date_minus_start_time = c(9, 13, 15, 26.3055555555556, 29,
5.5, 5.5, 19, 17, 16.5, 2, 27, 10, 0, 29, 26.5, 13, 15, 19,
43.9583333333333, 30, 5, 28, 8, 29.9583333333333, 21, 19,
3, 9, 17.5, 28, 10, 14, 30.4791666666667, 0, 11, 15, 18,
21, 5)), row.names = c(NA, -40L), groups = structure(list(
effective_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("ACTIVE", "PAUSED"), class = "factor"), Age = structure(2:8, .Label = c("13-17",
"18-24", "25-34", "35-44", "45-54", "55-64", "65+", "Unknown"
), class = "factor"), .rows = structure(list(c(8L, 11L, 12L
), c(1L, 13L, 14L, 15L), c(4L, 5L, 7L, 9L, 16L, 17L, 18L),
c(6L, 19L, 20L, 21L), c(3L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L), 31:37, c(2L, 10L, 38L, 39L, 40L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
# Build the PCA input: draw 100 random rows from the raw data, drop three
# campaigns by name, then aggregate to one row per
# (effective_status, Age, name), sorted by descending PurchaseValue.
#
# NOTE(review): `helmes` is not defined in the visible source. The `df`
# literal earlier in the file has exactly the columns this pipeline produces,
# so it is presumably the dput() of `a` -- confirm, and use that object when
# the raw data is not available.
# NOTE(review): the sample is unseeded (not reproducible), and sample() errors
# when `helmes` has fewer than 100 rows.
a <- helmes[sample(nrow(helmes), 100), ] %>%
  filter(!(name %in% c("Upsell Boost", "Marketing 0-25", "Dynamic Ad"))) %>%
  group_by(effective_status, Age, name) %>%
  summarise(
    n_obs = n(),
    Clicks = sum(Clicks, na.rm = TRUE),
    Impressions = sum(Impressions, na.rm = TRUE),
    Reach = sum(Reach, na.rm = TRUE),
    Spend = sum(Spend, na.rm = TRUE),
    Purchase = sum(Purchase, na.rm = TRUE),
    PurchaseValue = sum(PurchaseValue, na.rm = TRUE),
    Date_minus_start_time = mean(Date_minus_start_time, na.rm = TRUE),
    # Return an ungrouped tibble; otherwise the result stays grouped by
    # effective_status and Age and every later verb silently runs per group.
    .groups = "drop"
  ) %>%
  arrange(desc(PurchaseValue))
# PCA on the summary columns only: columns 1-3 of `a` are the grouping
# variables (effective_status, Age, name), so column 4 onwards is used.
# The argument is spelled `scale.` in full: that is prcomp()'s real parameter
# name, and `scale = TRUE` only worked through partial argument matching.
res.pca <- prcomp(a[4:ncol(a)], scale. = TRUE)
# Plot the individuals of the PCA, coloured by campaign name, with a
# confidence ellipse per group and no text labels.
# Alternatives the author tried and left disabled: `col.ind = a$name`,
# `geom = c("point", "text")`, and a fixed three-colour palette
# (`palette = c("#00AFBB", "#FC4E07", "#2CA25F")`).
fviz_pca_ind(
  res.pca,
  habillage = a$name,          # grouping factor used for the colours
  legend.title = "Groups",
  addEllipses = TRUE,          # draw one ellipse per group
  ellipse.type = "confidence",
  label = "none",              # suppress text labels
  repel = TRUE                 # presumably only matters once labels are drawn -- confirm
)
You can extract the computed PCA scores (stored in `res.pca$x`) and then build your own plot with ggplot2:
Created on 2021-09-17 by the reprex package (v2.0.1)