How can I create a stacked boxplot in R that shows all data points and connects the same subject with a line?

43 Views Asked by At

Hi, I am trying to create a stacked boxplot where the x-axis shows the two Arms (A and B), the y-axis shows the median_score, and the fill is the time. On top of that, I would like to show each subject as a point in its corresponding boxplot, with a unique color per subject, and use a line to connect the points of the same subject within each Arm.

However, I am not able to give each patient's data points a unique color, and I am not able to connect the points of the same subject with a line.

Here's a subset of my data frame:

# Build the example data frame: six subjects (01-03 in Arm A, 04-06 in Arm B),
# each with one median_score per time point (C1D1, surgery, 30days_postop).
median_score_df <- data.frame(
  subject = c(
    rep(c("01", "02", "03"), times = 3),
    rep(c("04", "05", "06"), times = 3)
  ),
  median_score = c(
    # Arm A: C1D1, surgery, 30days_postop (three subjects each)
    -0.0310019210668711, 0.0444211182227502, 0.200311923393836,
    0.221255710308368, 0.171509500650259, 0.196983984433688,
    0.236694770226112, 0.206757913002601, 0.211477853334867,
    # Arm B: C1D1, surgery, 30days_postop (three subjects each)
    0.284073710074792, 0.202858539728018, 0.147802463678213,
    0.238249561603253, 0.423188711313101, 0.123448,
    -0.0310019210668711, 0.0444211182227502, 0.200311923393836
  ),
  Arm = rep(c("A", "B"), each = 9),
  time = rep(
    rep(c("C1D1", "surgery", "30days_postop"), each = 3),
    times = 2
  )
)


# This is the code I tried:
# Boxes are filled (and therefore dodged) by time within each Arm; the points
# are dodged with a fixed width of 0.75. Nothing here groups or colours by
# subject, and there is no line layer yet.
ggplot(
  data = median_score_df,
  mapping = aes(x = Arm, y = median_score, fill = time)
) +
  # outlier.shape = NA: do not draw the boxplot's own outlier markers, since
  # every observation is drawn by geom_point() below
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_dodge(width = 0.75)) +
  labs(x = "Arm", y = "score_to_test")

[This is what I have so far: I was able to draw each patient as a dot, but I was not able to get lines connecting the data points]

enter image description here. I am hoping to get something similar to this image, but have failed many times so far (https://i.stack.imgur.com/ND6Bq.png)

2

There are 2 best solutions below

0
Edward On

One option is to ignore the dodging and facet on Arm and then group on subject.

# Facet on Arm instead of dodging, so every time point gets its own x position
# and each subject's observations can be joined by a line.
#
# `time` is stored as character, and a discrete axis orders character values
# alphabetically ("30days_postop", "C1D1", "surgery"). The subject lines would
# then join the visits out of chronological order, so `time` is converted to a
# factor with explicit levels for this plot only (the data frame itself is not
# modified).
ggplot(
  transform(
    median_score_df,
    time = factor(time, levels = c("C1D1", "surgery", "30days_postop"))
  ),
  aes(x = time, y = median_score, fill = time)
) +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(~Arm, labeller = label_both) +
  # group = subject draws one line per subject; the same colour mapping is used
  # for the points so a subject's dots and line match
  geom_line(
    aes(group = subject, colour = subject),
    linetype = 2, show.legend = FALSE
  ) +
  geom_point(aes(colour = subject), show.legend = FALSE) +
  labs(x = "", y = "score_to_test") +
  theme_bw() +
  # x tick labels are dropped; the fill legend identifies the time points
  theme(axis.text.x = element_blank())

enter image description here

0
jay.sf On

Using points and arrows after boxplot.

> ## one box per Arm, with the raw observations overlaid as dots
> boxplot(median_score ~ Arm, data = median_score_df, col = 2:3)
> with(median_score_df, {
+   points(x = as.factor(Arm), y = median_score, pch = 20)
+   ## code = 0 draws plain segments (no arrow heads) from x = 1 to x = 2
+   ## NOTE(review): the k-th Arm A row is paired with the k-th Arm B row, so a
+   ## segment does not join observations of the same subject
+   arrows(x0 = 1, y0 = median_score[Arm == "A"],
+          x1 = 2, y1 = median_score[Arm == "B"], code = 0)
+ })

enter image description here

Edit

We can also include an interaction with the time variable.

> ## chronological time order (a character column would sort alphabetically)
> dat <- transform(median_score_df,
+                  time = factor(time, levels = c("C1D1", "surgery", "30days_postop")))
> ## x position of every observation: boxplot(y ~ time + Arm) lays its boxes out
> ## in the level order of interaction(time, Arm), i.e. time varies fastest, so
> ## the same interaction is used here to put each dot on its own box
> dat$pos <- as.integer(interaction(dat$time, dat$Arm))
> ## one colour index per subject
> dat$subj_col <- as.integer(factor(dat$subject))
> 
> ## plot w/ points
> boxplot(median_score ~ time + Arm, dat, col = 2:4)
> points(dat$pos, dat$median_score, pch = 20, col = dat$subj_col)
> ## lines: join each subject's own observations, left to right in time order
> for (s in split(dat, dat$subject)) {
+   s <- s[order(s$pos), ]
+   lines(s$pos, s$median_score, col = s$subj_col[1L])
+ }

enter image description here

Data:

> dput(median_score_df)  # prints the data frame as a structure() call that can be pasted back into R
structure(list(subject = c("01", "02", "03", "01", "02", "03", 
"01", "02", "03", "04", "05", "06", "04", "05", "06", "04", "05", 
"06"), median_score = c(-0.0310019210668711, 0.0444211182227502, 
0.200311923393836, 0.221255710308368, 0.171509500650259, 0.196983984433688, 
0.236694770226112, 0.206757913002601, 0.211477853334867, 0.284073710074792, 
0.202858539728018, 0.147802463678213, 0.238249561603253, 0.423188711313101, 
0.123448, -0.0310019210668711, 0.0444211182227502, 0.200311923393836
), Arm = c("A", "A", "A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B", "B", "B", "B"), time = c("C1D1", "C1D1", 
"C1D1", "surgery", "surgery", "surgery", "30days_postop", "30days_postop", 
"30days_postop", "C1D1", "C1D1", "C1D1", "surgery", "surgery", 
"surgery", "30days_postop", "30days_postop", "30days_postop")), class = "data.frame", row.names = c(NA, 
-18L))