Adding statistical comparison to boxplot on R

50 Views Asked by At

enter image description hereI have data in a CSV file that looks like this (small sample of the data)

I am trying to create boxplots that include all the brain regions (TCV, cGMV..etc) on a multipanel graph that have boxplots. The boxplots are the regional volumes by control, 16pdeletion, and 16pduplication. I am then trying to include statistical analysis on the graphs as well. I have attached an image that looks similar for reference.

I am new to coding and with Google and AI I generated the following code in R:

The problem is the above code does not put any statistical comparison information on the graph. It does generate the boxplots. but I want ANOVA comparisons between all the groups displayed on the graph similar to the image I posted. Is this possible? I am really struggling with this. Thanks.

Data:

structure(list(Genotype = c("16pDeletion", "16pDeletion", "16pDuplication", "16pDeletion", "16pDeletion", "16pDeletion", "16pDeletion", "16pDuplication", "16pDuplication", "16pDuplication", "16pDuplication", "16pDuplication", "16pDeletion", "16pDeletion", "16pDuplication", "16pDuplication", "16pDeletion", "16pDeletion", "16pDuplication", "16pDuplication", "16pDeletion", "16pDeletion", "16pDeletion", "16pDuplication", "16pDuplication", "16pDuplication", "16pDeletion", "16pDeletion", "16pDuplication", "16pDuplication", "16pDuplication", "16pDuplication", "16pDeletion", "16pDeletion", "16pDuplication", "16pDuplication", "16pDuplication", "16pDuplication", "16pDuplication", "16pDuplication" ), TCV = c(1585123.4, 1613451.5, 1254852.6, 1672535.2, 1743582.2, 1491539.2, 1310182.8, 1376477, 1518732.2, 1614716, 1364261.8, 1319865.5, 1605215.9, 1604086.1, 1327282, 1332532.8, 1762358.8, 1708869, 1314523.5, 1412963.8, 1590316.5, 1778252, 1351071.8, 1355172.2, 1185749.8, 1135614.2, 1723856.5, 1558734.2, 1584290.8, 1187461.2, 1686037.1, 1340202.2, 1512908.8, 1730947.2, 1251666.4, 1218439.9, 1509944.5, 1333044.8, 1261850.2, 1430791), cGMV = c(634281.02, 667125.74, 523586.46, 681493.38, 718549.17, 533610.78, 529920.28, 567721.87, 620806.74, 635879.38, 568125.09, 487745.9, 604085.5, 632492.97, 562692.66, 517269.49, 625686.54, 679801.74, 544064.09, 591924.62, 603401, 735458.59, 538595.1, 546504.5, 485947.5, 463428.58, 607075.52, 590247.36, 657749.36, 486387.97, 647613.12, 511771.85, 623306.3, 712700.41, 491602.53, 463851.89, 548518.44, 490503.23, 467152.14, 539817.18), sGMV = c(54005.866, 56263.346, 45464.477, 55852.579, 59040.943, 50403.588, 52003.22, 48442.795, 52521.091, 48066.023, 50907.81, 46945.817, 51080.886, 55125.4, 47064.498, 44495.517, 61462.138, 61819.922, 47180.803, 50151.131, 56884.133, 59579.308, 47772.089, 48106.353, 44077.454, 43709.87, 60158.103, 56175.14, 59401.195, 45716.46, 60167.19, 43443.51, 55322.688, 60513.485, 43518.648, 43559.929, 48686.507, 54186.351, 45426.282, 49442.544), WMV = c(477094.94, 501540.9, 371206.73, 544797.84, 556696.6, 465324.94, 400483.71, 403904.21, 460463.89, 451927.33, 382444.42, 380737.16, 475837.9, 506064.12, 396907.67, 391020.39, 572274.54, 563103.9, 399182.65, 435212, 517541.52, 564022.58, 409654.84, 396609.52, 347824.5, 330440.4, 566146, 515662.5, 483029.31, 341286.35, 525666.81, 367518.22, 483672.23, 567484.56, 394566.67, 384788.83, 453152, 410909.78, 395765.05, 452255.66), Ventricles = c(20477.715, 10007.924, 17254.119, 15667.753, 8610.41, 18057.775, 7195.883, 14113.144, 16395.578, 63316.462, 9978.456, 25648.362, 47716.005, 9725.05, 13371.041, 25302.91, 31296.017, 14596.339, 7510.771, 18296.417, 10193.693, 11052.107, 9063.024, 19868.096, 15642.802, 5676.129, 24458.093, 9994.988, 11558.973, 5600.151, 33728.745, 44752.145, 9407.658, 12884.087, 9742.305, 9158.799, 22873.491, 25129.613, 10728.438, 12664.467)), row.names = c(NA, 40L), class = "data.frame")

#Open Libraries 
library(ggplot2)
library(dplyr)
library(tidyr)
library(car)
library(ggpubr)

# Read in the data
data <- read.csv("SynthSegQC.csv")

# Manually specify the columns for the regions of interest
regions_of_interest <- c("TCV", "cGMV", "sGMV", "WMV", "Ventricles" ) 

# Reshape the data to long format
data_long <- pivot_longer(data, cols = all_of(regions_of_interest), names_to = "Region", values_to = "Volume")

# Function to perform ANOVA and return p-values
get_p_value <- function(data, region) {
  model <- aov(Volume ~ Genotype, data = data[data$Region == region,])
  p_value <- summary(model)[[1]]$"Pr(>F)"[1]
  return(p_value)
}

# Add a column with p-values for each region
data_long <- data_long %>%
  group_by(Region) %>%
  mutate(P_Value = get_p_value(data_long, Region))

# Function to format p-values for display
format_p_value <- function(p) {
  ifelse(p < 0.001, "<0.001", sprintf("%.3f", p))
}

# Ensure that the p-values are calculated outside of ggplot2 aesthetics
data_long$Formatted_P_Values <- format_p_value(data_long$P_Value)

# Plotting with facets
ggplot(data_long, aes(x = Genotype, y = Volume, fill = Genotype)) +
  geom_boxplot() +
  geom_jitter(aes(color = Genotype), width = 0.2, size = 1, alpha = 0.5) +
  facet_wrap(~Region, scales = "free") +
  theme_minimal() +
  labs(title = "Brain Region Volumes by Genotype", x = "Genotype", y = "Volume") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = Formatted_P_Values, y = Inf), vjust = -0.5, size = 2.5)
0

There are 0 best solutions below