Wrong letters of ANOVA+tukey with multcompView Package and multcompLetters4 function

2.5k Views Asked by At

I am trying to visualize my statistical measures of ANOVA and post-hoc Tukey in a barplot. This worked out so far, but my letters are in the wrong order, the bar "mit Downsampling" (2) should be the one differnet to the others and not the first one "ohne Sampling. Example Visualisation

The code I am using:

library(ggplot2)
library(multcompView)
library(reshape2)
a.lda <- read.table(header = TRUE, text = "  rowname ohne_Sampling mit_Downsampling mit_Upsampling Gewichtung
                          1   Fold1     0.6732673        0.8390805      0.7192982  0.6732673
                          2   Fold2     0.7227723        0.8181818      0.7105263  0.7227723
                          3   Fold3     0.7100000        0.7586207      0.6842105  0.7100000
                          4   Fold4     0.6633663        0.8295455      0.7105263  0.6633663
                          5   Fold5     0.7128713        0.8750000      0.7017544  0.7128713")

#Transformation of the dataframe to get a format ggplot2 is able to use    
a.lda <- melt(a.lda, id.vars="rowname")
   
#data_summary Function
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE),
      minimum = min(x[[col]], na.rm=TRUE))
  }
  data_sum<-ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- rename(data_sum, c("mean" = varname))
  return(data_sum)
}

a.sd.lda <- data_summary(a.lda, varname = "value", groupnames = "variable")

#ANOVA+Tuckey
a.anova <- aov(data=a.lda, value ~ variable)
tukey <- TukeyHSD(a.anova)
cld <- as.data.frame.list((multcompLetters4(a.anova,tukey))$variable)
#The wrong letters do already appear here
a.sd.lda$cld <- cld$Letters

So by checking the a.sd.lda table one can already see the wrong letters as a,b,b,b instead of a,b,a,a. Also by checking the tukey results, there is NO significant difference between ohne Sampling, mit Upsampling and Gewichtung. So I guess the multcompLetters4() function is causing the misorder.

I would be so thankful for any suggestions!!!

Searching for an answer I found this stackoverflow entry (Wrong Tukey-letter ordering in R multcompView package) but none of the answers did solve my problem.

Just to round things up, this is the code for the visualisation, although the mistake in my code has to be above

#Visualization
ldaplot <- ggplot(a.sd.lda, aes(variable,value,fill=variable))+
      labs(title="LDA")+
      scale_x_discrete(guide = guide_axis(n.dodge=2))+
      coord_cartesian(ylim=c(y_min,1))+
      geom_bar(stat="identity", color="black", 
               position=position_dodge()) +
      scale_fill_brewer(palette="YlOrBr")+
      geom_text(data = a.sd.lda, aes(x = variable, y = value, label = cld), size = 5, vjust=-.5, hjust=-.7)+
      geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                    position=position_dodge(.9))+ 
      labs(x="", y="Accuracy")+
      geom_abline(aes(intercept=Akzeptanzwert,slope=0), color="red")

The source code of multcompLetter4() function and related ones is available here: https://rdrr.io/cran/multcompView/f/

2

There are 2 best solutions below

0
On

Hmm I see you have already solved your own problem, but here is my solution anyway:

a.lda <- tibble::tribble(
     ~Fold,               ~Var,      ~Val,
   "Fold1",    "ohne_Sampling", 0.6732673,
   "Fold2",    "ohne_Sampling", 0.7227723,
   "Fold3",    "ohne_Sampling",      0.71,
   "Fold4",    "ohne_Sampling", 0.6633663,
   "Fold5",    "ohne_Sampling", 0.7128713,
   "Fold1", "mit_Downsampling", 0.8390805,
   "Fold2", "mit_Downsampling", 0.8181818,
   "Fold3", "mit_Downsampling", 0.7586207,
   "Fold4", "mit_Downsampling", 0.8295455,
   "Fold5", "mit_Downsampling",     0.875,
   "Fold1",   "mit_Upsampling", 0.7192982,
   "Fold2",   "mit_Upsampling", 0.7105263,
   "Fold3",   "mit_Upsampling", 0.6842105,
   "Fold4",   "mit_Upsampling", 0.7105263,
   "Fold5",   "mit_Upsampling", 0.7017544,
   "Fold1",       "Gewichtung", 0.6732673,
   "Fold2",       "Gewichtung", 0.7227723,
   "Fold3",       "Gewichtung",      0.71,
   "Fold4",       "Gewichtung", 0.6633663,
   "Fold5",       "Gewichtung", 0.7128713
  )


library(tidyverse)
library(dlookr)
library(emmeans)
library(multcomp)
library(multcompView)
library(scales)

# data summary ------------------------------------------------------------
a.sd.lda <- a.lda %>%
  group_by(Var) %>%
  dlookr::describe(Val) %>%
  ungroup()

a.sd.lda
#> # A tibble: 4 x 27
#>   variable Var            n    na  mean     sd se_mean     IQR skewness kurtosis
#>   <chr>    <chr>      <int> <int> <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
#> 1 Val      Gewichtung     5     0 0.696 0.0264 0.0118  0.0396    -0.536    -2.63
#> 2 Val      mit_Downs~     5     0 0.824 0.0423 0.0189  0.0209    -0.798     1.79
#> 3 Val      mit_Upsam~     5     0 0.705 0.0133 0.00595 0.00877   -1.12      1.46
#> 4 Val      ohne_Samp~     5     0 0.696 0.0264 0.0118  0.0396    -0.536    -2.63
#> # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
#> #   p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
#> #   p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
#> #   p100 <dbl>


# compact letter display --------------------------------------------------
emmcld <- lm(Val ~ Var, data = a.lda) %>%
  emmeans(~ Var) %>%
  cld(Letters = letters, adjust = "Tukey")

I would argue that this is all the code needed for getting descriptive statistics and the compact letter display from the Tukey's test.

Note that for the plots I am only using the raw data a.lda and the result output emmcld:

reproducing your plot

ggplot() +
  # y-axis
  scale_y_continuous(
    name = "Accuracy",
    limits = c(NA, 1),
    breaks = pretty_breaks(),
    expand = expansion(mult = c(0, 0))
  ) +
  # x-axis
  scale_x_discrete(name = NULL) +
  # general layout
  theme_classic() +
  # colors
  scale_fill_brewer(palette = "YlOrBr") +
  # horizontal line
  geom_hline(yintercept = 0.9, color = "red") +
  # bars
  geom_bar(
    data = emmcld,
    aes(y = emmean, x = Var, fill = Var),
    color = "black",
    stat = "identity"
  ) +
  # errorbars
  geom_errorbar(data = emmcld,
                aes(
                  ymin = emmean - SE,
                  ymax = emmean + SE,
                  x = Var
                ),
                width = 0.1) +
  # letters
  geom_text(
    data = emmcld,
    aes(
      y = emmean + SE,
      x = Var,
      label = str_trim(.group)
    ),
    hjust = 0.5,
    vjust = -0.5
  ) +
  # caption
  labs(
    caption = str_wrap(
      "Bars with errorbars represent (estimated marginal) means ± standard error. Means not sharing any letter are significantly different by the Tukey-test at the 5% level of significance.",
      width = 90
    )
  )

alternative plot suggestion

..because some people don't like "dynamite plots" as nicely described in this blogpost

# optional: sort factor levels of groups column according to highest mean
# ...in means table
emmcld <- emmcld %>%
  mutate(Var = fct_reorder(Var, emmean))

# ...in data table
a.lda <- a.lda %>%
  mutate(Var = fct_relevel(Var, levels(emmcld$group)))

# base plot setup
ggplot() +
  # y-axis
  scale_y_continuous(limits = c(NA, 1),
                     name = "Accuracy",
                     breaks = pretty_breaks()) +
  # x-axis
  scale_x_discrete(name = NULL) +
  # general layout
  theme_classic() +
  # colors
  scale_fill_brewer(palette = "YlOrBr", guide = "none") +
  scale_color_brewer(palette = "YlOrBr", guide = "none") +
  # horizontal line
  geom_hline(yintercept = 0.9,
             color = "red",
             linetype = "dashed") +
  # black data points
  geom_point(
    data = a.lda,
    aes(y = Val, x = Var, fill = Var),
    shape = 21,
    alpha = 0.9,
    position = position_nudge(x = -0.2)
  ) +
  # black boxplot
  geom_boxplot(
    data = a.lda,
    aes(y = Val, x = Var, fill = Var),
    width = 0.05,
    outlier.shape = NA,
    position = position_nudge(x = -0.1)
  ) +
  # red mean value
  geom_point(data = emmcld,
             aes(y = emmean, x = Var),
             size = 2,
             color = "red") +
  # red mean errorbar
  geom_errorbar(
    data = emmcld,
    aes(ymin = lower.CL, ymax = upper.CL, x = Var),
    width = 0.05,
    color = "red"
  ) +
  # red letters
  geom_text(
    data = emmcld,
    aes(
      y = emmean,
      x = Var,
      label = str_trim(.group)
    ),
    position = position_nudge(x = 0.1),
    hjust = 0,
    color = "red"
  ) +
  # caption
  labs(
    caption = str_wrap(
      "Black dots represent raw data. Red dots and error bars represent (estimated marginal) means ± 95% confidence interval per group. Means not sharing any letter are significantly different by the Tukey-test at the 5% level of significance.",
      width = 90
    )
  )

Created on 2022-01-27 by the reprex package (v2.0.1)

0
On

Okay after trying A LOT I found a way to solve this problem by another package:

multcomp

So here is my code to get the right results and plot:

library(ggplot2)
library(multcomp)
library(reshape2)


a.lda <- read.table(header = TRUE, text = "  rowname ohne_Sampling mit_Downsampling mit_Upsampling Gewichtung
                          1   Fold1     0.6732673        0.8390805      0.7192982  0.6732673
                          2   Fold2     0.7227723        0.8181818      0.7105263  0.7227723
                          3   Fold3     0.7100000        0.7586207      0.6842105  0.7100000
                          4   Fold4     0.6633663        0.8295455      0.7105263  0.6633663
                          5   Fold5     0.7128713        0.8750000      0.7017544  0.7128713")

#Transformation of the dataframe to get a format ggplot2 is able to use    
a.lda <- melt(a.lda, id.vars="rowname")

#data_summary Function
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE),
      minimum = min(x[[col]], na.rm=TRUE))
  }
  data_sum<-ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- rename(data_sum, c("mean" = varname))
  return(data_sum)
}

a.sd.lda <- data_summary(a.lda, varname = "value", groupnames = "variable")


#ANOVA+Tuckey **NEW**
a.anova <- aov(data=a.lda, value ~ variable)
tukey <- glht(a.anova, linfct = mcp(variable = "Tukey"))
tuk.cld <- cld(tukey)
a.sd.lda$cld <- tuk.cld$mcletters$Letters

#Visualization
ldaplot <- ggplot(a.sd.lda, aes(variable,value,fill=variable))+
  labs(title="LDA")+
  scale_x_discrete(guide = guide_axis(n.dodge=2))+
  coord_cartesian(ylim=c(y_min,1))+
  geom_bar(stat="identity", color="black", 
           position=position_dodge()) +
  scale_fill_brewer(palette="YlOrBr")+
  geom_text(data = a.sd.lda, aes(x = variable, y = value, label = cld), size = 5, vjust=-.5, hjust=-.7)+
  geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                position=position_dodge(.9))+ 
  labs(x="", y="Accuracy")+
  geom_abline(aes(intercept=Akzeptanzwert,slope=0), color="red")

idea for this solution came from: https://stats.stackexchange.com/questions/31547/how-to-obtain-the-results-of-a-tukey-hsd-post-hoc-test-in-a-table-showing-groupe

Hopefully it helps other R-users as well :)