betareg on skewed proportions leads to a high predicted mean


I'm running a beta regression on proportions that are highly skewed (median ~2%, mean ~13%). However, the mean predicted value coming out of the model is typically about twice the sample mean, and even the minimum predicted value is larger than the sample mean. I've tried every combination of the link and link.phi functions and the result stays the same (a sketch of that sweep appears after the summary output below).

Any suggestions on how I can get a better match between the predicted and sample distributions? Thanks!

library(betareg)

data <- structure(list(y = c(0.09053397, 0.02396534, 0.0618883, 1.9e-07, 
                             0.07121946, 1.9e-07, 1.9e-07, 0.02395879, 0.01905196, 0.06781774, 
                             1.9e-07, 1.9e-07, 0.99999981, 1.9e-07, 1.9e-07, 0.19839486, 1.9e-07, 
                             0.05087847, 1.9e-07, 0.00257154, 0.08083247, 0.02506733, 1.9e-07, 
                             1.9e-07, 0.19138035, 0.11433343, 0.02341815, 0.99999981, 0.01890199, 
                             1.9e-07, 1.9e-07, 0.09462199, 1.9e-07, 0.62029209, 0.01332234, 
                             1.9e-07, 1.9e-07, 0.00053404, 1.9e-07, 0.03991713, 0.05449004, 
                             0.39588225, 1.9e-07, 1.9e-07, 0.21705814, 0.01063151, 0.08353221, 
                             0.00856108, 0.11422871, 0.99999981, 0.04512786, 0.12043445, 1.9e-07, 
                             0.01948717, 0.14161796, 0.00139597, 1.9e-07, 0.12162885, 0.79714543, 
                             0.24379762, 1.9e-07, 0.64810177, 0.21043641, 0.12024897, 1.9e-07, 
                             0.51773257, 0.99999981, 1.9e-07, 0.33720735, 1.9e-07, 0.03963053, 
                             0.00405092, 0.04025961, 0.16409692, 1.9e-07, 0.64428465, 1.9e-07, 
                             1.9e-07, 0.07384747, 1.9e-07, 0.01558129, 0.0366185, 0.03218959, 
                             0.01835648, 0.13076873, 1.9e-07, 0.03025186, 0.0914969, 0.00021478, 
                             1.9e-07, 1.9e-07, 1.9e-07, 0.02444584, 0.02189736, 0.097028, 
                             0.01347678, 1.9e-07, 0.63884538, 0.09762437, 0.5166029), 
                       x = c(0.07825506, 0.08064626, 0.08514504, 1.9e-07, 1.9e-07, 1.9e-07, 1.9e-07, 0.00990069, 
                              1.9e-07, 0.08254355, 0.87667832, 1.9e-07, 0.64975944, 1.9e-07, 
                              1.9e-07, 0.22367289, 1.9e-07, 0.04823323, 1.9e-07, 0.43635115, 
                              0.08165183, 0.02220256, 1.9e-07, 1.9e-07, 1.9e-07, 0.01418114, 
                              0.01097939, 1.9e-07, 1.9e-07, 0.03326785, 1.9e-07, 0.06957586, 
                              1.9e-07, 0.25541391, 0.32377859, 0.23855679, 0.02837065, 0.06388796, 
                              1.9e-07, 1.9e-07, 0.05840598, 0.01999456, 1.9e-07, 1.9e-07, 0.40921674, 
                              1.9e-07, 0.002533, 0.04552465, 0.1046386, 0.75826368, 0.40296593, 
                              1.9e-07, 0.02852656, 0.05545389, 1.9e-07, 0.00030394, 0.01199183, 
                              0.21549378, 0.05438194, 0.03442961, 0.15914659, 0.20376901, 0.09622411, 
                              0.38810661, 1.9e-07, 0.90504166, 1.9e-07, 0.00803977, 0.03232884, 
                              1.9e-07, 0.02517311, 0.00013941, 0.11415697, 1.9e-07, 1.9e-07, 
                              0.42301297, 1.9e-07, 1.9e-07, 0.05489811, 1.9e-07, 0.3773369, 
                              0.23680447, 0.01163705, 0.01191086, 0.28056447, 1.9e-07, 0.0030056, 
                              0.03705583, 0.0637212, 1.9e-07, 0.00752029, 1.9e-07, 0.10052723, 
                              0.04053147, 0.02817251, 0.0165664, 1.9e-07, 1.9e-07, 0.01264018, 
                              0.018953)), 
                  row.names = c(NA, -100L), 
                  class = c("tbl_df", "tbl", "data.frame"))


model <- betareg(y ~ x, data = data)
data$pred <- predict(model, newdata = data)

summary(data)
       y                   x                  pred       
 Min.   :0.0000002   Min.   :0.0000002   Min.   :0.1854  
 1st Qu.:0.0000002   1st Qu.:0.0000002   1st Qu.:0.1854  
 Median :0.0236885   Median :0.0153738   Median :0.1918  
 Mean   :0.1277322   Mean   :0.0957217   Mean   :0.2366  
 3rd Qu.:0.1142549   3rd Qu.:0.0818748   3rd Qu.:0.2216  
 Max.   :0.9999998   Max.   :0.9050417   Max.   :0.7300
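
For reference, the sweep over link and link.phi looks roughly like this (a sketch; the link names are taken from the betareg documentation):

# Sketch: pair several documented mean links with each precision link and
# compare the mean of the fitted values. As described above, it barely moves.
for (l in c("logit", "probit", "cloglog", "loglog")) {
  for (lp in c("identity", "log", "sqrt")) {
    m <- betareg(y ~ x, data = data, link = l, link.phi = lp)
    cat(sprintf("%-7s / %-8s mean(pred) = %.4f\n",
                l, lp, mean(predict(m, type = "response"))))
  }
}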

There are 2 answers below.


To help answer your question, we can fit five models and plot their predictions:

library(quantreg)
library(betareg)

# Model 1: beta regression on the full data
mod1 <- betareg(y ~ x, data = data, link = "logit")
pred1 <- predict(mod1, newdata = data, type = "response")

# Models 2-4: loess smoother, linear model, median (quantile) regression
mod2 <- loess(y ~ x, data = data)
pred2 <- predict(mod2, data$x)
mod3 <- lm(y ~ x, data = data)
mod4 <- rq(y ~ x, data = data)

# Model 5: beta regression after dropping the observations with y > 0.95
x <- data$x; y <- data$y
y2 <- y[y < 0.95]; x2 <- x[y < 0.95]
mod5 <- betareg(y ~ x, data = data.frame(x = x2, y = y2), link = "logit")
pred5 <- predict(mod5, newdata = data.frame(x = data$x), type = "response")

plot(data$x, data$y, xlab = "x", ylab = "y", pch = 19)
lines(sort(data$x), pred1[order(data$x)], lty = 2)                # betareg, full data (black)
lines(sort(data$x), pred2[order(data$x)], col = "red2", lty = 2)  # loess
abline(mod3, col = "blue", lty = 2)                               # linear model
abline(mod4, col = "green", lty = 2)                              # quantile (median) regression
lines(sort(data$x), pred5[order(data$x)], col = "brown", lty = 2) # betareg, trimmed

[Plot: scatter of y vs x with the five fitted prediction lines overlaid]

In blue you have the linear model, and in red the loess model; as you can see, these tend to follow the conditional mean, E[Y|X]. In green you have the quantile regression, which tends to follow the conditional median, median(Y|X). Finally, in black you have the betareg model, and in brown the beta regression after removing the points with y > 0.95.

Why does removing only 4 points have such a dramatic effect? In beta regression the variance of y is highly heteroscedastic: it is largest at a mean of 0.5 and shrinks to zero as the mean approaches 0 or 1. The 4 observations with y = 0.99999981 therefore have almost no variance; the model treats them as near-certain, and they pull the whole prediction line upwards. Removing them gives you the fit you want.
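
You can see the heteroscedasticity numerically with a small sketch using mod1 and mod5 from above (predict.betareg supports type = "variance", which returns the implied variance mu(1 - mu)/(1 + phi)):

# Sketch: the implied variance vanishes near the boundaries of (0, 1), so
# observations close to y = 1 act as near-noiseless anchors for the fit.
plot(predict(mod1, type = "response"), predict(mod1, type = "variance"),
     xlab = "fitted mean", ylab = "implied variance", pch = 19)

# Compare the coefficients with and without the four boundary points
cbind(full = coef(mod1), trimmed = coef(mod5))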


You could remove the outliers with a multivariate method like the Mahalanobis distance. Try this:

library(dplyr)

Sx <- cov(data)
D2 <- mahalanobis(data, colMeans(data), Sx)

# Number of outliers beyond the 0.001 chi-squared cutoff
sum(D2 > qchisq(0.001, ncol(data), lower.tail = FALSE))

# Keep only the rows within the cutoff and refit
data2 <- filter(data, D2 <= qchisq(0.001, ncol(data), lower.tail = FALSE))
model <- betareg(y ~ x, data = data2)

data$pred <- predict(model, newdata = data)

summary(data)

Output:

#       y                   x                  pred        
# Min.   :0.0000002   Min.   :0.0000002   Min.   :0.06280  
# 1st Qu.:0.0000002   1st Qu.:0.0000002   1st Qu.:0.06280  
# Median :0.0236885   Median :0.0153738   Median :0.06638  
# Mean   :0.1277322   Mean   :0.0957217   Mean   :0.10657  
# 3rd Qu.:0.1142549   3rd Qu.:0.0818748   3rd Qu.:0.08413  
# Max.   :0.9999998   Max.   :0.9050417   Max.   :0.68639
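
As a sanity check, you can inspect which rows the cutoff flags (a sketch; whether these are exactly the boundary points near y = 1 is worth verifying on your data rather than assumed):

# Sketch: list the observations flagged as multivariate outliers
cutoff <- qchisq(0.001, ncol(data), lower.tail = FALSE)
data[D2 > cutoff, c("y", "x")]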