Add predictions for models by group

3k Views Asked by At

I'm estimating regressions models by groups in my dataset and then I wish to add the correct fitted values for all groups.

I'm trying the following:

library(dplyr)
library(modelr)

df <- tribble(
  ~year, ~country, ~value,
  2001, "France", 55, 
  2002, "France", 53, 
  2003, "France", 31, 
  2004, "France", 10, 
  2005, "France", 30, 
  2006, "France", 37, 
  2007, "France", 54, 
  2008, "France", 58, 
  2009, "France", 50, 
  2010, "France", 40, 
  2011, "France", 49, 
  2001, "USA", 55,
  2002, "USA", 53,
  2003, "USA", 64,
  2004, "USA", 40,
  2005, "USA", 30,
  2006, "USA", 39,
  2007, "USA", 55,
  2008, "USA", 53,
  2009, "USA", 71,
  2010, "USA", 44,
  2011, "USA", 40
)

rmod <- df %>% 
  group_by(country) %>% 
  do(fitModels = lm("value ~ year", data = .))

df <- df %>% 
  add_predictions(rmod)

which throws the error:

Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('rowwise_df', 'tbl_df', 'tbl', 'data.frame')"

I would like to get either one column with each of the fitted values for the country or one column with predictions per country. Somehow the add_predictions() function doesn't seem to work when the models are saved as a list after a do() call.

3

There are 3 best solutions below

1
On BEST ANSWER

There are a couple of additional ways you can attack this.

Probably the most direct, but you lose the intermediate model:

rmod <- df %>%
  group_by(country) %>%
  mutate(fit = lm(value ~ year)$fitted.values) %>%
  ungroup
rmod
# # A tibble: 22 × 4
#     year country value      fit
#    <dbl>   <chr> <dbl>    <dbl>
# 1   2001  France    55 38.13636
# 2   2002  France    53 39.00000
# 3   2003  France    31 39.86364
# 4   2004  France    10 40.72727
# 5   2005  France    30 41.59091
# 6   2006  France    37 42.45455
# 7   2007  France    54 43.31818
# 8   2008  France    58 44.18182
# 9   2009  France    50 45.04545
# 10  2010  France    40 45.90909
# # ... with 12 more rows

Another way uses a "tidy" model for enclosing data, models, and results into individual cells within the frame:

rmod <- df %>%
  group_by(country) %>%
  nest() %>%
  mutate(mdl = map(data, ~ lm(value ~ year, data=.))) %>%
  mutate(fit = map(mdl, ~ .$fitted.values))
rmod
# # A tibble: 2 × 4
#   country              data      mdl        fit
#     <chr>            <list>   <list>     <list>
# 1  France <tibble [11 × 2]> <S3: lm> <dbl [11]>
# 2     USA <tibble [11 × 2]> <S3: lm> <dbl [11]>

The advantage to this method is that you can, as needed, access other properties of the model as-needed, perhaps summary( filter(rmod, country == "France")$mdl[[1]] ). (The [[1]] is required because with tibbles, $mdl will always return a list.)

And you can extract/unnest it as follows:

select(rmod, -mdl) %>% unnest()
# # A tibble: 22 × 4
#    country      fit  year value
#      <chr>    <dbl> <dbl> <dbl>
# 1   France 38.13636  2001    55
# 2   France 39.00000  2002    53
# 3   France 39.86364  2003    31
# 4   France 40.72727  2004    10
# 5   France 41.59091  2005    30
# 6   France 42.45455  2006    37
# 7   France 43.31818  2007    54
# 8   France 44.18182  2008    58
# 9   France 45.04545  2009    50
# 10  France 45.90909  2010    40
# # ... with 12 more rows

(The columns are re-ordered, unfortunately, but that's aesthetic and easily remedied.)

EDIT

If you want/need to use modelr-specifics here, try:

rmod <- df %>%
  group_by(country) %>%
  nest() %>%
  mutate(mdl = map(data, ~ lm(value ~ year, data=.))) %>%
  mutate(fit = map(mdl, ~ .$fitted.values)) %>%
  mutate(data = map2(data, mdl, add_predictions))
rmod
# # A tibble: 2 x 4
#   country data              mdl      fit       
#   <chr>   <list>            <list>   <list>    
# 1 France  <tibble [11 x 3]> <S3: lm> <dbl [11]>
# 2 USA     <tibble [11 x 3]> <S3: lm> <dbl [11]>
select(rmod, -mdl, -fit) %>% unnest()
# # A tibble: 22 x 4
#    country  year value  pred
#    <chr>   <dbl> <dbl> <dbl>
#  1 France  2001.   55.  38.1
#  2 France  2002.   53.  39.0
#  3 France  2003.   31.  39.9
#  4 France  2004.   10.  40.7
#  5 France  2005.   30.  41.6
#  6 France  2006.   37.  42.5
#  7 France  2007.   54.  43.3
#  8 France  2008.   58.  44.2
#  9 France  2009.   50.  45.0
# 10 France  2010.   40.  45.9
# # ... with 12 more rows
0
On

Here is an alternative approach using the broom package instead of modelr. augment adds the fitted values as well as other useful info, such as the residuals to the original observations. It is designed to work perfectly with the output of a grouped model fit with do. See below:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(broom)

df <- tribble(
  ~year, ~country, ~value,
  2001, "France", 55, 
  2002, "France", 53, 
  2003, "France", 31, 
  2004, "France", 10, 
  2005, "France", 30, 
  2006, "France", 37, 
  2007, "France", 54, 
  2008, "France", 58, 
  2009, "France", 50, 
  2010, "France", 40, 
  2011, "France", 49, 
  2001, "USA", 55,
  2002, "USA", 53,
  2003, "USA", 64,
  2004, "USA", 40,
  2005, "USA", 30,
  2006, "USA", 39,
  2007, "USA", 55,
  2008, "USA", 53,
  2009, "USA", 71,
  2010, "USA", 44,
  2011, "USA", 40
)

rmod <- df %>% 
  group_by(country) %>% 
  do(fitModels = lm("value ~ year", data = .))

rmod %>% 
  augment(fitModels)
#> # A tibble: 22 x 10
#> # Groups:   country [2]
#>    country value  year .fitted .se.fit .resid   .hat .sigma .cooksd
#>    <chr>   <dbl> <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
#>  1 France    55. 2001.    38.1    8.49  16.9  0.318    14.2 0.430  
#>  2 France    53. 2002.    39.0    7.31  14.0  0.236    14.9 0.176  
#>  3 France    31. 2003.    39.9    6.25  -8.86 0.173    15.6 0.0438 
#>  4 France    10. 2004.    40.7    5.37 -30.7  0.127    10.9 0.349  
#>  5 France    30. 2005.    41.6    4.76 -11.6  0.1000   15.4 0.0366 
#>  6 France    37. 2006.    42.5    4.54  -5.45 0.0909   15.8 0.00723
#>  7 France    54. 2007.    43.3    4.76  10.7  0.100    15.5 0.0311 
#>  8 France    58. 2008.    44.2    5.37  13.8  0.127    15.1 0.0705 
#>  9 France    50. 2009.    45.0    6.25   4.95 0.173    15.8 0.0137 
#> 10 France    40. 2010.    45.9    7.31  -5.91 0.236    15.8 0.0313 
#> # ... with 12 more rows, and 1 more variable: .std.resid <dbl>

Created on 2018-04-19 by the reprex package (v0.2.0).

0
On

I would do the following with data.table:

library(data.table)
setDT(df) # convert to data.table
df[ , value_hat := lm(value ~ year)$fitted.values, by = country]

If you've got NAs, one option is:

df[complete.cases(df), value_hat := lm(value ~ year)$fitted.values, by = country]

and another actually uses predict:

df[ , value_hat := predict(lm(value ~ year), .SD), by = country]