Select and apply correct model from different data frame using purrr

84 Views Asked by At

In my data, I have correlated data (diet and liver) for 50+ different compounds (simplified here).

library(tidyverse)
Sigma <- matrix(.7, nrow=6, ncol=6) + diag(6)*.3
vars_tr <- data.frame(MASS::mvrnorm(n=10, mu=c(2:7), Sigma=Sigma))

tr<-tibble(
  compound=c(rep("A", 10), rep("B", 10), rep("C",10)),
  diet=c(vars_tr$X1, vars_tr$X2, vars_tr$X3),
  liver=c(vars_tr$X4, vars_tr$X5, vars_tr$X6))

Following the guidance on doing regressions for multiple models, I created a nested data frame and stored the output (learning this method this week was a lifesaver!).

model<-function(df){lm(data=df, liver~diet)}

mods<- tr %>%
  group_by(compound) %>%
  nest() %>%
  mutate(model=map(data, model))

Now I have new 'diet' data for which no 'liver' data exists.

new<-tibble(
  compound=c(rep("A", 10), rep("B", 10), rep("C",10)),
  diet=c(rnorm(10, 4), rnorm(10, 5), rnorm(10,6)))

What I would like to do is take advantage of purrr generate a liver concentration for each diet concentration using the correct model for the compound. My best attempt looks like:

preds<-function(c, x){    
  add_predictions(tibble(diet=x), filter(mods, compound==c)$model[[1]], 'liver')$liver
}

new%>%
  mutate(liver=map2(compound, diet, preds))

which returns an error.

I would greatly appreciate any help!

EDIT 6/4/2020:

Based on the helpful comments from Bruno and Ronak Shah below, I've made some progress but haven't found the solution. Both suggest joining the models to the existing table, which makes way more sense than what I was doing.

Based on that, it is relatively simple to do the following:

new_mods<-
  new%>%
  group_by(compound)%>%
  nest()%>%
  left_join(., select(mods_d, compound, model), , by='compound')%>%
  mutate(predicts = map2(data, model, add_predictions))%>%
  unnest(predicts)
2

There are 2 best solutions below

3
On

You can create a function for prediction :

preds<-function(data, mod){   
   modelr::add_predictions(data, mod)$liver
}

nest the dataframe for each compound, join with mods and apply the respective model for each group of data.

library(dplyr)
new %>%
   tidyr::nest(data = diet) %>%
   left_join(mods, by = 'compound') %>%
   mutate(liver = purrr::map2(data.y, model, preds))


# A tibble: 3 x 5
#  compound data.x            data.y            model  liver     
#  <chr>    <list>            <list>            <list> <list>    
#1 A        <tibble [10 × 1]> <tibble [10 × 2]> <lm>   <dbl [10]>
#2 B        <tibble [10 × 1]> <tibble [10 × 2]> <lm>   <dbl [10]>
#3 C        <tibble [10 × 1]> <tibble [10 × 2]> <lm>   <dbl [10]>

Based on the requirement you can select relevant columns and unnest the results if needed.

2
On

You can use a join operation and keep working on tibbles

library(tidyverse)
library(MASS)

Sigma <- matrix(.7, nrow=6, ncol=6) + diag(6)*.3
vars_tr <- data.frame(mvrnorm(n=10, mu=c(2:7), Sigma=Sigma))

tr<-tibble(
  compound=c(rep("A", 10), rep("B", 10), rep("C",10)),
  diet=c(vars_tr$X1, vars_tr$X2, vars_tr$X3),
  liver=c(vars_tr$X4, vars_tr$X5, vars_tr$X6))

model<-function(df){lm(data=df, liver~diet)}

mods<- tr %>%
  nest_by(compound) %>% 
  mutate(model = list(model(data)))

new<-tibble(
  compound=c(rep("A", 10), rep("B", 10), rep("C",10)),
  diet=c(rnorm(10, 4), rnorm(10, 5), rnorm(10,6)))

new_nest <- new %>% 
  nest_by(compound)

results <- mods %>% 
  left_join(new_nest,by = "compound") %>% 
  mutate(predicts = list(predict(model,data.y)))