In a df with multiple observations for each ID, how to conditionally find date according to another variable?

319 Views Asked by At

This is the first question I ask on here, I hope to do this correctly!

I have a dataset with millions of observations. Each row is a drug prescription picked up by an individual on a given date, and each individual appears multiple times in the data frame.

library(dplyr)

# Reproducible example: 40 prescriptions drawn for 10 individuals.
set.seed(42)

# Individual identifier, sampled with replacement so IDs can repeat.
ID <- sample(paste0("ID", 1:10), 40, replace = TRUE)

# One distinct first-of-month pick-up date per row (sampled without
# replacement).
prescription_date <- sample(
  seq(as.Date("1999-01-01"), as.Date("2010-01-01"), by = "month"),
  40
)

# 1 = the drug changed relative to the previous prescription, 0 = no change.
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))

df <- data.frame(ID, prescription_date, switch) %>% group_by(ID)
df %>% arrange(ID) %>% print(n = 40)

#> # A tibble: 40 x 3
#> # Groups:   ID [10]
#>    ID    prescription_date switch
#>    <fct> <date>             <dbl>
#>  1 ID1   2007-03-01             1
#>  2 ID1   1999-06-01             0
#>  3 ID1   1999-02-01             1
#>  4 ID1   2006-09-01             0
#>  5 ID10  2008-08-01             0
#>  6 ID10  2000-09-01             1
#>  7 ID10  2001-09-01             1
#>  8 ID10  2001-11-01             1
#>  9 ID10  2000-04-01             1
#> 10 ID10  2004-09-01             1
#> 11 ID2   2008-10-01             1
#> 12 ID2   2003-01-01             0
#> 13 ID2   2005-12-01             0
#> 14 ID2   2000-06-01             0
#> 15 ID3   2007-07-01             1
#> 16 ID3   2007-11-01             0
#> 17 ID4   1999-03-01             1
#> 18 ID4   2003-10-01             0
#> 19 ID4   1999-05-01             1
#> 20 ID4   2007-10-01             1
#> 21 ID4   2005-04-01             0
#> 22 ID4   2009-05-01             1
#> 23 ID4   2005-10-01             0
#> 24 ID4   2003-07-01             0
#> 25 ID5   2008-06-01             1
#> 26 ID5   2002-04-01             1
#> 27 ID5   2005-01-01             0
#> 28 ID5   2001-05-01             0
#> 29 ID5   2009-09-01             1
#> 30 ID6   2006-08-01             0
#> 31 ID6   2000-12-01             0
#> 32 ID7   2007-06-01             0
#> 33 ID8   2008-11-01             1
#> 34 ID8   1999-09-01             0
#> 35 ID8   2007-05-01             0
#> 36 ID8   2009-03-01             1
#> 37 ID9   2009-10-01             0
#> 38 ID9   1999-10-01             1
#> 39 ID9   2007-04-01             0
#> 40 ID9   2008-01-01             0

Created on 2021-06-19 by the reprex package (v0.3.0)

The variable "switch" indicates whether the individual switched drugs in that prescription with respect to the previous prescription. I need to know the date on which each individual switched drugs for the third time. However, I am having a hard time, since I can't seem to create a running (cumulative) sum of the variable "switch" over each individual's observations. It would be enough to create something similar to this:


#> # A tibble: 40 x 3
#> # Groups:   ID [10]
#>    ID    prescription_date switch date3switch
#>    <fct> <date>             <dbl>       <dbl>
#>  1 ID1   1999-02-01             1           1
#>  2 ID1   1999-06-01             0           NA
#>  3 ID1   2006-09-01             0           NA
#>  4 ID1   2007-03-01             1           2
#>  5 ID10  2000-04-01             1           1
#>  6 ID10  2000-09-01             1           2
#>  7 ID10  2001-09-01             1           3
#>  8 ID10  2001-11-01             1           4
#>  9 ID10  2004-09-01             1           5
#> 10 ID10  2008-08-01             0          NA
#> 11 ID2   2000-06-01             0          NA
#> 12 ID2   2003-01-01             0          NA
#> 13 ID2   2005-12-01             0          NA
#> 14 ID2   2008-10-01             1           1
#> 15 ID3   2007-07-01             1           1
#> 16 ID3   2007-11-01             0          NA
#> 17 ID4   1999-03-01             1           1
#> 18 ID4   1999-05-01             1           2
#> 19 ID4   2003-07-01             0          NA
#> 20 ID4   2003-10-01             0          NA
#> 21 ID4   2005-04-01             0          NA
#> 22 ID4   2005-10-01             0          NA
#> 23 ID4   2007-10-01             1           3
#> 24 ID4   2009-05-01             1           4

I tried writing a for loop, but I guess it's too advanced for my beginner skills, because all I manage to create is a NULL object instead of a data frame.

# Attempted row-by-row loop (does not work as intended). Issues visible here:
#  * A `for` loop evaluates to NULL, so `df <- for (...)` overwrites df
#    with NULL whatever the body does.
#  * The loop index `i` is never used in the body, so every iteration
#    evaluates the same expression.
#  * `data$switch` refers to an object called `data`, not to `df`; no `data`
#    object is created in the code shown -- presumably `df` was meant.
#  * `sum(...)` adds up the whole column rather than keeping a running
#    count per individual.
#  * `mutate()` is called without a data frame, and `==` is a comparison,
#    not the `=` used to name a new column.
#  * `prescribed_date` does not match the column name `prescription_date`.
df <- for (i in 1:dim(df)[1]) {
  if(sum(data$switch) == 3) 
  { mutate(date3switch == prescribed_date)}
  else NA
  }

Created on 2021-06-19 by the reprex package (v0.3.0)

I appreciate your help!

3

There are 3 best solutions below

1
On BEST ANSWER

Use `cumsum` to get a running count of switches within each ID, then replace the values where `switch == 0` with `NA`.

library(dplyr)

# Sort chronologically within each ID first: cumsum() counts in row order,
# so without the date ordering the n-th counted switch would not be the
# n-th switch in time.
df %>%
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  mutate(date3switch = cumsum(switch),
         # Keep the running count only on rows where a switch happened.
         date3switch = replace(date3switch, switch == 0, NA)) %>%
  ungroup()

#    ID    prescription_date switch date3switch
#   <chr> <date>             <dbl>       <dbl>
# 1 ID1   1999-02-01             1           1
# 2 ID1   1999-06-01             0          NA
# 3 ID1   2006-09-01             0          NA
# 4 ID1   2007-03-01             1           2
# 5 ID10  2000-04-01             1           1
# 6 ID10  2000-09-01             1           2
# 7 ID10  2001-09-01             1           3
# 8 ID10  2001-11-01             1           4
# 9 ID10  2004-09-01             1           5
#10 ID10  2008-08-01             0          NA
# … with 30 more rows
0
On

We can use na_if

library(dplyr)
# cumsum(switch) * switch is 0 on every row without a switch, so na_if()
# turns those rows into NA while switch rows keep their running count.
# (na_if(cumsum(switch), 0) alone would only blank rows before the first
# switch.) Rows are sorted by date within ID so the count is chronological.
df %>%
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  mutate(date3switch = na_if(cumsum(switch) * switch, 0)) %>%
  ungroup()
0
On
library(dplyr)
# Rebuild the example data from the question (same seed, same draws).
set.seed(42)
ID <- sample(paste0("ID", 1:10), 40, replace = TRUE)
prescription_date <- sample(
  seq(as.Date("1999-01-01"), as.Date("2010-01-01"), by = "month"),
  40
)
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))
df <- data.frame(ID, prescription_date, switch) %>% group_by(ID)

# Order each individual's prescriptions chronologically, then number the
# switches with a running sum; rows without a switch get NA.
df %>%
  group_by(ID) %>%
  arrange(prescription_date, .by_group = TRUE) %>%
  mutate(switch2 = ifelse(switch == 0, NA, cumsum(switch))) %>%
  print(n = 40)
#> # A tibble: 40 x 4
#> # Groups:   ID [10]
#>    ID    prescription_date switch switch2
#>    <chr> <date>             <dbl>   <dbl>
#>  1 ID1   1999-02-01             1       1
#>  2 ID1   1999-06-01             0      NA
#>  3 ID1   2006-09-01             0      NA
#>  4 ID1   2007-03-01             1       2
#>  5 ID10  2000-04-01             1       1
#>  6 ID10  2000-09-01             1       2
#>  7 ID10  2001-09-01             1       3
#>  8 ID10  2001-11-01             1       4
#>  9 ID10  2004-09-01             1       5
#> 10 ID10  2008-08-01             0      NA
#> 11 ID2   2000-06-01             0      NA
#> 12 ID2   2003-01-01             0      NA
#> 13 ID2   2005-12-01             0      NA
#> 14 ID2   2008-10-01             1       1
#> 15 ID3   2007-07-01             1       1
#> 16 ID3   2007-11-01             0      NA
#> 17 ID4   1999-03-01             1       1
#> 18 ID4   1999-05-01             1       2
#> 19 ID4   2003-07-01             0      NA
#> 20 ID4   2003-10-01             0      NA
#> 21 ID4   2005-04-01             0      NA
#> 22 ID4   2005-10-01             0      NA
#> 23 ID4   2007-10-01             1       3
#> 24 ID4   2009-05-01             1       4
#> 25 ID5   2001-05-01             0      NA
#> 26 ID5   2002-04-01             1       1
#> 27 ID5   2005-01-01             0      NA
#> 28 ID5   2008-06-01             1       2
#> 29 ID5   2009-09-01             1       3
#> 30 ID6   2000-12-01             0      NA
#> 31 ID6   2006-08-01             0      NA
#> 32 ID7   2007-06-01             0      NA
#> 33 ID8   1999-09-01             0      NA
#> 34 ID8   2007-05-01             0      NA
#> 35 ID8   2008-11-01             1       1
#> 36 ID8   2009-03-01             1       2
#> 37 ID9   1999-10-01             1       1
#> 38 ID9   2007-04-01             0      NA
#> 39 ID9   2008-01-01             0      NA
#> 40 ID9   2009-10-01             0      NA

Created on 2021-06-19 by the reprex package (v2.0.0)