looping over several dataframes using coxph and mstate in R

60 Views Asked by At

How do I write a function or loop to iterate this procedure, which uses the mstate package, over several dataframes (cohort1 - cohort25)?

sample data:

impute1 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 22.7, 3.4), 
                  TOBACCO = c(0, 0, 0, 1), 
                  MARRIED = c(1,0,1,0), 
                  PARITY = c(2,1,1,2)) 

impute2 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 22.7, 3.4), 
                  TOBACCO = c(0, 1, 0, 1), 
                  MARRIED = c(1,0,1,1), 
                  PARITY = c(1,1,1,2)) 


covs<-c("TOBACCO", "MARRIED", "PARITY")

Code to run model over 1 dataframe:

# Prepare impute1 with msprep(), keeping the covariates named in covs and
# using unique_ID as the subject identifier.
# NOTE(review): tmat is not defined anywhere in this snippet - presumably a
# transition matrix created earlier; confirm.
cohort1 <- msprep(data=impute1,trans=tmat, 
             time=c(NA,"YEARS_CVD_HOSP","YEARS_CVD_DEATH"),
             status=c(NA,"CVD_ANY","DIED_INDICATOR"), 
             keep=covs,
             id = as.vector(impute1$unique_ID))

# Expand the covariates listed in covs; the result is stored in cohort_expand.
cohort_expand<-expand.covs(cohort1, covs, append = TRUE, longnames = FALSE)

# Cox model stratified by transition, with Breslow handling of ties.
# NOTE(review): the model is fitted on cohort1, not cohort_expand. The
# TOBACCO.1/.2/.3 terms are presumably only created by expand.covs(), so
# data=cohort_expand looks like what was intended - confirm.
c1<-coxph(Surv(Tstart, Tstop, status)~TOBACCO.1 + TOBACCO.2 + 
TOBACCO.3 + strata(trans),     data=cohort1, method = "breslow")
summary(c1)

# One row per transition (1:3) with every TOBACCO term set to 0.
newdata<-data.frame(trans=1:3, TOBACCO.1 = c(0,0,0), TOBACCO.2 = 
c(0,0,0), TOBACCO.3 = c(0,0,0), strata = 1:3)

# Pass the fitted model, newdata and the transition matrix to msfit().
msf1<-msfit(c1, newdata, trans=tmat)

plot(msf1, las=1, lty=rep(1:2,c(8,4)), xlab="Years")

I loaded 25 dataframes into a list:

# Directory that holds the imputed SAS datasets (left blank in the post).
path <- ''
print(path)

# list.files() expects a regular expression, not a shell glob, so match the
# literal ".sas7bdat" extension at the end of the name. full.names = TRUE
# returns paths that already include `path`, so nothing has to be pasted
# together by hand (which only worked when `path` ended in a separator).
files <- list.files(path = path, pattern = "\\.sas7bdat$", full.names = TRUE)
print(files)

# Read every file exactly once into a preallocated list. seq_along() makes the
# loop a no-op when no files are found, whereas 1:length(files) would iterate
# over c(1, 0).
impute <- vector("list", length(files))
for (i in seq_along(files)) {
  filename <- files[i]
  print(filename)
  impute[[i]] <- haven::read_sas(data_file = filename)
  print(names(impute[[i]]))
  # Also expose the data frame as impute1, impute2, ... because the rest of
  # the code refers to those names. assign() does this without building and
  # parsing code from strings, and reuses the data that was already read.
  assign(paste0("impute", i), impute[[i]])
}

I'm trying to write a function for the msprep step using the first two imputed datasets.

test_list <- list(impute1, impute2)

# Wrapper around mstate::msprep() that is applied to each element of
# test_list via lapply().
# NOTE(review): the parameter x is never used in the body. Both `data` and
# `id` refer to test_list (the whole list of data frames) rather than to the
# single data frame passed in as x. test_list is an unnamed list, so
# test_list$unique_ID is NULL, which matches the reported
# 'argument "id" is not a vector' error.
my_func <- function(x) {
    cohort<-mstate::msprep(data=test_list,trans=tmat, 
               time=c(NA,"YEARS_CVD_HOSP","YEARS_CVD_DEATH"),
               status=c(NA,"CVD_ANY","DIED_INDICATOR"), 
               keep=covs,
               id = as.vector(test_list$unique_ID))
}


test<-lapply(test_list, my_func)

I get an error:

Error in mstate::msprep(data = test_list, trans = tmat, time = c(NA, "YEARS_CVD_HOSP", : argument "id" is not a vector

How can I specify unique_ID as a vector even though it is a list?

Next function:

test<-lapply(test_list, my_func2)

# Draft wrapper around expand.covs().
# NOTE(review): the parameter x is never used, and `cohort` is not defined
# inside this function, so expand.covs() would have to find it in an enclosing
# environment. The chained `cohort<-mstate::cohort<-...` also assigns to
# `mstate::cohort`, which looks unintended - presumably
# `mstate::expand.covs(x, ...)` was meant; confirm.
my_func2 <- function(x) {
             cohort<-mstate::cohort<-expand.covs(cohort, covs, append = 
             TRUE, longnames = FALSE)
}

I've also tried using the lapply function to iterate the coxph model over the datasets, but how do I write the code for the entire procedure?

# Attempt to build the model formula once per element of mydf.
# NOTE(review): mydf is not defined in this snippet.
c1<- lapply(mydf , function(i) {
  
 # NOTE(review): the format string contains no conversion specifier, so i is
 # not interpolated into the formula. The function only builds and returns a
 # formula; coxph() is never called here.
 iformula <- as.formula(sprintf("Surv(Tstart, Tstop, status) 
~TOBACCO.1 + TOBACCO.2 + TOBACCO.3 +   strata(trans)", i))  

})
1

There are 1 best solutions below

1
On

I know nothing about proportional hazards regression, but I took your code and put it into a function. It's fairly simple, with a single argument, x, which is passed on to the data and id arguments in msprep(). The rest is kept pretty much as is, except data in coxph() was changed to cohort_expand. tmat and newdata were moved outside the function, as they appear to be constant.

library(mstate)

impute1 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 20.7, 3.4), 
                  TOBACCO = c(0, 0, 0, 1), 
                  MARRIED = c(1,0,1,0), 
                  PARITY = c(2,1,1,2)) 

impute2 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 21.7, 3.4), 
                  TOBACCO = c(0, 1, 0, 1), 
                  MARRIED = c(1,0,1,1), 
                  PARITY = c(1,1,1,2)) 


test_list <- list(impute1, impute2)

covs <- c("TOBACCO", "MARRIED", "PARITY")

tmat <- trans.illdeath()

newdata <- data.frame(trans=1:3, TOBACCO.1=c(0,0,0), 
  TOBACCO.2=c(0,0,0), TOBACCO.3=c(0,0,0), strata=1:3)


#' Run the msprep -> expand.covs -> coxph -> msfit pipeline on one data frame
#'
#' Relies on the objects `tmat`, `covs` and `newdata` that are defined outside
#' the function.
#'
#' @param x A data frame providing `unique_ID`, the time columns
#'   `YEARS_CVD_HOSP` / `YEARS_CVD_DEATH`, the status columns `CVD_ANY` /
#'   `DIED_INDICATOR`, and the columns named in `covs`.
#' @return The result of `msfit()` for the Cox model fitted to `x`.
my_func2 <- function(x) {
  # Step 1: prepare the data with msprep(), keeping the covariates in `covs`.
  long_data <- msprep(
    data = x,
    trans = tmat,
    time = c(NA, "YEARS_CVD_HOSP", "YEARS_CVD_DEATH"),
    status = c(NA, "CVD_ANY", "DIED_INDICATOR"),
    keep = covs,
    id = x$unique_ID
  )

  # Step 2: append the expanded covariate columns.
  expanded <- expand.covs(
    long_data,
    covs,
    append = TRUE,
    longnames = FALSE
  )

  # Step 3: Cox model on the expanded data, stratified by transition.
  cox_fit <- coxph(
    Surv(Tstart, Tstop, status) ~
      TOBACCO.1 + TOBACCO.2 + TOBACCO.3 + strata(trans),
    data = expanded,
    method = "breslow"
  )

  # Step 4: the msfit() result is the function's return value.
  msfit(cox_fit, newdata, trans = tmat)
}

# Apply my_func2 to every data frame in test_list; fits holds one result per
# data frame.
fits <- lapply(test_list, my_func2)

# Two-row, one-column plotting layout, one panel per element of fits.
par(mfrow=c(2, 1), mar=c(3, 3, 1, 1), mgp=c(1.5, 0.5, 0))
# plot() is called for its side effect; zzz only captures lapply()'s return
# value.
zzz <- lapply(fits, plot)

enter image description here