looping over several dataframes using coxph and mstate in R

Question

looping over several dataframes using coxph and mstate in R

57 Views Asked by user22829834 At 17 August 2025 at 23:36

How do I write a function or loop to iterate this procedure, which uses the mstate package, over several dataframes (cohort1 - cohort25)?

sample data:

# First example data set: four subjects identified by unique_ID.
impute1 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 22.7, 3.4), 
                  TOBACCO = c(0, 0, 0, 1), 
                  MARRIED = c(1,0,1,0), 
                  PARITY = c(2,1,1,2)) 

# Second example data set: same columns as impute1, with different values for
# TOBACCO, MARRIED and PARITY.
impute2 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 22.7, 3.4), 
                  TOBACCO = c(0, 1, 0, 1), 
                  MARRIED = c(1,0,1,1), 
                  PARITY = c(1,1,1,2)) 


# Names of the covariate columns; passed as `keep` to msprep() and as the
# covariate list to expand.covs() in the code below.
covs<-c("TOBACCO", "MARRIED", "PARITY")

Code to run model over 1 dataframe:

# NOTE(review): `tmat` is not defined in this snippet; it has to exist before
# msprep() is called.
cohort1 <- msprep(data=impute1,trans=tmat, 
             time=c(NA,"YEARS_CVD_HOSP","YEARS_CVD_DEATH"),
             status=c(NA,"CVD_ANY","DIED_INDICATOR"), 
             keep=covs,
             id = as.vector(impute1$unique_ID))

# Expand the covariates named in `covs`; the result is stored in cohort_expand.
cohort_expand<-expand.covs(cohort1, covs, append = TRUE, longnames = FALSE)

# NOTE(review): the model is fitted on `cohort1`, not on `cohort_expand`.
# Presumably TOBACCO.1-TOBACCO.3 are created by expand.covs() and therefore
# exist only in cohort_expand — confirm and use data=cohort_expand if so.
c1<-coxph(Surv(Tstart, Tstop, status)~TOBACCO.1 + TOBACCO.2 + 
TOBACCO.3 + strata(trans),     data=cohort1, method = "breslow")
summary(c1)

# Covariate values handed to msfit(): every TOBACCO.* term is 0, with one row
# for each value of trans/strata (1 to 3).
newdata<-data.frame(trans=1:3, TOBACCO.1 = c(0,0,0), TOBACCO.2 = 
c(0,0,0), TOBACCO.3 = c(0,0,0), strata = 1:3)

msf1<-msfit(c1, newdata, trans=tmat)

plot(msf1, las=1, lty=rep(1:2,c(8,4)), xlab="Years")

I loaded 25 dataframes into a list:

# Directory that holds the SAS data sets (left empty in the question).
path <- ''
print(path)
# `pattern` is a regular expression, not a shell glob, so match file names that
# end in ".sas7bdat" explicitly. full.names = TRUE returns paths that can be
# read directly, so directory and file name need not be pasted together.
files <- list.files(path = path, pattern = "\\.sas7bdat$", full.names = TRUE)
print(files)

# Read every file exactly once into a preallocated list. seq_along() makes the
# loop a no-op when no file matches (1:length(files) would iterate over 1, 0).
impute <- vector("list", length(files))
for (i in seq_along(files)) {
  filename <- files[i]
  print(filename)
  impute[[i]] <- haven::read_sas(data_file = filename)
  print(names(impute[[i]]))
  # Also expose the data set as impute1, impute2, ... without re-reading the
  # file and without eval(parse()).
  assign(paste0("impute", i), impute[[i]])
}

I'm trying to write a function for the msprep step using the first two imputed datasets.

test_list <- list(impute1, impute2)

# NOTE(review): the parameter `x` (one data frame per lapply() call) is never
# used. `data` receives the whole `test_list`, and `test_list$unique_ID` looks
# up an element named "unique_ID" in an unnamed list, so `id` is NULL rather
# than a vector. Presumably `data = x` and `id = x$unique_ID` were intended.
my_func <- function(x) {
    cohort<-mstate::msprep(data=test_list,trans=tmat, 
               time=c(NA,"YEARS_CVD_HOSP","YEARS_CVD_DEATH"),
               status=c(NA,"CVD_ANY","DIED_INDICATOR"), 
               keep=covs,
               id = as.vector(test_list$unique_ID))
}


test<-lapply(test_list, my_func)

I get an error:

Error in mstate::msprep(data = test_list, trans = tmat, time = c(NA, "YEARS_CVD_HOSP", : argument "id" is not a vector

How can I specify unique_ID as a vector even though it is a list?

Next function:

test<-lapply(test_list, my_func2)

# NOTE(review): `mstate::` is attached to the assignment target `cohort`
# instead of to the expand.covs() call, the parameter `x` is never used, and
# `cohort` is not defined inside this function. Presumably
# `mstate::expand.covs(x, covs, ...)` was intended. Note also that the lapply()
# call above runs before my_func2 has been defined.
my_func2 <- function(x) {
             cohort<-mstate::cohort<-expand.covs(cohort, covs, append = 
             TRUE, longnames = FALSE)
}

I've also tried using the lapply function to iterate the coxph model over the datasets, but how do I write the code for the entire procedure?

# NOTE(review): `mydf` is not defined in the snippets shown. The format string
# contains no % placeholder, so `i` is not interpolated into it, and the
# function only builds (and returns) a formula — no coxph() model is fitted.
c1<- lapply(mydf , function(i) {
  
 iformula <- as.formula(sprintf("Surv(Tstart, Tstop, status) 
~TOBACCO.1 + TOBACCO.2 + TOBACCO.3 +   strata(trans)", i))  

})

Original Q&A

There is 1 best solution below

**AkselA** · Answer 1

I know nothing about proportional hazards regression, but I took your code and put it into a function. It's fairly simple, with a single argument, x, which is passed on to the data and id arguments in msprep(). The rest is kept pretty much as is, except data in coxph() was changed to cohort_expand. tmat and newdata were moved outside the function, as they appear to be constant.

library(mstate)

# Example data as in the question, except that YEARS_CVD_HOSP for subject 3 is
# 20.7 here (22.7 in the question).
impute1 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 20.7, 3.4), 
                  TOBACCO = c(0, 0, 0, 1), 
                  MARRIED = c(1,0,1,0), 
                  PARITY = c(2,1,1,2)) 

# As in the question, except that YEARS_CVD_HOSP for subject 3 is 21.7 here
# (22.7 in the question).
impute2 <- data.frame(unique_ID = c(1,2,3,4), 
                  DIED_INDICATOR = c(0,1,1,1), 
                  CVD_ANY = c(0,1,1,0), 
                  YEARS_CVD_DEATH = c(15.9, 23.6, 22.7, 3.4), 
                  YEARS_CVD_HOSP = c(15.9, 11.4, 21.7, 3.4), 
                  TOBACCO = c(0, 1, 0, 1), 
                  MARRIED = c(1,0,1,1), 
                  PARITY = c(1,1,1,2)) 


# The data frames to iterate over; my_func2() is applied to each element.
test_list <- list(impute1, impute2)

# Covariate column names, used as `keep` in msprep() and in expand.covs().
covs <- c("TOBACCO", "MARRIED", "PARITY")

# Object returned by trans.illdeath() (presumably the illness-death transition
# matrix); passed as `trans` to both msprep() and msfit().
tmat <- trans.illdeath()

# Covariate values handed to msfit(): every TOBACCO.* term is 0, with one row
# for each value of trans/strata (1 to 3).
newdata <- data.frame(trans=1:3, TOBACCO.1=c(0,0,0), 
  TOBACCO.2=c(0,0,0), TOBACCO.3=c(0,0,0), strata=1:3)


# Run the whole msprep -> expand.covs -> coxph -> msfit sequence on one data
# frame.
#
# x: a data frame with the columns unique_ID, YEARS_CVD_HOSP, YEARS_CVD_DEATH,
#    CVD_ANY, DIED_INDICATOR and the covariates named in `covs`.
# Relies on the objects `tmat`, `covs` and `newdata` defined outside the
# function.
# Returns the value of msfit() for the Cox model fitted to x.
my_func2 <- function(x) {
    # Step 1: build the multi-state data set from x.
    ms_data <- msprep(
      data = x,
      trans = tmat,
      time = c(NA, "YEARS_CVD_HOSP", "YEARS_CVD_DEATH"),
      status = c(NA, "CVD_ANY", "DIED_INDICATOR"),
      keep = covs,
      id = x$unique_ID
    )

    # Step 2: expand the covariates and append them to the data; the model
    # below is fitted on this expanded data set.
    ms_expanded <- expand.covs(ms_data, covs, append = TRUE, longnames = FALSE)

    # Step 3: Cox model with the three TOBACCO terms, stratified by trans.
    cox_fit <- coxph(
      Surv(Tstart, Tstop, status) ~
        TOBACCO.1 + TOBACCO.2 + TOBACCO.3 + strata(trans),
      data = ms_expanded,
      method = "breslow"
    )

    # Step 4: the msfit() result is the function's return value.
    msfit(cox_fit, newdata, trans = tmat)
}

fits <- lapply(test_list, my_func2)

# Stack the plots in a 2 x 1 layout (one panel per element of `fits`) and keep
# the previous graphical parameters so they can be restored afterwards.
old_par <- par(mfrow=c(2, 1), mar=c(3, 3, 1, 1), mgp=c(1.5, 0.5, 0))
# plot() is called for its side effect only; invisible() suppresses printing of
# the list returned by lapply() without creating a throwaway variable.
invisible(lapply(fits, plot))
# Put the graphics device settings back as they were.
par(old_par)

looping over several dataframes using coxph and mstate in R

There is 1 best solution below

Related Questions in R

Related Questions in FOR-LOOP

Related Questions in APPLY

Related Questions in LAPPLY

Related Questions in SURVIVAL

Trending Questions

Popular # Hashtags

Popular Questions