Import multiple JSON files and predict the STATUS using the time series data in R

I am working on pulling multiple JSON files into R.

My JSON data looks like this (only 2 samples are shown here, but there are close to 800):

{
  "ID": 9, 
  "BCOUNT": 800,
  "MEASUREMENT": [ 
    {
      "MEAS_ID": 1,
      "PDATETIME": "2017-01-14 16:00:59", 
      "STATUS": "Pass",
      "PROCESS_SAMPLES": [ 
        {
          "NUMBER": 1, 
          "LENGTH": 31.5, 
          "HEIGHT": 30.9, 
          "WIDTH": 80.91,  
          "BREADTH": 54 
        },
        {
          "NUMBER": 2, 
          "LENGTH": 41.5, 
          "HEIGHT": 40.9, 
          "WIDTH": 60.91,  
          "BREADTH": 74 
        }
      ]
    }
  ]
}
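
Parsed with jsonlite, each file becomes a nested structure: MEASUREMENT is a data frame whose PROCESS_SAMPLES column is a list of data frames, and that is what needs to be flattened. A minimal sketch (the file name is a placeholder):

library(jsonlite)

dat <- fromJSON("~/data/sample1.json")   # hypothetical file name

dat$ID                                   # scalar, e.g. 9
dat$MEASUREMENT                          # data frame: MEAS_ID, PDATETIME, STATUS, PROCESS_SAMPLES
dat$MEASUREMENT$PROCESS_SAMPLES[[1]]     # data frame: NUMBER, LENGTH, HEIGHT, WIDTH, BREADTH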

I have close to 100 files, each roughly 15 MB. I am trying to convert them into a data frame in R and do some analysis.

The goal is to predict STATUS from the time series data in PROCESS_SAMPLES.

This is how I am currently converting the JSON files into an R data frame to get a time series data set:

set.seed(12345)
path <- "~/data"

packages <- c("jsonlite", "dplyr", "purrr", "tidyjson", "tidyr", "data.table")
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)

files <- list.files(path, pattern = "\\.json$", full.names = TRUE)

ls <- list()

for (j in seq_along(files))
{
  cat(j, "\n")

  # Read the file contents; tidyjson expects a JSON string, not a file name
  json <- paste(readLines(files[j], warn = FALSE), collapse = "")

  df <- json %>%
    spread_values(ID = jnumber("ID")) %>%
    enter_object("MEASUREMENT") %>% gather_array %>%
    spread_values(MEAS_ID   = jnumber("MEAS_ID"),
                  STATUS    = jstring("STATUS"),
                  PDATETIME = jstring("PDATETIME")) %>%
    enter_object("PROCESS_SAMPLES") %>% gather_array %>%
    spread_values(NUMBER  = jnumber("NUMBER"),
                  LENGTH  = jnumber("LENGTH"),
                  HEIGHT  = jnumber("HEIGHT"),
                  WIDTH   = jnumber("WIDTH"),
                  BREADTH = jnumber("BREADTH")) %>%
    select(ID, MEAS_ID, STATUS, PDATETIME, NUMBER, LENGTH, HEIGHT, WIDTH, BREADTH)

  ls[[j]] <- unique(df)
}

# rbind.fill is from plyr, which was never loaded; rbindlist does the same job
df_samples <- data.table::rbindlist(ls, fill = TRUE)

This code takes a long time to process the ~100 files. How can I speed it up? And how should I go about predicting STATUS?

Could someone point me in the right direction?

1 Answer

Here is a self-contained example: the sample data is inlined as JSON strings, each one is parsed with jsonlite, and the pieces are combined with plyr::rbind.fill:
content <- list( ' {   "ID": 9,   "BCOUNT": 800,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-14 16:00:59",   "STATUS": "Pass",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 31.5,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 2,   "LENGTH": 41.5,   "HEIGHT": 40.9,   "WIDTH": 60.91,    "BREADTH": 74   }   ]   }   ]   } '
                 ,
                 ' {   "ID": 10,   "BCOUNT": 900,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-15 16:00:59",   "STATUS": "Pass",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 31.5,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 1,   "LENGTH": 33.5,   "HEIGHT": 34.9,   "WIDTH": 92.91,    "BREADTH": 12   },   {   "NUMBER": 2,   "LENGTH": 41.5,   "HEIGHT": 40.9,   "WIDTH": 60.91,    "BREADTH": 74   }   ]   }   ]   } '
                 ,
                 ' {   "ID": 11,   "BCOUNT": 900,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-16 16:00:59",   "STATUS": "Fail",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 100,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 1,   "LENGTH": 120,   "HEIGHT": 34.9,   "WIDTH": 92.91,    "BREADTH": 12   },   {   "NUMBER": 2,   "LENGTH": 130,   "HEIGHT": 40.9,   "WIDTH": 60.91,    "BREADTH": 74   }   ]   }   ]   } ' 
                 ,
                 ' {   "ID": 12,   "BCOUNT": 900,   "MEASUREMENT": [   {   "MEAS_ID": 1,   "PDATETIME": "2017-01-17 16:00:59",   "STATUS": "Fail",   "PROCESS_SAMPLES": [   {   "NUMBER": 1,   "LENGTH": 220,   "HEIGHT": 30.9,   "WIDTH": 80.91,    "BREADTH": 54   },   {   "NUMBER": 1,   "LENGTH": 200,   "HEIGHT": 34.9,   "WIDTH": 92.91,    "BREADTH": 12   }   ]   }   ]   } ' 
)
result <- do.call(plyr::rbind.fill, lapply(content, function(js) {
  dat <- jsonlite::fromJSON(js)
  # cbind recycles the scalar ID/BCOUNT and the one-row measurement fields
  # across the rows of the PROCESS_SAMPLES data frame
  do.call(cbind,
          list(
            ID     = dat$ID,
            BCOUNT = dat$BCOUNT,
            {
              df <- dat$MEASUREMENT[, c("MEAS_ID", "PDATETIME", "STATUS")]
              rownames(df) <- NULL
              df
            },
            {
              # assumes one MEASUREMENT per file, as in the samples above
              df <- dat$MEASUREMENT[["PROCESS_SAMPLES"]][[1]]
              rownames(df) <- NULL
              df
            }
          ))
}))
result$PDATETIME <- as.POSIXct(result$PDATETIME)
result$STATUS <- as.integer(result$STATUS == "Fail")  # 0 = Pass, 1 = Fail
fit <- glm(STATUS ~ ., data = result, family = binomial) # Don't actually use this! You have to experiment with different models to find out what works.
predict(fit, result[c(1, 10), ], type = "response") # Don't actually do this either -- you have to create training, tuning and testing sets.
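
As for the speed question: most of the time goes into the per-file tidyjson pipeline. A sketch of a plain jsonlite + data.table version that reads the files directly (path and structure as in the question; adjust if a file can contain more than one MEASUREMENT):

library(jsonlite)
library(data.table)

files <- list.files("~/data", pattern = "\\.json$", full.names = TRUE)

# Flatten one file: repeat the file- and measurement-level fields
# across the rows of each PROCESS_SAMPLES data frame
read_one <- function(f) {
  dat  <- fromJSON(f)
  meas <- dat$MEASUREMENT
  rbindlist(lapply(seq_len(nrow(meas)), function(i) {
    data.table(ID        = dat$ID,
               MEAS_ID   = meas$MEAS_ID[i],
               STATUS    = meas$STATUS[i],
               PDATETIME = meas$PDATETIME[i],
               meas$PROCESS_SAMPLES[[i]])
  }))
}

df_samples <- rbindlist(lapply(files, read_one), fill = TRUE)

If that is still too slow, swap lapply for parallel::mclapply(files, read_one) on Unix-alikes.

For the prediction side, a reasonable baseline is a logistic regression fit on a training split and evaluated on a held-out test split. This is only a sketch of the workflow, not a recommendation of the final model:

set.seed(12345)
idx   <- sample(nrow(df_samples), floor(0.7 * nrow(df_samples)))
train <- df_samples[idx]
test  <- df_samples[-idx]

fit  <- glm(I(STATUS == "Fail") ~ LENGTH + HEIGHT + WIDTH + BREADTH,
            data = train, family = binomial)
pred <- predict(fit, test, type = "response")   # predicted probability of "Fail"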