Improve performance API CAll in R with furrr

69 Views Asked by At

The code below performs an API request. Basically I inform two cities (origin and destination) and I must return which bus companies makes the connection between these cities. If there is not a line between these cities just ignores. Each city is represented by a 3-digit code and it forms the input parameter of the requests.

library(data.table)
library(RCurl)
library(jsonlite)
library(dplyr)
library(tidyverse)
library(furrr)

#sample of data
from_to_df <- data.frame(
  origin =  200:300,
  destination = 300:400) %>% 
    filter(origin != destination) %>% 
    mutate(parameter = paste("origem=",origin,"&destino=",destination,"&__RequestVerificationToken=Z-wXmGOb9pnQbmkfcQXmChT-6uc3YfGjftHwK4HnC9SDCaKmzIafo7AI3lChBY6YDBHdpT_X98mSHGAr_YrTNgKiepKxKraGu7p6PI7dV4g1", sep ="")
           )


download_lines <- function(from_to_df){
  
  from_to_lines <- list(origin = vector("list", length = 1000000),
                        destination = vector("list", length = 1000000),
                        company = vector("list", length = 1000000))
  
  headers = c(
    "Accept" = "application/json, text/javascript, */*; q=0.01",
    "Accept-Language" = "en-US,en;q=0.9",
    "Connection" = "keep-alive",
    "Content-Type" = "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie" = "__RequestVerificationToken_L1RyYW5zcG9ydGVDb2xldGl2bw2=tY-yKlWmbZvAJzMHmITkohPiIos5XkjDBwf1ZBfP_bYWdXJMBF2Qw3z_B-LRVo0kXjdnHqDqsbZ04Zij_PM-wAf4DWVKfnQskOhqo4ANSRc1",
    "Origin" = "http://extranet.artesp.sp.gov.br",
    "Referer" = "http://extranet.artesp.sp.gov.br/TransporteColetivo/OrigemDestino?fbclid=IwAR3_hZwajHk_iyU085S1LDTqLCOYLHIZ5K825XgPGcB4tMI0EuCJpQNrJHM",
    "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "X-Requested-With" = "XMLHttpRequest"
  )
  z <- 1
  for (j in 1:nrow(from_to_df)) {
    res <- postForm("http://extranet.artesp.sp.gov.br/TransporteColetivo/OrigemDestino/GetGrid", .opts=list(postfields = from_to_df[[j, "parameter"]], httpheader = headers, followlocation = TRUE), style = "httppost")
    if (res != "[]"){
      from_to_lines$origin[[z]] <- from_to_df[[j, "origin"]]
      from_to_lines$destination[[z]] <- from_to_df[[j, "destination"]]
      from_to_lines$company[[z]] <- jsonlite::fromJSON(res)[[2]]
      z <- z + 1
      }
    }
  return(from_to_lines) 
  }

from_to_df_lines <- download_lines(from_to_df)

The program itself works and I can list the companies. The problem is that I need to make about 415000 requests. From the tests I've performed all these requests it's going to take more than 12 hours to finish. In a previous post it was suggested to use the furr package in order to perform the different requests in parallel. This is a package that I do not yet master well. I Try to adapt the program as below (download_lines_furrr) using the future_pwalk function

download_lines_furrr <- function(from_to_df){
  
  from_to_lines <- list(origin = vector("list", length = 1000000),
                         destination = vector("list", length = 1000000),
                          company = vector("list", length = 1000000))
  
  #RCURL
  headers = c(
    "Accept" = "application/json, text/javascript, */*; q=0.01",
    "Accept-Language" = "en-US,en;q=0.9",
    "Connection" = "keep-alive",
    "Content-Type" = "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie" = "__RequestVerificationToken_L1RyYW5zcG9ydGVDb2xldGl2bw2=tY-yKlWmbZvAJzMHmITkohPiIos5XkjDBwf1ZBfP_bYWdXJMBF2Qw3z_B-LRVo0kXjdnHqDqsbZ04Zij_PM-wAf4DWVKfnQskOhqo4ANSRc1",
    "Origin" = "http://extranet.artesp.sp.gov.br",
    "Referer" = "http://extranet.artesp.sp.gov.br/TransporteColetivo/OrigemDestino?fbclid=IwAR3_hZwajHk_iyU085S1LDTqLCOYLHIZ5K825XgPGcB4tMI0EuCJpQNrJHM",
    "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "X-Requested-With" = "XMLHttpRequest"
  )
  z <- 1
  
   
    
    res <- postForm("http://extranet.artesp.sp.gov.br/TransporteColetivo/OrigemDestino/GetGrid", .opts=list(postfields = from_to_df[["parameter"]], httpheader = headers, followlocation = TRUE), style = "httppost")
    if (res != "[]"){
      from_to_lines$origin[[z]] <- from_to_df[["origin"]]
      from_to_lines$destination[[z]] <- from_to_df[["destination"]]
      from_to_lines$company[[z]] <- jsonlite::fromJSON(res)[[2]]
      z <- z + 1
      
    }
    
  return(from_to_lines) 
  
}


future::plan(future::multisession)
future_pwalk(from_to_df, download_lines_furrr)

, but this error always occurs:

Error in ...furrr_fn(...) : unused arguments (origin = 200, destination = 300, parameter = "origem=200&destino=300&__RequestVerificationToken=Z-wXmGOb9pnQbmkfcQXmChT-6uc3YfGjftHwK4HnC9SDCaKmzIafo7AI3lChBY6YDBHdpT_X98mSHGAr_YrTNgKiepKxKraGu7p6PI7dV4g1")

I would like to know how to adapt this routine with FURRR or another way to reduce the execution time of requests.

0

There are 0 best solutions below