Parsing issues when scraping

101 Views Asked by At

I'm having trouble with the code below. The function `test` fetches data from a website and works well when called on its own for any value of i from 2 to 33000. But when I use my loop to fetch all the pages, I get parsing errors and multiple identical rows in my data frame.

library(rvest)
library(chromote)
library(jsonlite)
library(dplyr)


# Fetch the record for town `i` from the SRU API through a headless Chrome tab.
#
# @param i Town index appended to the API URL.
# @return Whatever jsonlite::fromJSON() parses from the text of the loaded page.
test <- function(i) {
  b <- ChromoteSession$new()
  # Every call opens a new browser tab; close it on every exit path (including
  # errors) so that a long loop does not accumulate open sessions.
  on.exit(b$close(), add = TRUE)

  # Create the load-event promise before navigating, then block on it.
  p <- b$Page$loadEventFired(wait_ = FALSE)
  b$Page$navigate(
    paste0("https://www.ecologie.gouv.fr/sru_api/api/towns/", i),
    wait_ = FALSE
  )
  b$wait_for(p)

  # The API response is rendered as page text; pull it out of the DOM and
  # parse it as JSON.
  html <- b$Runtime$evaluate("document.documentElement.outerHTML")
  content <- read_html(html$result$value)
  data_json <- html_text(content)
  fromJSON(data_json)
}



# Fetch towns 2..n. A tryCatch() without an `error =` handler catches nothing,
# so a handler is supplied here: a failing index is reported and skipped
# instead of aborting the whole run.
n <- 100
ma_liste <- lapply(2:n, function(i) {
  tryCatch(
    test(i),
    error = function(e) {
      message("Skipping ", i, ": ", conditionMessage(e))
      NULL
    }
  )
})
# Drop the placeholders left by failed fetches.
ma_liste <- Filter(Negate(is.null), ma_liste)

ma_liste
# Stack the per-town results row-wise, then convert to a data frame.
dataframe <- do.call(rbind, ma_liste)
dataframe <- as.data.frame(dataframe)

I tried to ignore the problematic lines with tryCatch, but that doesn't fix the issue of duplicated rows (and it skips a lot of data). Can you help me with this? Thanks.

2

There are 2 best solutions below

1
On BEST ANSWER

The problem persists on my own computer. Since the connection was the problem, I made the loop retry every failed iteration using tryCatch, and it works fine for me. I conclude that my problem is caused by my proxy/firewall, or by something else independent of any code you could provide. The speed of execution remains a problem, but that matters less to me.

library(rvest)
library(chromote)
library(jsonlite)
library(dplyr)
library(progress)

# Fetch the record for town `i` from the SRU API through a headless Chrome tab.
#
# @param i Town index appended to the API URL.
# @return Whatever jsonlite::fromJSON() parses from the text of the loaded page.
test <- function(i) {
  b <- ChromoteSession$new()
  # Register the cleanup immediately: callers retry on error, so a close()
  # placed only at the end of the function would leak one session per failed
  # attempt.
  on.exit(b$close(), add = TRUE)

  # Create the load-event promise before navigating, then block on it.
  p <- b$Page$loadEventFired(wait_ = FALSE)
  b$Page$navigate(
    paste0("https://www.ecologie.gouv.fr/sru_api/api/towns/", i),
    wait_ = FALSE
  )
  b$wait_for(p)

  # The API response is rendered as page text; pull it out of the DOM and
  # parse it as JSON.
  html <- b$Runtime$evaluate("document.documentElement.outerHTML")
  content <- read_html(html$result$value)
  data_json <- html_text(content)
  fromJSON(data_json)
}

# Fetch towns 2..n, retrying each index on error, then time the whole run.
start.time <- Sys.time()
n <- 100
indices <- 2:n
# Upper bound on attempts per index, so that an index which always fails
# cannot trap the script in an endless retry loop.
max_attempts <- 10
# Preallocate one slot per index instead of growing the list with c().
ma_liste <- vector("list", length(indices))
# One tick per fetched index (2..n), so the bar can actually reach 100%.
pb <- progress_bar$new(total = length(indices))
for (k in seq_along(indices)) {
  i <- indices[k]
  pb$tick()
  fetched <- FALSE
  attempt <- 0L
  while (!fetched && attempt < max_attempts) {
    attempt <- attempt + 1L
    tryCatch({
      # `[<-` with a one-element list keeps the slot even if test() yields NULL.
      ma_liste[k] <- list(test(i))
      fetched <- TRUE  # No error, so no need to retry
    }, error = function(e) {
      message(i, ": ", conditionMessage(e))
      Sys.sleep(0.001)  # Wait a moment before retrying
    })
  }
  if (!fetched) {
    warning(
      "Giving up on ", i, " after ", max_attempts, " attempts",
      call. = FALSE
    )
  }
}
# Drop the empty slots of indices that never succeeded.
ma_liste <- Filter(Negate(is.null), ma_liste)

# Stack the per-town results row-wise, then convert to a data frame.
dataframe <- do.call(rbind, ma_liste)
dataframe <- as.data.frame(dataframe)
end.time <- Sys.time()
time.taken <- round(end.time - start.time, 2)
time.taken
4
On
library(tidyverse)
library(jsonlite)

#' Download one town record from the SRU API.
#'
#' @param index Town index appended to the API URL.
#' @return The parsed JSON, with every `NULL` element replaced by `NA`.
scraper <- function(index) {
  url <- str_c("https://www.ecologie.gouv.fr/sru_api/api/towns/", index)
  town <- fromJSON(url)
  # Swap NULL fields for NA so that no field is left empty.
  modify(town, function(field) {
    if (is.null(field)) {
      NA
    } else {
      field
    }
  })
}

# possibly() makes a failing scraper() call return NULL instead of raising;
# the per-index results are then bound row-wise into a single table.
df <- bind_rows(map(2:20, possibly(scraper, otherwise = NULL)))

# A tibble: 19 × 46
   sru_id sru_structure sru_region sru_dep sru_insee sru_commune           sru_pop_commune sru_tx_lls_obj
    <int> <chr>         <chr>      <chr>   <chr>     <chr>                 <chr>           <chr>         
 1      2 ""            ""         01      01043     Beynost               4557            25%           
 2      3 ""            ""         01      01142     Dagneux               4706            25%           
 3      4 ""            ""         01      01160     Ferney-Voltaire       9637            25%           
 4      5 ""            ""         01      01249     Miribel               9742            25%           
 5      6 ""            ""         01      01262     Montluel              7005            25%           
 6      7 ""            ""         01      01281     Ornex                 4400            25%           
 7      8 ""            ""         01      01313     Prévessin-Moëns       7991            25%           
 8      9 ""            ""         01      01322     Reyrieux              4670            25%           
 9     10 ""            ""         01      01344     Saint-Denis-lès-Bourg 5667            20            
10     11 ""            ""         01      01354     Saint-Genis-Pouilly   11892           25%           
11     12 ""            ""         01      01419     Thoiry                6094            25%           
12     13 ""            ""         01      01451     Viriat                6350            20            
13     14 ""            ""         03      03013     Avermes               3907            20%           
14     15 ""            ""         03      03023     Bellerive-sur-Allier  8501            20%           
15     16 ""            ""         03      03310     Vichy                 24383           20%           
16     17 ""            ""         03      03321     Yzeure                13230           20%           
17     18 ""            ""         04      04112     Manosque              21868           25%           
18     19 ""            ""         04      04143     Oraison               5917            25%           
19     20 ""            ""         04      04152     Pierrevert            3743            25%           
# ℹ 38 more variables: sru_nom_agglo <chr>, sru_nom_epci <chr>, sru_pfh <chr>, sru_nb_res_prin <chr>,
#   sru_nb_lls2019 <chr>, sru_tx_lls2019 <chr>, sru_nb_lls2014 <chr>, sru_tx_lls2014 <chr>,
#   sru_tx_lls2011 <chr>, sru_tx_lls2008 <chr>, sru_tx_lls2005 <chr>, sru_tx_lls2002 <chr>,
#   sru_exo <chr>, sru_prel_brut <chr>, sru_maj_brut <chr>, sru_prel_brut_tot <chr>,
#   sru_constat_car <chr>, sru_date_arrete <chr>, sru_tx_maj_prel_brut <chr>, sru_benef_locaux <chr>,
#   sru_prel_net <chr>, sru_prel_maj_nette <chr>, sru_condition <chr>, sru_condition_2_motif <int>,
#   sru_commune_prelv <lgl>, sru_commune_no_prelv <lgl>, sru_commune_no_info <lgl>, …