Download a file with PhantomJS in R

65 Views Asked by At

I want to download a file using the webdriver package (PhantomJS); however, whenever I click/enter the object, nothing happens. I intend to click on 'Download dos dados' and then click on 'Baixar .csv' to download the file.

library(webdriver)

# Page to open in the PhantomJS session
url <-'https://idesevis.dee.rs.gov.br/#tab-9023-6'

#webdriver::install_phantomjs() # If it is not installed
# Start PhantomJS; the port of the returned object is used to connect below
pjs <- run_phantomjs()

# Open a session on that port, navigate to the page and print the URL the
# session reports
ses <- Session$new(port = pjs$port)
ses$go(url)
ses$getUrl()

### Click on 'Download dos dados'
# The element is located by an absolute XPath
search <- ses$findElement(xpath='/html/body/div[2]/nav/div/ul/li[6]/a')
# NOTE(review): this passes the text 'html' followed by the Enter key to the
# element instead of clicking it -- presumably carried over from a search-box
# example; confirm this is the intended interaction
search$sendKeys('html',key$enter)

### Finds and clicks 'Baixar .csv'
# Runs immediately after the previous step, with no wait in between
search <- ses$findElement(xpath='/html/body/div[2]/div[3]/div/div[7]/div/div[1]/form/a')
# NOTE(review): same 'html' + Enter key sequence as above -- confirm
search$sendKeys('html',key$enter)
# Screenshot of the session, used to inspect the page after the steps above
ses$takeScreenshot()

Apparently, I'm not selecting the 'Baixar .csv' object, even when using the full XPath. I'm using webdriver because I can't use RSelenium on my notebook, and static web scraping is not suitable here because the session code changes every time the main URL is accessed.

1

There is 1 best solution below

0
margusl On BEST ANSWER

CSV download, at least in this particular case with a Shiny-driven page, also works outside of the browser session, so instead of triggering the download in PhantomJS, you could extract the href from the CSV link and pass it to download.file() / httr(2) / curl, for example.

library(webdriver)
library(httr2)
library(stringr)
library(readr)

# Page to open in the PhantomJS session
url_ <-'https://idesevis.dee.rs.gov.br/#tab-9023-6'

#webdriver::install_phantomjs() # If it is not installed
# Start PhantomJS; the port of the returned object is used to connect below
pjs <- run_phantomjs()

# Open a session on that port, navigate to the page and print the URL the
# session reports
ses <- Session$new(port = pjs$port)
ses$go(url_)
ses$getUrl()
#> [1] "https://idesevis.dee.rs.gov.br/#tab-9023-6"

### Click on 'Download dos dados'
# The tab link is selected by its data-value attribute rather than by an
# absolute XPath
ses$findElement("a[data-value='Download dos Dados']")$sendKeys('html',key$enter)
ses$getUrl()
#> [1] "https://idesevis.dee.rs.gov.br/#tab-9023-6"

### Wait until downloadData element is available and href is set;
### defaults: checkInterval = 100, timeout = 3000;
### find a#downloadData and get download link
ses$waitFor('document.getElementById("downloadData").getAttribute("href")')
#> [1] TRUE

# The outer parentheses print the assigned value (shown below)
(csv_url <- ses$findElement("a#downloadData")$getAttribute("href"))
#> [1] "https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w="

### Test url and extract filename from response headers
# A HEAD request is used here, so the response carries headers but no body
(resp_head <- request(csv_url) |> req_method("HEAD") |> req_perform())
#> <httr2_response>
#> HEAD
#> https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w=
#> Status: 200 OK
#> Content-Type: text/csv
#> Body: Empty

# Take the content-disposition header, keep the piece after the first "=",
# and strip the double quotes around it
filename <- resp_header(resp_head, header = "content-disposition") |> 
  str_split_i("=", 2) |>
  str_remove_all('\\"')
filename
#> [1] "base_idese.csv"

### Fetch CSV
# The response body is written to the file named by `filename`
request(csv_url) |> req_perform(path = filename)
#> <httr2_response>
#> GET
#> https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w=
#> Status: 200 OK
#> Content-Type: text/csv
#> Body: On disk 'body'
# Show the first three file-info columns (path, type, size) for the download
fs::file_info(filename)[,1:3]
#> # A tibble: 1 × 3
#>   path           type         size
#>   <fs::path>     <fct> <fs::bytes>
#> 1 base_idese.csv file        11.8M

Downloaded dataset:

# Parse the downloaded file, declaring its ISO-8859-1 encoding so that the
# text columns are read with the correct encoding
read_csv(
  file = "base_idese.csv",
  locale = locale(encoding = "ISO-8859-1")
)
#> New names:
#> • `` -> `...1`
#> Rows: 114720 Columns: 7
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (4): TIPO_UNID, COD, NOME, CATEGORIA
#> dbl (3): ...1, ANO, VALOR
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 114,720 × 7
#>     ...1 TIPO_UNID  COD     NOME            CATEGORIA                  ANO VALOR
#>    <dbl> <chr>      <chr>   <chr>           <chr>                    <dbl> <dbl>
#>  1     1 Municípios 4300059 Água Santa      "Bloco Renda\\Apropriaç…  2013 0.919
#>  2     2 Municípios 4304804 Carlos Barbosa  "Bloco Renda\\Apropriaç…  2013 0.929
#>  3     3 Municípios 4300901 Aratiba         "Bloco Renda\\Apropriaç…  2013 0.746
#>  4     4 Municípios 4310462 Ipiranga do Sul "Bloco Renda\\Apropriaç…  2013 0.924
#>  5     5 Municípios 4322806 Veranópolis     "Bloco Renda\\Apropriaç…  2013 0.835
#>  6     6 Municípios 4321634 Três Arroios    "Bloco Renda\\Apropriaç…  2013 1    
#>  7     7 Municípios 4313334 Nova Ramada     "Bloco Renda\\Apropriaç…  2013 0.741
#>  8     8 Municípios 4304903 Casca           "Bloco Renda\\Apropriaç…  2013 0.793
#>  9     9 Municípios 4314001 Paraí           "Bloco Renda\\Apropriaç…  2013 0.849
#> 10    10 Municípios 4322350 União da Serra  "Bloco Renda\\Apropriaç…  2013 0.843
#> # ℹ 114,710 more rows

Created on 2023-12-27 with reprex v2.0.2