Adding values from a website to a table when scraping HTML


This is the data that I need:

https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0

I already imported the table into R:

library(tidyverse)
library(rvest)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")

tbls <- html_nodes(webpage, "table")
tbls_ls <- webpage %>%
  html_nodes("table") %>%
  .[5] %>%
  html_table(fill = TRUE)

data <- as_tibble(tbls_ls[[1]])

Yet, I need to add one more thing to the table. For some meteorites, there are oxygen isotope values available. One can see this when clicking on the name of the meteorite under the section "plots". When clicking on the plot, we get redirected to a page where we have the three isotope values. What I want to do is to add three columns to my table, containing the respective isotope values for each meteorite. I tried writing code for each "plot" section separately, but I feel like there could be a much more elegant solution for this.

1 Answer

Accepted answer:

You could grab the table without isotopes, then mimic the POST request the page makes when you request the oxygen isotopes, and finally left-join the two tables on the Name column. You will get more rows back than were in the left (no-isotopes) table because there are multiple Change values per meteorite. This matches the method of viewing isotopes you describe, where the plots show comma-separated lists of values per isotope rather than one value per row.
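If you would rather have one row per meteorite, a minimal sketch (run after the isotopes table is scraped below; it just collapses the duplicate rows into the same comma-separated presentation the site's plots use):

library(dplyr)

# Collapse multiple rows per meteorite into one, joining repeated values
# with commas. across(everything()) skips the grouping column Name.
isotopes <- isotopes %>%
  group_by(Name) %>%
  summarise(across(everything(), ~ paste(unique(.x), collapse = ", ")), .groups = "drop")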

I use a more selective CSS selector to target the specific table of interest directly, rather than indexing into a list of nodes.

I use write_excel_csv to preserve the character encoding of headers on write out (an idea I got from @stefan).

You can remove columns you don't want in the output from joint_table before writing out (with subset(), dplyr::select(), etc.), as sketched below.
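For example, a minimal sketch (the column names passed to any_of() are placeholders, not columns I have verified in this table; substitute the ones you actually want to drop):

library(dplyr)

# any_of() silently skips names that are absent, so a placeholder name
# that does not exist in joint_table will not cause an error.
joint_table <- joint_table %>%
  select(-any_of(c("Abbrev", "Antarctic")))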



library(dplyr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")

no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

data <- list(
  'sfor' = "names",
  'stype' = "contains",
  'country' = "All",
  'categ' = "Ungrouped achondrites",
  'page' = "0",
  'map' = "ge",
  'srt' = "name",
  'lrec' = "200",
  'pnt' = "Oxygen isotopes",
  'mblist' = "All",
  'snew' = "0",
  'sea' = "*"
)


r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)

isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)


joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

write_excel_csv(x = joint_table, file = "joint.csv", col_names = TRUE, na = "")
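As a quick sanity check on the join, you can see which meteorites picked up extra rows; counts greater than one correspond to the multiple isotope entries mentioned above:

# Rows per meteorite after the join, most duplicated first.
dplyr::count(joint_table, Name, sort = TRUE)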

Example output: [screenshot of the joined table]


Edit:

Adding in the additional information that comes from the other URLs, as requested in your comments. I had to determine dynamically which table to pick up on each page, and to handle cases where no table is present.

library(tidyverse)
library(httr)
library(rvest)
library(readr)
library(furrr)

get_table <- function(url) {
  page <- read_html(url)
  # Text of the left-hand label column of the main table.
  test_list <- page %>%
    html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
    html_text()
  # The attribute table sits in the row after the "Data from:" label.
  index <- match(TRUE, stringr::str_detect(test_list, "Data from:")) + 1
  table <- page %>%
    html_node(paste0("#maintable tr:nth-of-type(", index, ") table")) %>%
    html_table() %>%
    as_tibble()
  # Transpose so the attribute labels become column names.
  temp <- set_names(data.frame(t(table[, -1]), row.names = c()), t(table[, 1])) # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
  return(temp)
}
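# Illustrative usage (not part of the original answer): once lookups is
# built below, get_table(lookups$Link[1]) should return a one-row data
# frame of that meteorite's attribute table.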


start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)

no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

data <- list(
  "sfor" = "names",
  "stype" = "contains",
  "country" = "All",
  "categ" = "Ungrouped achondrites",
  "page" = "0",
  "map" = "ge",
  "srt" = "name",
  "lrec" = "200",
  "pnt" = "Oxygen isotopes",
  "mblist" = "All",
  "snew" = "0",
  "sea" = "*"
)

r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)

isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

lookups <- webpage %>%
  html_node("#maintable") %>%
  html_nodes("td:nth-of-type(1) a") %>%
  map_df(~ c(html_text(.), html_attr(., "href")) %>%
    set_names("Name", "Link")) %>%
  mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))

error_df <- tibble(
  `State/Prov/County:` = NA_character_,
  `Origin or pseudonym:` = NA_character_,
  `Date:` = NA_character_,
  `Latitude:` = NA_character_,
  `Longitude:` = NA_character_,
  `Mass (g):` = NA_character_,
  `Pieces:` = NA_character_,
  `Class:` = NA_character_,
  `Shock stage:` = NA_character_,
  `Fayalite (mol%):` = NA_character_,
  `Ferrosilite (mol%):` = NA_character_,
  `Wollastonite (mol%):` = NA_character_,
  `Magnetic suscept.:` = NA_character_,
  `Classifier:` = NA_character_,
  `Type spec mass (g):` = NA_character_,
  `Type spec location:` = NA_character_,
  `Main mass:` = NA_character_,
  `Finder:` = NA_character_,
  `Comments:` = NA_character_
)

no_cores <- future::availableCores() - 1

future::plan(future::multisession, workers = no_cores)

df <- furrr::future_map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))

colnames(df) <- sub(":", "", colnames(df))

df2 <- df %>%
  mutate(
    `Mass (g)` = gsub(",", "", `Mass (g)`),
    across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
  )

if (nrow(df2) == nrow(no_isotopes)) {
  additional_info <- cbind(lookups, df2)
  joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
  final_table <- dplyr::left_join(joint_table, additional_info, by = "Name", copy = FALSE)
  write_excel_csv(x = final_table, file = "joint.csv", col_names = TRUE, na = "")
}

Created on 2021-02-27 by the reprex package (v0.3.0)
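If the parallel furrr setup causes problems on your machine, a sequential fallback with the same per-URL logic (purrr instead of furrr) should work:

# Same scrape as above, run sequentially: slower, but no worker setup.
df <- purrr::map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))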


N.B.

The OP had problems with the lookups variable for some reason, so here is an alternative I wrote that worked for them:

lookups <- map_df(
  webpage %>% html_node("#maintable") %>% html_nodes("td:nth-of-type(1) a"), ~
    data.frame(
      Name = .x %>% html_text(),
      Link = paste0(base, gsub("\\s+", "%20", .x %>% html_attr("href")))
    )
) %>% as_tibble()