Web scraping search results in R

67 views

I'm trying to create a function that will extract search results from this webpage but am getting stuck because I want the function to automate the search for multiple foundations and I'm not sure how to revise the function so that it either loops over all the rows in 'mydf' or uses one of the apply() functions to get the same looping effect in order to scrape results for each row in 'mydf'.

When I run the function below on a single row of 'mydf' the results are correct, but when I don't specify a particular row, I get the following error: Error in parse_url(url) : length(url) == 1 is not TRUE

Sample data.frame:

# construct sample data frame with Name, City, State:
# Build the lookup table of foundations to search for (Name, City, State):
name <- c("johnny carson foundation", "melinda gates foundation", "macarthur foundation")
city <- c("", "", "")
state <- c("", "", "")
mydf <- data.frame(name, city, state)

# The IRS search endpoint expects '+' between words, so swap the spaces out.
# gsub() with fixed = TRUE does a literal replacement (base R, no stringr needed).
mydf$name <- gsub(" ", "+", mydf$name, fixed = TRUE)
mydf$city <- gsub(" ", "+", mydf$city, fixed = TRUE)

And my current attempt at the function:

get_data <- function(df) {
        # Scrape the IRS Exempt Organizations search page for every row of `df`
        # (columns name / city / state, already '+'-formatted) and return `df`
        # with result_org / result_ein / result_city / result_state /
        # result_country columns appended.
        #
        # Fixes vs. the original draft:
        #  * uses the `df` argument instead of the global `mydf`
        #  * fetches ONE url per row -- GET()/parse_url() require
        #    length(url) == 1, which caused the reported error when the whole
        #    column was pasted into a single vector of urls
        #  * returns the augmented data frame; R copies arguments on modify,
        #    so the caller must capture it: mydf <- get_data(mydf)

        # root components of url:
        root <- "http://apps.irs.gov/app/eos/allSearch.do?ein1=&names="
        root2 <- "&resultsPerPage=25&indexOfFirstRow=0&dispatchMethod=searchAll&city="
        root3 <- "&country=US&postDateFrom=&postDateTo=&exemptTypeCode=al&deductibility=all&sortColumn=orgName&isDescending=false&submitName=Search"

        # positional xpath to the result fields; span 1-4 = EIN, city,
        # state, country (per the comments on the original code)
        span_xpath <- "/html/body/div[3]/div[13]/div/div/div[1]/div[2]/div/ul/li/div[1]/span[%d]"

        # pre-create the result columns so an empty input still returns a
        # well-formed frame and "no result" rows stay NA
        df$result_org <- df$result_ein <- df$result_city <-
                df$result_state <- df$result_country <- NA_character_

        # helper: a search can return several organizations but each input
        # row gets one result -- keep the top hit, NA when nothing matched
        first_or_na <- function(nodes) {
                if (length(nodes) == 0) NA_character_ else nodes[1]
        }

        for (i in seq_len(nrow(df))) {
                # construct url from the roots and this row's search strings:
                url <- paste0(root, df$name[i], root2, df$city[i],
                              "&state=", df$state[i], root3)

                gt <- GET(url)
                parsedHtml <- htmlParse(content(gt), asText = TRUE)

                # the results body carries a banner when nothing was found
                banner <- xpathSApply(parsedHtml,
                                      "//div[@class='row results-body-row']",
                                      xmlValue, trim = TRUE)
                if (length(banner) > 0 &&
                    isTRUE(str_starts(banner[1],
                                      "Your search did not return any results"))) {
                        next  # leave this row's result_* columns as NA
                }

                df$result_org[i] <- first_or_na(
                        xpathSApply(parsedHtml, "//h3[@class='result-orgname']",
                                    xmlValue, trim = TRUE))                      # Name
                df$result_ein[i] <- first_or_na(
                        xpathSApply(parsedHtml, sprintf(span_xpath, 1), xmlValue)) # EIN
                df$result_city[i] <- first_or_na(
                        xpathSApply(parsedHtml, sprintf(span_xpath, 2), xmlValue)) # City
                df$result_state[i] <- first_or_na(
                        xpathSApply(parsedHtml, sprintf(span_xpath, 3), xmlValue,
                                    trim = TRUE))                                # State
                df$result_country[i] <- first_or_na(
                        xpathSApply(parsedHtml, sprintf(span_xpath, 4), xmlValue)) # Country
        }

        df
}


# NOTE(review): get_data() cannot modify `mydf` in place -- R arguments are
# copied on modification -- and its return value is discarded here, so the
# global `mydf` is never updated. Capture the result instead, e.g.
# `mydf <- get_data(mydf)` (after making the function return the frame).
get_data(mydf)


# prints the original, unmodified data frame (see note above the call)
mydf

Thanks very much in advance & apologies for my messy and inelegant code!

1

There is 1 answer below.

1
On
  1. You need to indicate the state as All+States.
  2. You need to iterate over the URLs, e.g. with purrr::map_dfr(), because each request must be made one URL at a time.

library(rvest)
# construct sample data frame with Name, City, State:
# construct sample data frame with Name, City, State -- the state field must
# be "All+States" for the IRS endpoint to search nationwide:
name <- c("johnny carson foundation", "melinda gates foundation", "macarthur foundation")
city <- c("", "", "")
state <- c("All+States", "All+States", "All+States")
mydf <- data.frame(name, city, state)

# Replace spaces between words with '+' for consistent url formatting.
# gsub(fixed = TRUE) is base R, so this runs without attaching stringr
# (only rvest is loaded above; str_replace_all() would error otherwise).
mydf$name <- gsub(" ", "+", mydf$name, fixed = TRUE)
mydf$city <- gsub(" ", "+", mydf$city, fixed = TRUE)

# root components of the IRS EOS search url:
root <- "http://apps.irs.gov/app/eos/allSearch.do?ein1=&names="
root2 <- "&resultsPerPage=25&indexOfFirstRow=0&dispatchMethod=searchAll&city="
root3 <- "&country=US&postDateFrom=&postDateTo=&exemptTypeCode=al&deductibility=all&sortColumn=orgName&isDescending=false&submitName=Search"


# one fully-formed search url per row of mydf (vectorized paste0):
url <- paste0(root, mydf$name, root2, mydf$city, "&state=", mydf$state, root3)

# Fetch one results page and return its hits as a data.frame with columns
# name / ein / city / state / country. Each <li> under ul.views-row is one
# organization; spans 1-4 of its search-excerpt div hold EIN, city, state,
# country in that order.
scrape_page <- function(page_url) {
  hits <- read_html(page_url) %>% html_nodes("ul.views-row > li")

  # pull the text of the idx-th excerpt span for every hit on the page
  excerpt <- function(idx) {
    hits %>%
      html_node(xpath = sprintf("./div[@class='search-excerpt']/span[%d]", idx)) %>%
      html_text()
  }

  data.frame(
    name = hits %>% html_node("h3") %>% html_text() %>% stringi::stri_trans_totitle(),
    ein = excerpt(1),
    city = excerpt(2),
    state = excerpt(3),
    country = excerpt(4)
  )
}

# scrape every search url and row-bind the per-page frames into one
data <- purrr::map_dfr(url, scrape_page)