I'm trying to create a function that will extract search results from this webpage but am getting stuck because I want the function to automate the search for multiple foundations and I'm not sure how to revise the function so that it either loops over all the rows in 'mydf' or uses one of the apply() functions to get the same looping effect in order to scrape results for each row in 'mydf'.
When I run the function below on a single row of 'mydf' the results are correct, but when I don't specify a particular row, I get the following error: Error in parse_url(url) : length(url) == 1 is not TRUE
Sample data.frame:
# construct sample data frame with Name, City, State:
name <- c("johnny carson foundation", "melinda gates foundation", "macarthur foundation")
city <- c("", "", "")
state <- c("", "", "")
mydf <- data.frame(name, city, state)
#replace spaces between words with '+' for consistent formatting of 'url' object:
mydf$name <- str_replace_all(mydf$name, " ", "+")
mydf$city <- str_replace_all(mydf$city, " ", "+")
And my current attempt at the function:
get_data <- function(df) {
# root components of url:
root <- "http://apps.irs.gov/app/eos/allSearch.do?ein1=&names="
root2 <- "&resultsPerPage=25&indexOfFirstRow=0&dispatchMethod=searchAll&city="
root3 <- "&country=US&postDateFrom=&postDateTo=&exemptTypeCode=al&deductibility=all&sortColumn=orgName&isDescending=false&submitName=Search"
# construct url by adding roots and search strings from 'mydf':
url <- paste(root, mydf$name, root2, mydf$city, '&state=', mydf$state, root3, sep = "")
gt <- GET(url)
content2 <- content(gt)
parsedHtml <- htmlParse(content2, asText = TRUE)
# Scraped results to be populated into 'mydf':
mydf$result_org <- ifelse(str_starts(xpathSApply(parsedHtml, "//div[@class='row results-body-row']", xmlValue, trim = TRUE),
"Your search did not return any results"), NA,
xpathSApply(parsedHtml, "//h3[@class='result-orgname']", xmlValue, trim = TRUE)) # Name
mydf$result_ein <- ifelse(str_starts(xpathSApply(parsedHtml, "//div[@class='row results-body-row']", xmlValue, trim = TRUE),
"Your search did not return any results"), NA,
xpathSApply(parsedHtml, "/html/body/div[3]/div[13]/div/div/div[1]/div[2]/div/ul/li/div[1]/span[1]", xmlValue)) # EIN
mydf$result_city <- ifelse(str_starts(xpathSApply(parsedHtml, "//div[@class='row results-body-row']", xmlValue, trim = TRUE),
"Your search did not return any results"), NA,
xpathSApply(parsedHtml, "/html/body/div[3]/div[13]/div/div/div[1]/div[2]/div/ul/li/div[1]/span[2]", xmlValue)) # City
mydf$result_state <- ifelse(str_starts(xpathSApply(parsedHtml, "//div[@class='row results-body-row']", xmlValue, trim = TRUE),
"Your search did not return any results"), NA,
xpathSApply(parsedHtml, "/html/body/div[3]/div[13]/div/div/div[1]/div[2]/div/ul/li/div[1]/span[3]", xmlValue, trim = TRUE)) # State
mydf$result_country <- ifelse(str_starts(xpathSApply(parsedHtml, "//div[@class='row results-body-row']", xmlValue, trim = TRUE),
"Your search did not return any results"), NA,
xpathSApply(parsedHtml, "/html/body/div[3]/div[13]/div/div/div[1]/div[2]/div/ul/li/div[1]/span[4]", xmlValue)) # Country
}
get_data(mydf)
mydf
Thanks very much in advance & apologies for my messy and inelegant code!
All+States
.purrr::map_dfr()