Divide a column into multiple columns using regular expressions in R

50 Views Asked by At

I scraped a website and now need to clean the "service" column, which is a string.

In the service column of the fl_data dataset, you can see that there are multiple services, such as Testing Services and Prevention Services. Each service name sits between a \n and a :, but not every row has all of the services.

I need to divide the string into columns: one column per type of service, containing that service's elements.

This is my dataset:

# Search-results URL without the page number; the page index is appended below.
url_base <- "https://npin.cdc.gov/search?=type%3Aorganization&page="

# Scrape each requested results page (only page 0 here) and stack the per-page
# data frames into a single data frame.
raw_data <- map_df(0:0, function(i) {

  cat(".")  # progress marker: one dot per page fetched

  # Build the page URL with paste0() rather than sprintf(): the URL-encoded
  # colon "%3A" is a valid sprintf() conversion ("%A", hex float, width 3), so
  # sprintf(url_base, i) would replace it with the formatted value of `i` and
  # never append the page number at all.
  pg <- read_html(paste0(url_base, i))

  # One column per CSS selector. data.frame() needs the columns to have
  # compatible lengths, so this presumably relies on every organization on the
  # page exposing each field exactly once -- TODO confirm against the site.
  data.frame(org_name = html_text2(html_nodes(pg, ".block-field-blocknodeorganizationtitle")),
             street = html_text(html_nodes(pg, ".address-line1")),
             city = html_text(html_nodes(pg, ".locality")),
             state = html_text(html_nodes(pg, ".administrative-area")),
             zip = html_text(html_nodes(pg, ".postal-code")),
             service = html_text2(html_nodes(pg, ".services-fieldset")),
             stringsAsFactors = FALSE
             )

})

# Keep only the rows whose state is "FL", then strip the first occurrence of
# the boilerplate heading text from each `service` string.
fl_data <- mutate(
  filter(raw_data, state == "FL"),
  service = str_remove(
    service,
    "Services\nPlease contact organization for eligibility requirements"
  )
)
1

There is 1 best solution below

0
John Granger On BEST ANSWER

You can use for loops to extract each service and its corresponding items. In the result, the items of a service are separated by commas (,).

library(tidyverse)
library(rvest)

# Scrape one results page and build a long table with one row per
# (organization, service category), the category's items joined by commas.
url <- "https://npin.cdc.gov/search?=type%3Aorganization&page=0"
content <- read_html(url)

# One services fieldset and one title node per organization; the two vectors
# are paired by position below.
# NOTE(review): assumes both selectors match the same organizations in the
# same order -- confirm if the page layout changes.
services <- content %>% html_nodes(".services-fieldset")
org_name <- content %>%
  html_nodes(".block-field-blocknodeorganizationtitle") %>%
  html_text2()

# Zero-row template: fixes the column names and types (all character) so the
# result is well-formed even when nothing is scraped.
empty_result <- data.frame(
  org_name = character(),
  service = character(),
  item = character()
)

# Collect one single-row data frame per service category and bind them once at
# the end, instead of growing `result` row by row inside the loop.
rows <- list()

# seq_along() yields an empty sequence for empty input, whereas 1:length(x)
# would count backwards (1, 0) and index nodes that do not exist.
for (i in seq_along(services)) {
  groups <- services[[i]] %>% html_nodes(".field__items")

  # The first ".field__items" node is skipped, as in the original loop that
  # started at 2 (presumably it is not a service category -- TODO confirm).
  # seq_along(groups)[-1] is empty when there are fewer than two groups,
  # unlike 2:length(groups).
  for (j in seq_along(groups)[-1]) {
    label <- groups[[j]] %>%
      html_nodes(".field-label") %>%
      html_text() %>%
      gsub(":", "", ., fixed = TRUE)  # drop the colon(s) from the label
    items <- groups[[j]] %>% html_nodes(".field__item") %>% html_text()

    rows[[length(rows) + 1L]] <- data.frame(
      org_name = org_name[i],
      service = label,
      item = paste0(items, collapse = ",")
    )
  }
}

result <- do.call(rbind, c(list(empty_result), rows))

result |> tibble()
#> # A tibble: 32 × 3
#>    org_name                          service                     item           
#>    <chr>                             <chr>                       <chr>          
#>  1 Eastport Health Care Incorporated Testing Services            Gonorrhea Test…
#>  2 Eastport Health Care Incorporated Care and Treatment Services Family Plannin…
#>  3 Alamosa County Public Health      Testing Services            TB Testing     
#>  4 Alamosa County Public Health      Care and Treatment Services Mpox Vaccine,H…
#>  5 Alamo Navajo Health Center        Testing Services            TB Testing,Gon…
#>  6 Alamo Navajo Health Center        Prevention Services         TB Prevention/…
#>  7 Alamo Navajo Health Center        Care and Treatment Services Family Plannin…
#>  8 AIDS Resource Group               Testing Services            Hepatitis C Te…
#>  9 AIDS Resource Group               Prevention Services         STD/STI Preven…
#> 10 AIDS Resource Group               Support Services            Support Groups…
#> # ℹ 22 more rows

Created on 2024-03-14 with reprex v2.1.0