coerce colon operator or dash from character to vector inside cell in r

169 Views Asked by At

I have some df cells that contain lists of years. Most are years typed out and separated by commas: "1990, 1992, 2001", but some use colon operators: "1990:2000". Even some use a combination: "1990, 1995:1999, 2001"

Is there a way in R to convert these different kinds of strings into consistent comma separated values still within one cell in r?

test2 <- data.frame(series = c(1,2,3),
                    years = c("1990, 1992, 2001", "1990:2000","1990, 1995:1999, 2001"))



# Desired output: 
  series             years
1      1             1990, 1992, 2001
2      2             1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000
3      3             1990, 1995, 1996, 1997, 1998, 1999, 2001

4

There are 4 best solutions below

4
On
test3 <- data.frame(series = c(1,2,3))
test3$years <- list(c(1990, 1992, 2001),
                    c(1990:2000),
                    c(1990, 1995:1999, 2001))
2
On

I think that using strings this way is a bad habit, but you can use eval(parse(text=...))

vector = paste("c(", test2$years, ")", sep="")
#Adding c() notation to eval(parse() understand it as a vector
n = length(vector)

newlist = replicate(n, numeric()) #Creating a empty list
names(newlist) = paste("series", 1:3)

for(i in 1:n){
   newlist[[i]] = eval(parse(text=vector[i]))}
0
On

In case you prefer not to trust to eval(parse()), here is a base R method without it:

sapply(strsplit(test2$years, ", "), function(x) {   
  paste(unlist(lapply(as.list(x), function(y) {    
    if(grepl(":", y)) {
      z <- as.numeric(strsplit(y, ":")[[1]])
      seq(z[1], z[2])
    } else {
        as.numeric(y)
    }})), collapse = ", ")
  })
#> [1] "1990, 1992, 2001"                                                
#> [2] "1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000"
#> [3] "1990, 1995, 1996, 1997, 1998, 1999, 2001"   
0
On

We could use gsubfn

library(gsubfn)
test2$years <- gsubfn("(\\d+):(\\d+)", function(x, y) 
     toString(seq(as.numeric(x), as.numeric(y), by = 1)), test2$years)

-output

test2$years
#[1] "1990, 1992, 2001"                                                
#[2] "1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000"
#[3] "1990, 1995, 1996, 1997, 1998, 1999, 2001"  

Or an option with tidyverse

library(dplyr)
library(tidyr)
library(purrr)
test2 %>%
    separate_rows(years, sep = ",\\s*") %>% 
    separate(years, into = c('years1', 'years2'), convert = TRUE, extra = 'drop') %>%
    transmute(series, years = map2_chr(years1, years2, ~ toString(if(!is.na(.y)) .x:.y
           else .x))) %>% 
    group_by(series) %>% 
    summarise(years = toString(years), .groups = 'drop')

-output

# A tibble: 3 x 2
#  series years                                                           
#   <dbl> <chr>                                                           
#1      1 1990, 1992, 2001                                                
#2      2 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000
#3      3 1990, 1995, 1996, 1997, 1998, 1999, 2001            

Or an option with glue to construct an expression and then evaluate the parsed expression

library(stringr)
test2 %>% 
    mutate(years = glue::glue("c({years})")) %>%
    pull(years) %>% 
    toString %>% 
    str_c("list(", ., ")") %>%
    rlang::parse_expr(.) %>%
    eval
#[[1]]
#[1] 1990 1992 2001

#[[2]]
#[1] 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000

#[[3]]
#[1] 1990 1995 1996 1997 1998 1999 2001