Replace NA in list of dfs in certain columns and under certain conditions

38 Views Asked by At

I have a list of dfs:

my_list <- list(structure(list(col1 = c("v1", "v2", "v3", "V2", "V1"), col2 = c("wood", NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, -5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, -2L)), structure(list(col1 = c("v1", "v2", "v3", "V3"), col3 = c("cup", NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, -4L)))
  
my_list

[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA> <NA>   pear
3   v3 water fork banana
4   v2  <NA> <NA>   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2 <NA>   pear
3   v3 <NA> banana
4   v3 <NA>   <NA>

I want to replace NA with "VAL" in col3 only, and only if col1 is v2 or v3.

I found solutions to replace NA in certain columns, but not in certain columns and other conditions (or only for a single df, not for a list of dfs.)

Note that col2 or col3 do not necessarily exist in all dfs.

I need a solution with lapply(list, function), ideally.

Desired output:

  [[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear 
3   v3 water fork banana
4   v2  <NA>  VAL   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   v3  VAL   <NA>
3

There are 3 best solutions below

2
jay.sf On BEST ANSWER

In such cases for loops can be much faster.

> for (s in seq_along(my_list)) {
+   x <- my_list[[s]]
+   if ('col3' %in% names(x)) {
+     x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
+     my_list[[s]] <- x
+   }
+ }
> my_list
[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear
3   v3 water fork banana
4   v2  <NA>  VAL   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   v3  VAL   <NA>

Benchmark

Runs 80% faster, which is quite significant. Demonstrated on a list with just 1,000 elements.

$ Rscript --vanilla foo.R
Unit: milliseconds
   expr       min        lq      mean    median        uq       max neval cld
  floop  18.84617  19.95898  22.17803  22.08178  24.21662  27.02679   100  a 
 lapply 100.05645 106.24458 111.66269 111.17931 116.06089 150.08886   100   b

Benchmark code

set.seed(42)
big_list <- my_list[sample(1:3, 1e3, replace=TRUE)]

microbenchmark::microbenchmark(
  floop={
    for (s in seq_along(big_list)) {
      x <- big_list[[s]]
      if ('col3' %in% names(x)) {
        x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
        big_list[[s]] <- x
      }
    }
    big_list
  },
  lapply=lapply(
    big_list,
    \(x) if ('col3' %in% names(x)) {
      transform(x, col3=replace(col3, 
                                is.na(col3) & col1 %in% c('v2', 'v3'), 
                                'VAL'))
    } else {
      x
    }),
  check='identical')

Edit

Add a "col3" that is filled with "VAL" if none exists yet:

> for (s in seq_along(my_list)) {
+   x <- my_list[[s]]
+   if ('col3' %in% names(x)) {
+     x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
+   } else {
+     x$col3 <- 'VAL'
+   }
+   my_list[[s]] <- x
+ }
> my_list
[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear
3   v3 water fork banana
4   v2  <NA>  VAL   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4 col3
1   v1 wood <NA>  VAL
2   v2 <NA> pear  VAL

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   v3  VAL   <NA>

Data:

> dput(my_list)
list(structure(list(col1 = c("v1", "v2", "v3", "v2", "v1"), col2 = c("wood", 
NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA
), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, 
-5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", 
NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, 
-2L)), structure(list(col1 = c("v1", "v2", "v3", "v3"), col3 = c("cup", 
NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, 
-4L)))
2
arg0naut91 On

You could try:

lapply(
   my_list,
   \(x) if ('col3' %in% names(x)) transform(x, col3 = replace(col3, is.na(col3) & col1 %in% c('v2', 'v3'), 'VAL')) else x
)

Output:

[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear
3   v3 water fork banana
4   V2  <NA>  VAL   <NA>
5   V1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   V3  VAL   <NA>
0
margusl On

Another slight variation for benchmark, though the change is not that radical.

df_update <- function(df) {
  if("col3" %in% names(df)){
    df$col3[is.na(df$col3) & (df$col1 %in% c("v2", "v3"))] <- "VAL"
  } 
  df
}
lapply(my_list, df_update)

# microbenchmark::microbenchmark() :
#> Unit: milliseconds
#>           expr     min       lq       mean    median       uq      max neval
#>  lapply_subset  7.9087  8.46455   9.880332   8.72160   9.6610  24.2403   100
#>          floop 12.8077 13.43235  15.302831  13.80615  14.6502  35.7991   100
#>         lapply 91.1871 95.82220 101.641103 100.16710 104.2413 156.0194   100

my_list <- list(structure(list(col1 = c("v1", "v2", "v3", "V2", "V1"), col2 = c("wood", NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, -5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, -2L)), structure(list(col1 = c("v1", "v2", "v3", "V3"), col3 = c("cup", NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, -4L)))

set.seed(42)
big_list <- my_list[sample(1:3, 1e3, replace=TRUE)]

microbenchmark::microbenchmark(
  lapply_subset=lapply(big_list, df_update),
  floop={
    for (s in seq_along(big_list)) {
      x <- big_list[[s]]
      if ('col3' %in% names(x)) {
        x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
        big_list[[s]] <- x
      }
    }
    big_list
  },
  lapply=lapply(
    big_list,
    \(x) if ('col3' %in% names(x)) {
      transform(x, col3=replace(col3, 
                                is.na(col3) & col1 %in% c('v2', 'v3'), 
                                'VAL'))
    } else {
      x
    }),
  check='identical')