Replace NA in list of dfs in certain columns and under certain conditions

Question

Replace NA in list of dfs in certain columns and under certain conditions

38 Views Asked by johnjohn At 31 March 2024 at 17:58

I have a list of dfs:

my_list <- list(structure(list(col1 = c("v1", "v2", "v3", "V2", "V1"), col2 = c("wood", NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, -5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, -2L)), structure(list(col1 = c("v1", "v2", "v3", "V3"), col3 = c("cup", NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, -4L)))
  
my_list

[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA> <NA>   pear
3   v3 water fork banana
4   v2  <NA> <NA>   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2 <NA>   pear
3   v3 <NA> banana
4   v3 <NA>   <NA>

I want to replace NA with "VAL" in col3 only, and only if col1 is v2 or v3.

I found solutions to replace NA in certain columns, but not in certain columns and other conditions (or only for a single df, not for a list of dfs.)

Note that col2 or col3 do not necessarily exist in all dfs.

I need a solution with lapply(list, function), ideally.

Desired output:

  [[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear 
3   v3 water fork banana
4   v2  <NA>  VAL   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   v3  VAL   <NA>

Original Q&A

There are 3 best solutions below

arg0naut91 On 31 March 2024 at 18:16

You could try:

lapply(
   my_list,
   \(x) if ('col3' %in% names(x)) transform(x, col3 = replace(col3, is.na(col3) & col1 %in% c('v2', 'v3'), 'VAL')) else x
)

Output:

[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear
3   v3 water fork banana
4   V2  <NA>  VAL   <NA>
5   V1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   V3  VAL   <NA>

margusl On 31 March 2024 at 21:53

Another slight variation for benchmark, though the change is not that radical.

df_update <- function(df) {
  if("col3" %in% names(df)){
    df$col3[is.na(df$col3) & (df$col1 %in% c("v2", "v3"))] <- "VAL"
  } 
  df
}
lapply(my_list, df_update)

# microbenchmark::microbenchmark() :
#> Unit: milliseconds
#>           expr     min       lq       mean    median       uq      max neval
#>  lapply_subset  7.9087  8.46455   9.880332   8.72160   9.6610  24.2403   100
#>          floop 12.8077 13.43235  15.302831  13.80615  14.6502  35.7991   100
#>         lapply 91.1871 95.82220 101.641103 100.16710 104.2413 156.0194   100

my_list <- list(structure(list(col1 = c("v1", "v2", "v3", "V2", "V1"), col2 = c("wood", NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, -5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, -2L)), structure(list(col1 = c("v1", "v2", "v3", "V3"), col3 = c("cup", NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, -4L)))

set.seed(42)
big_list <- my_list[sample(1:3, 1e3, replace=TRUE)]

microbenchmark::microbenchmark(
  lapply_subset=lapply(big_list, df_update),
  floop={
    for (s in seq_along(big_list)) {
      x <- big_list[[s]]
      if ('col3' %in% names(x)) {
        x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
        big_list[[s]] <- x
      }
    }
    big_list
  },
  lapply=lapply(
    big_list,
    \(x) if ('col3' %in% names(x)) {
      transform(x, col3=replace(col3, 
                                is.na(col3) & col1 %in% c('v2', 'v3'), 
                                'VAL'))
    } else {
      x
    }),
  check='identical')

**jay.sf** · Accepted Answer · 2024-03-31T19:05:19.000000

In such cases for loops can be much faster.

> for (s in seq_along(my_list)) {
+   x <- my_list[[s]]
+   if ('col3' %in% names(x)) {
+     x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
+     my_list[[s]] <- x
+   }
+ }
> my_list
[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear
3   v3 water fork banana
4   v2  <NA>  VAL   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4
1   v1 wood <NA>
2   v2 <NA> pear

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   v3  VAL   <NA>

Benchmark

Runs 80% faster, which is quite significant. Demonstrated on a list with just 1,000 elements.

$ Rscript --vanilla foo.R
Unit: milliseconds
   expr       min        lq      mean    median        uq       max neval cld
  floop  18.84617  19.95898  22.17803  22.08178  24.21662  27.02679   100  a 
 lapply 100.05645 106.24458 111.66269 111.17931 116.06089 150.08886   100   b

Benchmark code

set.seed(42)
big_list <- my_list[sample(1:3, 1e3, replace=TRUE)]

microbenchmark::microbenchmark(
  floop={
    for (s in seq_along(big_list)) {
      x <- big_list[[s]]
      if ('col3' %in% names(x)) {
        x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
        big_list[[s]] <- x
      }
    }
    big_list
  },
  lapply=lapply(
    big_list,
    \(x) if ('col3' %in% names(x)) {
      transform(x, col3=replace(col3, 
                                is.na(col3) & col1 %in% c('v2', 'v3'), 
                                'VAL'))
    } else {
      x
    }),
  check='identical')

Edit

Add a "col3" that is filled with "VAL" if none exists yet:

> for (s in seq_along(my_list)) {
+   x <- my_list[[s]]
+   if ('col3' %in% names(x)) {
+     x$col3[is.na(x$col3) & x$col1 %in% c('v2', 'v3')] <- 'VAL'
+   } else {
+     x$col3 <- 'VAL'
+   }
+   my_list[[s]] <- x
+ }
> my_list
[[1]]
  col1  col2 col3   col4
1   v1  wood  cup   <NA>
2   v2  <NA>  VAL   pear
3   v3 water fork banana
4   v2  <NA>  VAL   <NA>
5   v1 water <NA>  apple

[[2]]
  col1 col2 col4 col3
1   v1 wood <NA>  VAL
2   v2 <NA> pear  VAL

[[3]]
  col1 col3   col4
1   v1  cup   <NA>
2   v2  VAL   pear
3   v3  VAL banana
4   v3  VAL   <NA>

Data:

> dput(my_list)
list(structure(list(col1 = c("v1", "v2", "v3", "v2", "v1"), col2 = c("wood", 
NA, "water", NA, "water"), col3 = c("cup", NA, "fork", NA, NA
), col4 = c(NA, "pear", "banana", NA, "apple")), class = "data.frame", row.names = c(NA, 
-5L)), structure(list(col1 = c("v1", "v2"), col2 = c("wood", 
NA), col4 = c(NA, "pear")), class = "data.frame", row.names = c(NA, 
-2L)), structure(list(col1 = c("v1", "v2", "v3", "v3"), col3 = c("cup", 
NA, NA, NA), col4 = c(NA, "pear", "banana", NA)), class = "data.frame", row.names = c(NA, 
-4L)))

Replace NA in list of dfs in certain columns and under certain conditions

There are 3 best solutions below

Benchmark

Edit

Related Questions in R

Related Questions in LIST

Related Questions in MUTATE

Trending Questions

Popular # Hahtags

Popular Questions