How to optimize the following conditional assignment of a vector?

118 Views Asked by At

I have to classify a lot of crops based on three conditions calculated in a grid of 1e6 points. I'm trying to optimize the function below (hopefully without moving to C or Rust). Any ideas?

Iit's possible to reformat the input data if necessary. I already tried with data.table but the performance was worse.

This is my best shot:

condtion1 <- letters[1:8]
condtion2 <- letters[9:15]
condtion3 <- letters[16:24]

crop <- sample(0:1, 24, replace = T)
names(crop) <- letters[1:24]

n <- 1e6

condtions1 <- sample(condtion1, n, replace = T)
condtions2 <- sample(condtion2, n, replace = T)
condtions3 <- sample(condtion3, n, replace = T)

get_suitability <- function(){
  result <- character(n)
  
  for (i in seq_along(result)) {
    if (crop[[condtions1[[i]]]] == 0 | crop[[condtions2[[i]]]] == 0) result[[i]] <- "not suitable"
    else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
    else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
  }
  result
}


microbenchmark::microbenchmark(
  get_suitability(),
  times = 5
)
#> Unit: seconds
#>               expr      min       lq     mean   median       uq      max neval
#>  get_suitability() 2.402434 2.408322 2.568981 2.641211 2.667943 2.724993     5

Created on 2024-03-24 with reprex v2.1.0

2

There are 2 best solutions below

1
Rui Barradas On BEST ANSWER

Vectorise over the condtions getting rid of for/if. The logical indices take care of both for and if.
In a comment to the question I write:

You can initialize result <- rep("not suitable", n) and remove one of the if's from the loop.

Notes:

  • get_suitability2 is my idea in comment to the question, a bad idea as it turned out;
  • get_suitability3b is a simplified version of get_suitability3 and the fastest of all;
  • get_suitability4 is user2554330´s last function and faster than the original question code.
condtion1 <- letters[1:8]
condtion2 <- letters[9:15]
condtion3 <- letters[16:24]

crop <- sample(0:1, 24, replace = T)
names(crop) <- letters[1:24]

n <- 1e6

condtions1 <- sample(condtion1, n, replace = T)
condtions2 <- sample(condtion2, n, replace = T)
condtions3 <- sample(condtion3, n, replace = T)

get_suitability <- function(){
  result <- character(n)
  
  for (i in seq_along(result)) {
    if (crop[[condtions1[[i]]]] == 0 | crop[[condtions2[[i]]]] == 0) result[[i]] <- "not suitable"
    else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
    else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
  }
  result
}
get_suitability2 <- function(){
  result <- rep("not suitable", n)
  for (i in seq_along(result)) {
    if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
    else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
  }
  result
}
get_suitability3 <- function(){
  result <- rep("not suitable", n)
  i1 <- crop[ condtions1 ] == 1 
  i2 <- crop[ condtions2 ] == 1
  i3 <- crop[ condtions3 ] == 1
  result[i1 & i2 & i3] <- "suitable"
  result[i1 & i2 & !i3] <- "suitable with irrigation"
  result
}
get_suitability3b <- function(){
  result <- rep("not suitable", n)
  i1 <- crop[ condtions1 ] == 1 & crop[ condtions2 ] == 1
  i3 <- crop[ condtions3 ] == 1
  result[i1 & i3] <- "suitable"
  result[i1 & !i3] <- "suitable with irrigation"
  result
}
get_suitability4 <- function(){
  result <- ifelse(crop[condtions1] == 0 | 
                     crop[condtions2] == 0, "not suitable",
                   ifelse(crop[condtions3] == 1, "suitable", 
                          "suitable with irrigation"))
  names(result) <- NULL
  result
}

library(microbenchmark)

res <- get_suitability()
res2 <- get_suitability2()
res3 <- get_suitability3()
res3b <- get_suitability3b()
res4 <- get_suitability4()

identical(res, res2)
#> [1] TRUE
identical(res, res3)
#> [1] TRUE
identical(res, res3b)
#> [1] TRUE
identical(res, res4)
#> [1] TRUE

mb <- microbenchmark(
  get_suitability(),
  get_suitability2(),
  get_suitability3(),
  get_suitability3b(),
  get_suitability4(),
  times = 5L
)
print(mb, order = "median")
#> Unit: milliseconds
#>                 expr       min        lq      mean    median        uq
#>  get_suitability3b()  120.5004  123.8272  144.3827  137.7121  158.9400
#>   get_suitability3()  130.9886  141.4570  158.9099  154.2719  179.9035
#>   get_suitability4()  630.0646  651.2294  677.3693  687.7445  703.8762
#>    get_suitability() 1496.4989 1522.9126 1540.5882 1535.8001 1566.6336
#>   get_suitability2() 2999.3825 3008.2696 3064.8530 3083.5560 3102.7165
#>        max neval  cld
#>   180.9339     5   c 
#>   187.9287     5   c 
#>   713.9316     5    d
#>  1581.0956     5 a   
#>  3130.3405     5  b

Created on 2024-03-24 with reprex v2.1.0

0
user2554330 On

This looks like a case where ifelse() would be suitable. For example, this function is quite a bit faster than yours:

get_suitability2 <- function(){
  result <- ifelse(crop[condtions1] == 0 | 
                   crop[condtions2] == 0, "not suitable",
            ifelse(crop[condtions1] == 1 & 
                   crop[condtions2] == 1 & 
                   crop[condtions3] == 1, "suitable", 
            ifelse(crop[condtions1] == 1 & 
                   crop[condtions2] == 1 & 
                   crop[condtions3] == 0, "suitable with irrigation", "")))
  names(result) <- NULL
  result
}

More improvement is possible. Some of your tests are redundant, so you could remove them. After the first test determines that "not suitable" is not the answer, you don't need to look at condtions1 or condtions2 again: they are known to be 1. And the final test is guaranteed to be true. So you could simplify to

get_suitability3 <- function(){
  result <- ifelse(crop[condtions1] == 0 | 
                   crop[condtions2] == 0, "not suitable",
            ifelse(crop[condtions3] == 1, "suitable", 
            "suitable with irrigation"))
  names(result) <- NULL
  result
}