Customizing {drake} plan with {data.table}

260 Views Asked by At

My goal is to customize a {drake} plan to reduce code duplication in both simple and complex cases, for example, setting drake::trigger for multiple targets based on specific conditions of the plan (i.e., programmatically identifying the targets to be set by detecting target names that end with (data)_raw when querying a web API, or other more complex cases). For brevity, the programmatic identification of targets is not shown here; the code below simply uses target names typed out manually. I prefer to use {data.table} to manipulate the plans, instead of {dplyr}.

Is there a way, for example, to set drake::trigger for multiple targets? Or is there a better way to get the job done?

I came up with a simple solution below, which is to manually manipulate the drake plan with the familiar data.frame-style methods. Here is a simplified reprex:

library(drake)
library(data.table)
library(purrr)
#> 
#> Attaching package: 'purrr'
#> The following object is masked from 'package:data.table':
#> 
#>     transpose
library(magrittr)
#> 
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:purrr':
#> 
#>     set_names
library(rlang)
#> 
#> Attaching package: 'rlang'
#> The following object is masked from 'package:magrittr':
#> 
#>     set_names
#> The following objects are masked from 'package:purrr':
#> 
#>     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
#>     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
#>     splice
#> The following object is masked from 'package:data.table':
#> 
#>     :=

# Condition callback used in the drake triggers below: hands the target's
# name to do_something() and returns whatever that call returns.
trigger_func <- function(target_name) {
  do_something(target_name)
}
# Helper func
# Turn a plan that was manipulated as a data.table back into a drake plan.
# All changes are made by reference through setattr().
#
# Args:
#   data:          the plan to restore; its attributes are modified in place.
#   command_names: optional character vector to put back as 'names' of the
#                  'command' column (data.table removes them). The default,
#                  NULL, leaves the column unnamed. The previous default
#                  `command_names = command_names` referred to itself and
#                  would error as soon as it was evaluated.
#   ...:           unused; kept so existing calls keep working.
#
# Returns:
#   `data`, invisibly, so the call can be chained or piped.
reset_drake_attr <- function(data, command_names = NULL, ...) {
  # Reset original 'class' of drake plan
  setattr(data, "class", c("drake_plan", "tbl_df", "tbl", "data.frame"))

  # Remove non-drake attributes created by data.table. This is done purely
  # for its side effect, so a plain loop replaces map(), which would build
  # a list of results nobody uses.
  extra_attrs <- setdiff(
    names(attributes(data)),
    c("names", "row.names", "class")
  )
  for (attr_name in extra_attrs) {
    setattr(data, attr_name, NULL)
  }

  # Restore the names of the 'command' column only when the caller asks.
  if (!is.null(command_names)) {
    stopifnot(length(command_names) == length(data$command))
    setattr(data$command, "names", command_names)
  }

  invisible(data)
}

# Reference plan written out by hand: targets b and e each carry a trigger
# whose condition calls trigger_func() with the target's own name, while
# a, c and d have no trigger. It is compared against the programmatically
# built `my_plan` further down.
plan <- drake_plan(
  a = 1,
  b = target(2, trigger = trigger(condition = trigger_func('b'))),
  c = 3, d = 4,
  e = target(5, trigger = trigger(condition = trigger_func('e')))
)

# Manipulate plan with data.table ----------------

# Start from a plain plan with no triggers; they are added programmatically
# in the steps below.
my_plan <- drake_plan(a = 1, b = 2, c = 3, d = 4, e = 5)

# Save the names of the 'command' column now: they are gone once the plan
# has gone through the data.table steps (see names(my_plan$command) below).
command_names <- names(my_plan$command)      # For testing later
# Add a list-column 'trigger' that holds the expression NA in every row.
# `.()` is data.table's shorthand for list().
setDT(my_plan)[, trigger := .(.(expr(NA)))]  # Pre-populate targets without trigger

# For each listed target name, overwrite that row's trigger with the call
# trigger(condition = trigger_func(<name>)). Inside the purrr lambda `.` is
# the current name, and `!!.` splices it into the quoted call. `:=` updates
# my_plan in place, so map() is used only for its side effect; the list it
# returns is what prints the table once per name in the output below.
map(c('b', 'e'),
    ~ my_plan [
      ][target == . , trigger := .(expr(
          trigger(condition = trigger_func(!!.))
      ))])
#> [[1]]
#>    target command trigger
#> 1:      a       1      NA
#> 2:      b       2  <call>
#> 3:      c       3      NA
#> 4:      d       4      NA
#> 5:      e       5  <call>
#> 
#> [[2]]
#>    target command trigger
#> 1:      a       1      NA
#> 2:      b       2  <call>
#> 3:      c       3      NA
#> 4:      d       4      NA
#> 5:      e       5  <call>

reset_drake_attr(my_plan)

# Test equality ----------------------------------

plan
#> # A tibble: 5 x 3
#>   target command trigger                               
#>   <chr>  <expr>  <expr>                                
#> 1 a      1       NA                                    
#> 2 b      2       trigger(condition = trigger_func("b"))
#> 3 c      3       NA                                    
#> 4 d      4       NA                                    
#> 5 e      5       trigger(condition = trigger_func("e"))
my_plan
#> # A tibble: 5 x 3
#>   target command trigger                               
#>   <chr>  <expr>  <expr>                                
#> 1 a      1       NA                                    
#> 2 b      2       trigger(condition = trigger_func("b"))
#> 3 c      3       NA                                    
#> 4 d      4       NA                                    
#> 5 e      5       trigger(condition = trigger_func("e"))

identical(plan, my_plan)
#> [1] FALSE
all.equal(plan$command, my_plan$command)
#> [1] "names for target but not for current"
# Reason: 
names(plan$command)
#> [1] "a" ""  "c" "d" ""
command_names           # Saved earlier
#> [1] "a" "b" "c" "d" "e"
names(my_plan$command)  # data.table removes 'names' of 'my_plan$command'
#> NULL

# Can't test exact equality with `identical(plan, my_plan)` because only targets defined without `target()` have 'names' on the 'command' column
drake_plan(
  # without `target`
  a = 1, b = 2,
  # with `target`
  c = target(3),
  d = target(4, trigger = trigger(condition = TRUE)),
  e = target(func(a), map(func = !!c('x', 'y')))
) %>%
  {names(.$command)}
#> [1] "a" "b" ""  ""  ""  ""

# Test without 'names' of 'command' column -------

identical( unname(plan$command), unname(my_plan$command) )
#> [1] TRUE
# Copy objects, remove 'names' of 'command' column and test
plan_test <- plan                ; my_plan_test <- my_plan
names(plan_test$command) <- NULL ; names(my_plan_test$command) <- NULL
identical(plan_test, my_plan_test)
#> [1] TRUE

Created on 2019-12-06 by the reprex package (v0.3.0)

Are the names of my_plan$command necessary? Are they used by {drake} internally? I see three options; which one should I go for?

  • ignore
  • names(my_plan$command) <- my_plan$target
  • Save command_names <- names(my_plan$command) and reset it back (like this)
# Test with 'names' of 'command' column ----------

# --- NOT RUN ---
{
  # Put this in helper func, re-run `reset_drake_attr`
  setattr(my_plan$command, 'names', command_names)

  # (run manipulation steps...)

  reset_drake_attr(my_plan, command_names)  
  identical( names(my_plan$command), command_names )
  # TRUE 
}

0

There are 0 best solutions below