如何使用动态文件更新 drake 目录

How to use dynamic files to update drake directory

我想确保当我将新的 .csv 文件添加到目录时,我的 drake 计划会更新。我查看了新的动态文件,但无法让它工作(参见 reprex)。

library(drake)
library(purrr)
library(readr)
# Set up the example: create the directory and one empty CSV file inside it.
fs::dir_create("folder")
file.create("folder/file1.csv")
#> [1] TRUE

# Single dynamic file
# `upstream` lists every path in "folder" and is declared with
# format = "file"; `downstream` reads each listed path with read_csv() and
# row-binds the results via map_dfr().
plan_base <- drake_plan(
  upstream = target(
    list.files("folder", full.names = TRUE),
    format = "file"
  ),
  downstream = map_dfr(upstream, read_csv)
)

# First build: the output below shows both targets being built.
make(plan_base)
#> ▶ target upstream
#> ▶ target downstream
#> Warning: `data_frame()` is deprecated as of tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.

# Updates when changing the file
# Overwrite the existing CSV with the mtcars data so that its contents change.
write_csv(mtcars, "folder/file1.csv")

# Same plan definition as plan_base, stored under a new name.
plan_update <- drake_plan(
  upstream = target(
    list.files("folder", full.names = TRUE),
    format = "file"
  ),
  downstream = map_dfr(upstream, read_csv)
)

# Both targets are rebuilt, as the output below shows.
make(plan_update)
#> ▶ target upstream
#> ▶ target downstream
#> Parsed with column specification:
#> cols(
#>   mpg = col_double(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_double(),
#>   wt = col_double(),
#>   qsec = col_double(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )

# Doesn't update when adding file to directory
# Add a second (empty) CSV file to the tracked directory.
file.create("folder/file2.csv")
#> [1] TRUE

# Same plan definition again, stored under a new name.
plan_no_update <- drake_plan(
  upstream = target(
    list.files("folder", full.names = TRUE),
    format = "file"
  ),
  downstream = map_dfr(upstream, read_csv)
)

# This is the problem being reported: the new file is not picked up and
# nothing is rebuilt.
make(plan_no_update)
#> ✓ All targets are already up to date.

由 reprex 包 (v0.3.0) 于 2020-07-17 创建
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 4.0.1 (2020-06-06)
#>  os       macOS Mojave 10.14.6        
#>  system   x86_64, darwin17.0          
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_AU.UTF-8                 
#>  ctype    en_AU.UTF-8                 
#>  tz       Australia/Sydney            
#>  date     2020-07-17                  
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date       lib source        
#>  assertthat    0.2.1   2019-03-21 [1] CRAN (R 4.0.0)
#>  backports     1.1.8   2020-06-17 [1] CRAN (R 4.0.0)
#>  base64url     1.4     2018-05-14 [1] CRAN (R 4.0.0)
#>  callr         3.4.3   2020-03-28 [1] CRAN (R 4.0.0)
#>  cli           2.0.2   2020-02-28 [1] CRAN (R 4.0.0)
#>  crayon        1.3.4   2017-09-16 [1] CRAN (R 4.0.0)
#>  desc          1.2.0   2018-05-01 [1] CRAN (R 4.0.0)
#>  devtools      2.3.0   2020-04-10 [1] CRAN (R 4.0.0)
#>  digest        0.6.25  2020-02-23 [1] CRAN (R 4.0.0)
#>  dplyr         1.0.0   2020-05-29 [1] CRAN (R 4.0.0)
#>  drake       * 7.12.2  2020-06-02 [1] CRAN (R 4.0.0)
#>  ellipsis      0.3.1   2020-05-15 [1] CRAN (R 4.0.0)
#>  evaluate      0.14    2019-05-28 [1] CRAN (R 4.0.0)
#>  fansi         0.4.1   2020-01-08 [1] CRAN (R 4.0.0)
#>  filelock      1.0.2   2018-10-05 [1] CRAN (R 4.0.0)
#>  fs            1.4.1   2020-04-04 [1] CRAN (R 4.0.0)
#>  generics      0.0.2   2018-11-29 [1] CRAN (R 4.0.0)
#>  glue          1.4.1   2020-05-13 [1] CRAN (R 4.0.0)
#>  highr         0.8     2019-03-20 [1] CRAN (R 4.0.0)
#>  hms           0.5.3   2020-01-08 [1] CRAN (R 4.0.0)
#>  htmltools     0.5.0   2020-06-16 [1] CRAN (R 4.0.0)
#>  igraph        1.2.5   2020-03-19 [1] CRAN (R 4.0.0)
#>  knitr         1.29    2020-06-23 [1] CRAN (R 4.0.0)
#>  lifecycle     0.2.0   2020-03-06 [1] CRAN (R 4.0.0)
#>  magrittr      1.5     2014-11-22 [1] CRAN (R 4.0.0)
#>  memoise       1.1.0   2017-04-21 [1] CRAN (R 4.0.0)
#>  pillar        1.4.6   2020-07-10 [1] CRAN (R 4.0.1)
#>  pkgbuild      1.0.8   2020-05-07 [1] CRAN (R 4.0.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.0.0)
#>  pkgload       1.1.0   2020-05-29 [1] CRAN (R 4.0.0)
#>  prettyunits   1.1.1   2020-01-24 [1] CRAN (R 4.0.0)
#>  processx      3.4.2   2020-02-09 [1] CRAN (R 4.0.0)
#>  progress      1.2.2   2019-05-16 [1] CRAN (R 4.0.0)
#>  ps            1.3.3   2020-05-08 [1] CRAN (R 4.0.0)
#>  purrr       * 0.3.4   2020-04-17 [1] CRAN (R 4.0.0)
#>  R6            2.4.1   2019-11-12 [1] CRAN (R 4.0.0)
#>  Rcpp          1.0.5   2020-07-06 [1] CRAN (R 4.0.2)
#>  readr       * 1.3.1   2018-12-21 [1] CRAN (R 4.0.0)
#>  remotes       2.1.1   2020-02-15 [1] CRAN (R 4.0.0)
#>  rlang         0.4.7   2020-07-09 [1] CRAN (R 4.0.1)
#>  rmarkdown     2.3     2020-06-18 [1] CRAN (R 4.0.0)
#>  rprojroot     1.3-2   2018-01-03 [1] CRAN (R 4.0.0)
#>  sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 4.0.0)
#>  storr         1.2.1   2018-10-18 [1] CRAN (R 4.0.0)
#>  stringi       1.4.6   2020-02-17 [1] CRAN (R 4.0.0)
#>  stringr       1.4.0   2019-02-10 [1] CRAN (R 4.0.0)
#>  testthat      2.3.2   2020-03-02 [1] CRAN (R 4.0.0)
#>  tibble        3.0.3   2020-07-10 [1] CRAN (R 4.0.1)
#>  tidyselect    1.1.0   2020-05-11 [1] CRAN (R 4.0.0)
#>  txtq          0.2.0   2019-10-15 [1] CRAN (R 4.0.0)
#>  usethis       1.6.1   2020-04-29 [1] CRAN (R 4.0.0)
#>  vctrs         0.3.2   2020-07-15 [1] CRAN (R 4.0.1)
#>  withr         2.2.0   2020-04-20 [1] CRAN (R 4.0.0)
#>  xfun          0.15    2020-06-21 [1] CRAN (R 4.0.0)
#>  yaml          2.2.1   2020-02-01 [1] CRAN (R 4.0.0)
#> 
#> [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library

目录也是动态文件,所以如果跟踪整个目录,实际上不需要列出具体内容。试试这个:

# Track the whole directory as a single dynamic file target. `upstream`
# evaluates to the directory path itself, so the files inside it are listed
# in `downstream` before being passed to read_csv(), which reads files rather
# than directories.
plan <- drake_plan(
  upstream = target("folder", format = "file"),
  downstream = map_dfr(list.files(upstream, full.names = TRUE), read_csv)
)

有替代方案,但它们更复杂。一种是将 condition 触发器设置为 TRUE,这样目标总是会运行,但这效率不高,因为即使文件很大,它也会重新计算哈希值。

# Alternative 1: set the condition trigger to TRUE on `upstream` so the file
# listing is re-evaluated on every make().
plan_no_update <- drake_plan(
  upstream = target(
    list.files("folder", full.names = TRUE),
    format = "file",
    trigger = trigger(condition = TRUE)
  ),
  downstream = map_dfr(upstream, read_csv)
)

另一种方法是首先将文件定义为计划外的全局对象,这样它们总会在您运行 make() 之前刷新。

# Alternative 2: `upstream` refers to `files`, a global object defined
# outside the plan.
plan <- drake_plan(
  upstream = target(files, format = "file"),
  downstream = map_dfr(upstream, read_csv)
)
# Re-list the directory contents immediately before calling make().
files <- list.files("folder", full.names = TRUE)
make(plan)