如何使用动态文件更新 drake 目录
How to use dynamic files to update drake directory
我想确保当我向目录中添加新的 .csv 文件时,我的 drake 计划会随之更新。我研究了新的动态文件(dynamic files)功能,但无法让它正常工作(参见下面的 reprex)。
library(drake)
library(purrr)
library(readr)
# Set up a working directory containing a single (empty) CSV file.
fs::dir_create("folder")
file.create("folder/file1.csv")
#> [1] TRUE
# Single dynamic file
# `upstream` returns the paths found in "folder" and is declared with
# format = "file"; `downstream` reads every path in `upstream` with read_csv()
# and row-binds the results via map_dfr().
plan_base <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file"
),
downstream = map_dfr(upstream, read_csv)
)
# First build: the output below shows both targets being run.
make(plan_base)
#> ▶ target upstream
#> ▶ target downstream
#> Warning: `data_frame()` is deprecated as of tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.
# Updates when changing the file
# Overwrite the existing CSV with real data so its contents change.
write_csv(mtcars, "folder/file1.csv")
# Same plan definition as before; only the contents of the file on disk differ.
plan_update <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file"
),
downstream = map_dfr(upstream, read_csv)
)
# The output below shows both targets rebuilt after the file was modified.
make(plan_update)
#> ▶ target upstream
#> ▶ target downstream
#> Parsed with column specification:
#> cols(
#> mpg = col_double(),
#> cyl = col_double(),
#> disp = col_double(),
#> hp = col_double(),
#> drat = col_double(),
#> wt = col_double(),
#> qsec = col_double(),
#> vs = col_double(),
#> am = col_double(),
#> gear = col_double(),
#> carb = col_double()
#> )
# Doesn't update when adding file to directory
# Add a second (empty) CSV file next to the first one.
file.create("folder/file2.csv")
#> [1] TRUE
# Same plan definition again; the only change is the new file in "folder".
# NOTE(review): presumably drake skips `upstream` because neither its command
# nor the previously recorded file1.csv changed, so list.files() is never
# re-run and file2.csv goes unnoticed -- confirm against drake's trigger docs.
plan_no_update <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file"
),
downstream = map_dfr(upstream, read_csv)
)
# The output below reports that all targets are already up to date.
make(plan_no_update)
#> ✓ All targets are already up to date.
由 reprex package (v0.3.0)
于 2020-07-17 创建
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.0.1 (2020-06-06)
#> os macOS Mojave 10.14.6
#> system x86_64, darwin17.0
#> ui X11
#> language (EN)
#> collate en_AU.UTF-8
#> ctype en_AU.UTF-8
#> tz Australia/Sydney
#> date 2020-07-17
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
#> backports 1.1.8 2020-06-17 [1] CRAN (R 4.0.0)
#> base64url 1.4 2018-05-14 [1] CRAN (R 4.0.0)
#> callr 3.4.3 2020-03-28 [1] CRAN (R 4.0.0)
#> cli 2.0.2 2020-02-28 [1] CRAN (R 4.0.0)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.0)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.0)
#> devtools 2.3.0 2020-04-10 [1] CRAN (R 4.0.0)
#> digest 0.6.25 2020-02-23 [1] CRAN (R 4.0.0)
#> dplyr 1.0.0 2020-05-29 [1] CRAN (R 4.0.0)
#> drake * 7.12.2 2020-06-02 [1] CRAN (R 4.0.0)
#> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
#> fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.0)
#> filelock 1.0.2 2018-10-05 [1] CRAN (R 4.0.0)
#> fs 1.4.1 2020-04-04 [1] CRAN (R 4.0.0)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 4.0.0)
#> glue 1.4.1 2020-05-13 [1] CRAN (R 4.0.0)
#> highr 0.8 2019-03-20 [1] CRAN (R 4.0.0)
#> hms 0.5.3 2020-01-08 [1] CRAN (R 4.0.0)
#> htmltools 0.5.0 2020-06-16 [1] CRAN (R 4.0.0)
#> igraph 1.2.5 2020-03-19 [1] CRAN (R 4.0.0)
#> knitr 1.29 2020-06-23 [1] CRAN (R 4.0.0)
#> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.0)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.0)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 4.0.0)
#> pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.1)
#> pkgbuild 1.0.8 2020-05-07 [1] CRAN (R 4.0.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
#> pkgload 1.1.0 2020-05-29 [1] CRAN (R 4.0.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.0)
#> processx 3.4.2 2020-02-09 [1] CRAN (R 4.0.0)
#> progress 1.2.2 2019-05-16 [1] CRAN (R 4.0.0)
#> ps 1.3.3 2020-05-08 [1] CRAN (R 4.0.0)
#> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
#> R6 2.4.1 2019-11-12 [1] CRAN (R 4.0.0)
#> Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.2)
#> readr * 1.3.1 2018-12-21 [1] CRAN (R 4.0.0)
#> remotes 2.1.1 2020-02-15 [1] CRAN (R 4.0.0)
#> rlang 0.4.7 2020-07-09 [1] CRAN (R 4.0.1)
#> rmarkdown 2.3 2020-06-18 [1] CRAN (R 4.0.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
#> storr 1.2.1 2018-10-18 [1] CRAN (R 4.0.0)
#> stringi 1.4.6 2020-02-17 [1] CRAN (R 4.0.0)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
#> testthat 2.3.2 2020-03-02 [1] CRAN (R 4.0.0)
#> tibble 3.0.3 2020-07-10 [1] CRAN (R 4.0.1)
#> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.0)
#> txtq 0.2.0 2019-10-15 [1] CRAN (R 4.0.0)
#> usethis 1.6.1 2020-04-29 [1] CRAN (R 4.0.0)
#> vctrs 0.3.2 2020-07-15 [1] CRAN (R 4.0.1)
#> withr 2.2.0 2020-04-20 [1] CRAN (R 4.0.0)
#> xfun 0.15 2020-06-21 [1] CRAN (R 4.0.0)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
#>
#> [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library
目录也是动态文件,所以如果跟踪整个目录,实际上不需要列出具体内容。试试这个:
# Track the whole directory as one dynamic file target, so that adding,
# removing, or editing a file inside "folder" invalidates `upstream` and, in
# turn, `downstream`.
plan <- drake_plan(
  upstream = target("folder", format = "file"),
  # `upstream` holds the directory path itself, so list the files it contains
  # before reading; passing the directory straight to read_csv() would fail.
  downstream = map_dfr(list.files(upstream, full.names = TRUE), read_csv)
)
还有其他替代方案,但它们更复杂。一种是将 condition 触发器设置为 TRUE,这样目标总是会运行,但这样效率不高,因为即使文件很大,它也会重新计算哈希值。
# Alternative: set the condition trigger to TRUE so `upstream` runs on every
# make(); list.files() is then re-evaluated each time and newly added files
# are picked up.
# NOTE(review): the files are re-hashed on every run, which may be slow when
# they are large.
plan_no_update <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file",
trigger = trigger(condition = TRUE)
),
downstream = map_dfr(upstream, read_csv)
)
另一种方法是先在计划之外将文件定义为全局对象,这样它们总会在您运行 make() 之前刷新。
# Refresh the file listing in the global environment before each make();
# the plan refers to `files` only by name.
files <- list.files("folder", full.names = TRUE)

plan <- drake_plan(
  upstream = target(files, format = "file"),
  downstream = map_dfr(upstream, read_csv)
)

make(plan)
我想确保当我向目录中添加新的 .csv 文件时,我的 drake 计划会随之更新。我研究了新的动态文件(dynamic files)功能,但无法让它正常工作(参见下面的 reprex)。
library(drake)
library(purrr)
library(readr)
# Set up a working directory containing a single (empty) CSV file.
fs::dir_create("folder")
file.create("folder/file1.csv")
#> [1] TRUE
# Single dynamic file
# `upstream` returns the paths found in "folder" and is declared with
# format = "file"; `downstream` reads every path in `upstream` with read_csv()
# and row-binds the results via map_dfr().
plan_base <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file"
),
downstream = map_dfr(upstream, read_csv)
)
# First build: the output below shows both targets being run.
make(plan_base)
#> ▶ target upstream
#> ▶ target downstream
#> Warning: `data_frame()` is deprecated as of tibble 1.1.0.
#> Please use `tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.
# Updates when changing the file
# Overwrite the existing CSV with real data so its contents change.
write_csv(mtcars, "folder/file1.csv")
# Same plan definition as before; only the contents of the file on disk differ.
plan_update <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file"
),
downstream = map_dfr(upstream, read_csv)
)
# The output below shows both targets rebuilt after the file was modified.
make(plan_update)
#> ▶ target upstream
#> ▶ target downstream
#> Parsed with column specification:
#> cols(
#> mpg = col_double(),
#> cyl = col_double(),
#> disp = col_double(),
#> hp = col_double(),
#> drat = col_double(),
#> wt = col_double(),
#> qsec = col_double(),
#> vs = col_double(),
#> am = col_double(),
#> gear = col_double(),
#> carb = col_double()
#> )
# Doesn't update when adding file to directory
# Add a second (empty) CSV file next to the first one.
file.create("folder/file2.csv")
#> [1] TRUE
# Same plan definition again; the only change is the new file in "folder".
# NOTE(review): presumably drake skips `upstream` because neither its command
# nor the previously recorded file1.csv changed, so list.files() is never
# re-run and file2.csv goes unnoticed -- confirm against drake's trigger docs.
plan_no_update <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file"
),
downstream = map_dfr(upstream, read_csv)
)
# The output below reports that all targets are already up to date.
make(plan_no_update)
#> ✓ All targets are already up to date.
由 reprex package (v0.3.0)
于 2020-07-17 创建
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.0.1 (2020-06-06)
#> os macOS Mojave 10.14.6
#> system x86_64, darwin17.0
#> ui X11
#> language (EN)
#> collate en_AU.UTF-8
#> ctype en_AU.UTF-8
#> tz Australia/Sydney
#> date 2020-07-17
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
#> backports 1.1.8 2020-06-17 [1] CRAN (R 4.0.0)
#> base64url 1.4 2018-05-14 [1] CRAN (R 4.0.0)
#> callr 3.4.3 2020-03-28 [1] CRAN (R 4.0.0)
#> cli 2.0.2 2020-02-28 [1] CRAN (R 4.0.0)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.0)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.0)
#> devtools 2.3.0 2020-04-10 [1] CRAN (R 4.0.0)
#> digest 0.6.25 2020-02-23 [1] CRAN (R 4.0.0)
#> dplyr 1.0.0 2020-05-29 [1] CRAN (R 4.0.0)
#> drake * 7.12.2 2020-06-02 [1] CRAN (R 4.0.0)
#> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
#> fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.0)
#> filelock 1.0.2 2018-10-05 [1] CRAN (R 4.0.0)
#> fs 1.4.1 2020-04-04 [1] CRAN (R 4.0.0)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 4.0.0)
#> glue 1.4.1 2020-05-13 [1] CRAN (R 4.0.0)
#> highr 0.8 2019-03-20 [1] CRAN (R 4.0.0)
#> hms 0.5.3 2020-01-08 [1] CRAN (R 4.0.0)
#> htmltools 0.5.0 2020-06-16 [1] CRAN (R 4.0.0)
#> igraph 1.2.5 2020-03-19 [1] CRAN (R 4.0.0)
#> knitr 1.29 2020-06-23 [1] CRAN (R 4.0.0)
#> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.0)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.0)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 4.0.0)
#> pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.1)
#> pkgbuild 1.0.8 2020-05-07 [1] CRAN (R 4.0.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
#> pkgload 1.1.0 2020-05-29 [1] CRAN (R 4.0.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.0)
#> processx 3.4.2 2020-02-09 [1] CRAN (R 4.0.0)
#> progress 1.2.2 2019-05-16 [1] CRAN (R 4.0.0)
#> ps 1.3.3 2020-05-08 [1] CRAN (R 4.0.0)
#> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
#> R6 2.4.1 2019-11-12 [1] CRAN (R 4.0.0)
#> Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.2)
#> readr * 1.3.1 2018-12-21 [1] CRAN (R 4.0.0)
#> remotes 2.1.1 2020-02-15 [1] CRAN (R 4.0.0)
#> rlang 0.4.7 2020-07-09 [1] CRAN (R 4.0.1)
#> rmarkdown 2.3 2020-06-18 [1] CRAN (R 4.0.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
#> storr 1.2.1 2018-10-18 [1] CRAN (R 4.0.0)
#> stringi 1.4.6 2020-02-17 [1] CRAN (R 4.0.0)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
#> testthat 2.3.2 2020-03-02 [1] CRAN (R 4.0.0)
#> tibble 3.0.3 2020-07-10 [1] CRAN (R 4.0.1)
#> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.0)
#> txtq 0.2.0 2019-10-15 [1] CRAN (R 4.0.0)
#> usethis 1.6.1 2020-04-29 [1] CRAN (R 4.0.0)
#> vctrs 0.3.2 2020-07-15 [1] CRAN (R 4.0.1)
#> withr 2.2.0 2020-04-20 [1] CRAN (R 4.0.0)
#> xfun 0.15 2020-06-21 [1] CRAN (R 4.0.0)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
#>
#> [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library
目录也是动态文件,所以如果跟踪整个目录,实际上不需要列出具体内容。试试这个:
# Track the whole directory as one dynamic file target, so that adding,
# removing, or editing a file inside "folder" invalidates `upstream` and, in
# turn, `downstream`.
plan <- drake_plan(
  upstream = target("folder", format = "file"),
  # `upstream` holds the directory path itself, so list the files it contains
  # before reading; passing the directory straight to read_csv() would fail.
  downstream = map_dfr(list.files(upstream, full.names = TRUE), read_csv)
)
还有其他替代方案,但它们更复杂。一种是将 condition 触发器设置为 TRUE,这样目标总是会运行,但这样效率不高,因为即使文件很大,它也会重新计算哈希值。
# Alternative: set the condition trigger to TRUE so `upstream` runs on every
# make(); list.files() is then re-evaluated each time and newly added files
# are picked up.
# NOTE(review): the files are re-hashed on every run, which may be slow when
# they are large.
plan_no_update <- drake_plan(
upstream = target(
list.files("folder", full.names = TRUE),
format = "file",
trigger = trigger(condition = TRUE)
),
downstream = map_dfr(upstream, read_csv)
)
另一种方法是先在计划之外将文件定义为全局对象,这样它们总会在您运行 make() 之前刷新。
# Refresh the file listing in the global environment before each make();
# the plan refers to `files` only by name.
files <- list.files("folder", full.names = TRUE)

plan <- drake_plan(
  upstream = target(files, format = "file"),
  downstream = map_dfr(upstream, read_csv)
)

make(plan)