How to master automated time series parameter tuning using tidymodels?
Coming from classical time series analysis, I am still fairly new to parameter tuning. Since tuning every local model (in my case, several hundred product-demand time series) is nowhere near scalable, I first want to analyse the effect of tuning only the series with poor accuracy, to weigh the trade-off between scalability and accuracy and decide whether tuning is worthwhile for a given series.

When I run the code below, it seems I am not specifying the ranges of the regular grid correctly. I suspect this because it looks odd that only three values of trees are retrieved from a range of 50 to 2000. Is this the standard behaviour? Would changing the levels argument help here? In my case it did not change anything. Also, is there a way to determine a good number of resampling folds rather than guessing? Any advice or useful examples would be appreciated.

Thanks in advance!
# data and libs
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(timetk))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
suppressPackageStartupMessages(library(tictoc))
suppressPackageStartupMessages(library(readxl))
dates <- ymd("2016-01-01") + months(0:59)
fake_values <- c(
  296, 325, 339, 812, 723, 310, 842, 500, 555, 260, 243, 306, 204, 330, 467,
  713, 1054, 827, 75, 437, 558, 222, 350, 139, 306, 395, 472, 741, 1020, 903,
  837, 738, 676, 506, 199, 219, 342, 406, 417, 977, 1503, 117, 942, 843, 716,
  378, 267, 392, 329, 369, 536, 1168, 1260, 1066, 949, 906, 1744, 2495, 418, 447
)
df <- bind_cols(fake_values, dates) %>%
  rename(c(y = ...1, ds = ...2))
# training and test sets
data_splits <- initial_time_split(df, prop = 0.8)
data_train <- training(data_splits)
data_test <- testing(data_splits)
# plot cv
split_obj <- time_series_split(df, assess = "1 year", cumulative = TRUE)
split_obj %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(ds, y)
# Resample - CV plan (expanding-window time series CV)
resampling_strategy <-
  data_train %>%
  time_series_cv(
    initial = "36 months",
    assess = "12 months",
    skip = "12 months",
    cumulative = TRUE
  )
# model spec
prophet_boost_model <- prophet_boost(
  mode = "regression",
  growth = "linear",
  changepoint_num = tune(),
  changepoint_range = tune(),
  trees = tune()
) %>%
  set_engine("prophet_xgboost")
# regular grid
prophet_grid <- grid_regular(
  changepoint_num(range = c(1L, 45L)),
  changepoint_range(range = c(0.5, 0.9)),
  trees(range = c(50, 2000))
  #, levels = 10
  #, size = 100
)
# recipe
basic_rec <- recipe(y ~ ds, data = data_train)
# wf
wflw_spec_tune_prophet <- workflow() %>%
  add_model(prophet_boost_model) %>%
  add_recipe(basic_rec)
# parallel proc
#cores <- parallel::detectCores(logical = FALSE)
library(doParallel)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)
# automated tuning
tic()
tune_results <-
  wflw_spec_tune_prophet %>%
  tune_grid(
    resamples = resampling_strategy,
    grid = prophet_grid,
    metrics = metric_set(rmse, mae)
  )
toc()
stopCluster(cl)
# inspect the best candidates
best_results <- tune_results %>%
  show_best(metric = "rmse", n = 20)
best_results
best_results$mean
You're right! Changing the levels argument in grid_regular() is how you increase the number of parameter values tried within your ranges. Here are some examples - hope they help!
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
# levels will default to 3 for each tuned parameter
grid_regular(
trees(range = c(50, 2000)),
mtry(range = c(1, 10))
)
#> # A tibble: 9 x 2
#> trees mtry
#> <int> <int>
#> 1 50 1
#> 2 1025 1
#> 3 2000 1
#> 4 50 5
#> 5 1025 5
#> 6 2000 5
#> 7 50 10
#> 8 1025 10
#> 9 2000 10
# you can also specify the number of levels!
grid_regular(
trees(range = c(50, 2000)),
mtry(range = c(1, 10)),
levels = 5
)
#> # A tibble: 25 x 2
#> trees mtry
#> <int> <int>
#> 1 50 1
#> 2 537 1
#> 3 1025 1
#> 4 1512 1
#> 5 2000 1
#> 6 50 3
#> 7 537 3
#> 8 1025 3
#> 9 1512 3
#> 10 2000 3
#> # ... with 15 more rows
# or, if you want, you can specify a different number of levels
# for each tuning parameter by supplying a vector
grid_regular(
trees(range = c(50, 2000)),
mtry(range = c(1, 10)),
levels = c(3, 2)
)
#> # A tibble: 6 x 2
#> trees mtry
#> <int> <int>
#> 1 50 1
#> 2 1025 1
#> 3 2000 1
#> 4 50 10
#> 5 1025 10
#> 6 2000 10
Created on 2021-10-18 by the reprex package (v2.0.1)
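Applied back to the grid in the question, a minimal sketch of the same idea might look like this (the parameter ranges are the ones from the question; levels = 5 is only an illustrative choice and yields 5 x 5 x 5 = 125 candidate combinations):

# the original ranges, now with an explicit number of levels per parameter
prophet_grid <- grid_regular(
  changepoint_num(range = c(1L, 45L)),
  changepoint_range(range = c(0.5, 0.9)),
  trees(range = c(50L, 2000L)),
  levels = 5  # 5 values per parameter -> 125 combinations to evaluate
)

Note that the commented-out size argument in the question is an argument of the non-regular grids such as grid_random() or grid_latin_hypercube(), where you request a total number of candidates instead of a number of levels per parameter; it does not apply to grid_regular().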