如何使用 tidymodels 掌握自动时间序列参数调优?

How to master automated time series parameter tuning using tidymodels?

我以前一直使用经典的时间序列分析方法,所以对参数调优还比较陌生。由于对所有局部模型(我这里有数百个产品需求的时间序列)逐一调优根本不具备可扩展性,我想先分析对准确度较低的那些时间序列进行调优的效果,以评估可扩展性与准确性之间的权衡,从而判断针对特定时间序列问题进行调优是否合理。当我运行下面的代码时,似乎没有正确指定常规网格(regular grid)的范围。我之所以这么认为,是因为在 50 到 2000 的范围内,树的数量(trees)只取到了三个值,这看起来很奇怪。这是标准行为吗?在这种情况下,更改 levels 参数是否有帮助?在我这里它没有带来任何变化。另外,有没有办法确定最佳的重采样折数,而不是靠猜测?希望能得到一些建议或有用的示例。

提前致谢!

# data and libs
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(timetk))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
suppressPackageStartupMessages(library(tictoc))
suppressPackageStartupMessages(library(readxl))

# Monthly timestamps: 60 consecutive months starting at 2016-01-01.
dates <- ymd("2016-01-01") + months(0:59)

# Synthetic demand values, one per month (12 values per row).
fake_values <- c(
  296, 325, 339, 812, 723, 310, 842, 500, 555, 260, 243, 306,
  204, 330, 467, 713, 1054, 827, 75, 437, 558, 222, 350, 139,
  306, 395, 472, 741, 1020, 903, 837, 738, 676, 506, 199, 219,
  342, 406, 417, 977, 1503, 117, 942, 843, 716, 378, 267, 392,
  329, 369, 536, 1168, 1260, 1066, 949, 906, 1744, 2495, 418, 447
)

# Name the columns at construction time instead of binding unnamed vectors
# and renaming the auto-generated `...1` / `...2` columns afterwards.
# Column order (y, ds) is kept as before.
df <- tibble(y = fake_values, ds = dates)



# Time-ordered train/test split (prop = 0.8 of the rows go to training)
data_splits <- df %>% initial_time_split(prop = 0.8)
data_train <- data_splits %>% training()
data_test <- data_splits %>% testing()

# Visualise a single split of the full series with a 1-year assessment window
split_obj <- df %>%
  time_series_split(assess = "1 year", cumulative = TRUE)

plot_time_series_cv_plan(tk_time_series_cv_plan(split_obj), ds, y)

# Resampling plan used for tuning, built from the training data only:
# 36-month initial window, 12-month assessment, 12-month skip, cumulative.
resampling_strategy <- time_series_cv(
  data_train,
  initial = "36 months",
  assess = "12 months",
  skip = "12 months",
  cumulative = TRUE
)


# Model specification: prophet_boost with three hyperparameters marked for
# tuning (changepoint_num, changepoint_range, trees), "prophet_xgboost" engine.
prophet_boost_model <- set_engine(
  prophet_boost(
    mode = "regression",
    growth = "linear",
    changepoint_num = tune(),
    changepoint_range = tune(),
    trees = tune()
  ),
  "prophet_xgboost"
)

# Regular grid over the three tuned parameters.
# grid_regular() crosses `levels` values per parameter. With the default of
# 3 levels (spelled out below) that is 3 x 3 x 3 = 27 candidates, so trees
# takes only three distinct values across 50-2000. For a finer grid, raise
# `levels` (one number for all parameters, or a vector with one entry each).
# NOTE(review): `size` is not a grid_regular() argument per the dials docs;
# it belongs to the random / space-filling grid functions.
prophet_grid <- grid_regular(
  changepoint_num(range = c(1L, 45L)),
  changepoint_range(range = c(0.5, 0.9)),
  trees(range = c(50, 2000)),
  levels = 3
)

# Preprocessing recipe: model y as a function of the date column ds
basic_rec <- recipe(y ~ ds, data = data_train)

# Workflow bundling the tunable model spec with the recipe
wflw_spec_tune_prophet <- add_recipe(
  add_model(workflow(), prophet_boost_model),
  basic_rec
)

# Parallel backend: a 4-worker PSOCK cluster registered for foreach.
# cores <- parallel::detectCores(logical = FALSE)
library(doParallel)
cl <- makePSOCKcluster(4)
registerDoParallel(cl)

# Automated tuning: evaluate every row of `prophet_grid` on each resample and
# collect RMSE and MAE. tic()/toc() time the whole run.
tic()
tune_results <- tryCatch(
  wflw_spec_tune_prophet %>%
    tune_grid(
      resamples = resampling_strategy,
      grid = prophet_grid,
      metrics = metric_set(rmse, mae)
    ),
  finally = {
    # Runs whether tuning succeeds or fails, so the workers are never leaked.
    toc()
    stopCluster(cl)
    # Switch foreach back to sequential so later code does not try to use
    # the cluster that was just stopped.
    foreach::registerDoSEQ()
  }
)

# Keep the 20 best candidates ranked by RMSE (this only collects the ranking
# table; no model is finalised or written to disk here).
best_results <- show_best(tune_results, metric = "rmse", n = 20)
best_results

# Mean RMSE of each listed candidate
best_results %>% pull(mean)

你说得对!修改 grid_regular() 的 levels 参数,就可以增加在指定范围内尝试的参数取值个数。下面是一些示例,希望对你有所帮助!

library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip

# `levels` defaults to 3 for each tuned parameter, so two parameters
# produce a 3 x 3 = 9-row grid
grid_regular(
  trees(range = c(50, 2000)),
  mtry(range = c(1, 10))
)
#> # A tibble: 9 x 2
#>   trees  mtry
#>   <int> <int>
#> 1    50     1
#> 2  1025     1
#> 3  2000     1
#> 4    50     5
#> 5  1025     5
#> 6  2000     5
#> 7    50    10
#> 8  1025    10
#> 9  2000    10

# a single `levels` value is applied to every parameter:
# 5 values each gives a 5 x 5 = 25-row grid
grid_regular(
  trees(range = c(50, 2000)),
  mtry(range = c(1, 10)),
  levels = 5
)
#> # A tibble: 25 x 2
#>    trees  mtry
#>    <int> <int>
#>  1    50     1
#>  2   537     1
#>  3  1025     1
#>  4  1512     1
#>  5  2000     1
#>  6    50     3
#>  7   537     3
#>  8  1025     3
#>  9  1512     3
#> 10  2000     3
#> # ... with 15 more rows

# `levels` can also be a vector with one entry per parameter, in the order
# the parameters are given: 3 values of trees x 2 values of mtry = 6 rows
grid_regular(
  trees(range = c(50, 2000)),
  mtry(range = c(1, 10)),
  levels = c(3, 2)
)
#> # A tibble: 6 x 2
#>   trees  mtry
#>   <int> <int>
#> 1    50     1
#> 2  1025     1
#> 3  2000     1
#> 4    50    10
#> 5  1025    10
#> 6  2000    10

由 reprex 包 (v2.0.1) 于 2021-10-18 创建