如何从模型时间递归集合中提取置信区间?
How to extract confidence intervals from modeltime recursive ensembles?
由于我想对模型时间框架之外的预测数据进行一些可视化和分析,我需要提取置信度值、拟合值,也许还有残差。
文档指出,我需要使用函数 modeltime_calibrate() 来获取置信度值和残差。所以一个问题是,我从哪里提取拟合值?
我的主要问题是,如何对递归系综进行校准。对于任何非集成模型,我都能做到,但在递归集成的情况下,如果我想校准,我会遇到一些错误消息。
为了说明问题,请看下面的示例代码,它最终无法校准所有模型:
library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)
FORECAST_HORIZON <- 24
m4_extended <- m4_monthly %>%
group_by(id) %>%
future_frame(
.length_out = FORECAST_HORIZON,
.bind_data = TRUE
) %>%
ungroup()
lag_transformer_grouped <- function(data){
data %>%
group_by(id) %>%
tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
ungroup()
}
m4_lags <- m4_extended %>%
lag_transformer_grouped()
test_data <- m4_lags %>%
group_by(id) %>%
slice_tail(n = 12) %>%
ungroup()
train_data <- m4_lags %>%
drop_na()
future_data <- m4_lags %>%
filter(is.na(value))
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet") %>%
fit(value ~ ., data = train_data)
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost") %>%
fit(value ~ ., data = train_data)
recursive_ensemble_panel <- modeltime_table(
model_fit_glmnet,
model_fit_xgboost
) %>%
ensemble_weighted(loadings = c(4, 6)) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
model_tbl <- modeltime_table(
recursive_ensemble_panel
)
calibrated_mod <- model_tbl %>%
modeltime_calibrate(test_data, id = "id", quiet = FALSE)
model_tbl %>%
modeltime_forecast(
new_data = future_data,
actual_data = m4_lags,
keep_data = TRUE
) %>%
group_by(id) %>%
plot_modeltime_forecast(
.interactive = FALSE,
.conf_interval_show = TRUE,
.facet_ncol = 2
)
问题出在你的recursive_ensemble_panel
。您必须对模型本身而不是整体执行递归部分。像你一样,我希望一次性完成递归,也许通过 modeltime_table
.
# start of changes to your code.
# added recursive to the model
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet") %>%
fit(value ~ ., data = train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# added recursive to the model
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost") %>%
fit(value ~ ., data = train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# removed recursive part
recursive_ensemble_panel <- modeltime_table(
model_fit_glmnet,
model_fit_xgboost
) %>%
ensemble_weighted(loadings = c(4, 6))
# rest of your code
我不得不做一些实验来找到提取我需要的东西的正确方法(置信区间和残差)。
正如您从下面的示例代码中看到的那样,需要更改模型工作流程才能实现此目的。递归需要出现在工作流对象定义中,既不在模型中也不在整体中 fit/specification。
我仍然需要在这里做一些测试,但我想,我得到了我需要知道的信息:
# Time Series ML
library(tidymodels)
library(modeltime)
library(modeltime.ensemble)
# Core
library(tidyverse)
library(timetk)
# data def
FORECAST_HORIZON <- 24
lag_transformer_grouped <- function(m750){
m750 %>%
group_by(id) %>%
tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
ungroup()
}
m750_lags <- m750 %>%
lag_transformer_grouped()
test_data <- m750_lags %>%
group_by(id) %>%
slice_tail(n = 12) %>%
ungroup()
train_data <- m750_lags %>%
drop_na()
future_data <- m750_lags %>%
filter(is.na(value))
# rec
recipe_spec <- recipe(value ~ date, train_data) %>%
step_timeseries_signature(date) %>%
step_rm(matches("(.iso$)|(.xts$)")) %>%
step_normalize(matches("(index.num$)|(_year$)")) %>%
step_dummy(all_nominal()) %>%
step_fourier(date, K = 1, period = 12)
recipe_spec %>% prep() %>% juice()
# elnet
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet")
wflw_fit_glmnet <- workflow() %>%
add_model(model_fit_glmnet) %>%
add_recipe(recipe_spec %>% step_rm(date)) %>%
fit(train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# xgboost
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost")
wflw_fit_xgboost <- workflow() %>%
add_model(model_fit_xgboost) %>%
add_recipe(recipe_spec %>% step_rm(date)) %>%
fit(train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# mtbl
m750_models <- modeltime_table(
wflw_fit_xgboost,
wflw_fit_glmnet
)
# mfit
ensemble_fit <- m750_models %>%
ensemble_average(type = "mean")
# mcalib
calibration_tbl <- modeltime_table(
ensemble_fit
) %>%
modeltime_calibrate(test_data)
# residuals
calib_out <- calibration_tbl$.calibration_data[[1]] %>%
left_join(test_data %>% select(id, date, value))
# Forecast ex post
ex_post_obj <-
calibration_tbl %>%
modeltime_forecast(
new_data = test_data,
actual_data = m750
)
# Forecast ex ante
data_prepared_tbl <- bind_rows(train_data, test_data)
future_tbl <- data_prepared_tbl %>%
group_by(id) %>%
future_frame(.length_out = "2 years") %>%
ungroup()
ex_ante_obj <-
calibration_tbl %>%
modeltime_forecast(
new_data = future_tbl,
actual_data = m750
)
由于我想对模型时间框架之外的预测数据进行一些可视化和分析,我需要提取置信度值、拟合值,也许还有残差。
文档指出,我需要使用函数 modeltime_calibrate() 来获取置信度值和残差。所以一个问题是,我从哪里提取拟合值?
我的主要问题是,如何对递归系综进行校准。对于任何非集成模型,我都能做到,但在递归集成的情况下,如果我想校准,我会遇到一些错误消息。
为了说明问题,请看下面的示例代码,它最终无法校准所有模型:
library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)
FORECAST_HORIZON <- 24
m4_extended <- m4_monthly %>%
group_by(id) %>%
future_frame(
.length_out = FORECAST_HORIZON,
.bind_data = TRUE
) %>%
ungroup()
lag_transformer_grouped <- function(data){
data %>%
group_by(id) %>%
tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
ungroup()
}
m4_lags <- m4_extended %>%
lag_transformer_grouped()
test_data <- m4_lags %>%
group_by(id) %>%
slice_tail(n = 12) %>%
ungroup()
train_data <- m4_lags %>%
drop_na()
future_data <- m4_lags %>%
filter(is.na(value))
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet") %>%
fit(value ~ ., data = train_data)
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost") %>%
fit(value ~ ., data = train_data)
recursive_ensemble_panel <- modeltime_table(
model_fit_glmnet,
model_fit_xgboost
) %>%
ensemble_weighted(loadings = c(4, 6)) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
model_tbl <- modeltime_table(
recursive_ensemble_panel
)
calibrated_mod <- model_tbl %>%
modeltime_calibrate(test_data, id = "id", quiet = FALSE)
model_tbl %>%
modeltime_forecast(
new_data = future_data,
actual_data = m4_lags,
keep_data = TRUE
) %>%
group_by(id) %>%
plot_modeltime_forecast(
.interactive = FALSE,
.conf_interval_show = TRUE,
.facet_ncol = 2
)
问题出在你的recursive_ensemble_panel
。您必须对模型本身而不是整体执行递归部分。像你一样,我希望一次性完成递归,也许通过 modeltime_table
.
# start of changes to your code.
# added recursive to the model
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet") %>%
fit(value ~ ., data = train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# added recursive to the model
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost") %>%
fit(value ~ ., data = train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# removed recursive part
recursive_ensemble_panel <- modeltime_table(
model_fit_glmnet,
model_fit_xgboost
) %>%
ensemble_weighted(loadings = c(4, 6))
# rest of your code
我不得不做一些实验来找到提取我需要的东西的正确方法(置信区间和残差)。
正如您从下面的示例代码中看到的那样,需要更改模型工作流程才能实现此目的。递归需要出现在工作流对象定义中,既不在模型中也不在整体中 fit/specification。
我仍然需要在这里做一些测试,但我想,我得到了我需要知道的信息:
# Time Series ML
library(tidymodels)
library(modeltime)
library(modeltime.ensemble)
# Core
library(tidyverse)
library(timetk)
# data def
FORECAST_HORIZON <- 24
lag_transformer_grouped <- function(m750){
m750 %>%
group_by(id) %>%
tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
ungroup()
}
m750_lags <- m750 %>%
lag_transformer_grouped()
test_data <- m750_lags %>%
group_by(id) %>%
slice_tail(n = 12) %>%
ungroup()
train_data <- m750_lags %>%
drop_na()
future_data <- m750_lags %>%
filter(is.na(value))
# rec
recipe_spec <- recipe(value ~ date, train_data) %>%
step_timeseries_signature(date) %>%
step_rm(matches("(.iso$)|(.xts$)")) %>%
step_normalize(matches("(index.num$)|(_year$)")) %>%
step_dummy(all_nominal()) %>%
step_fourier(date, K = 1, period = 12)
recipe_spec %>% prep() %>% juice()
# elnet
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet")
wflw_fit_glmnet <- workflow() %>%
add_model(model_fit_glmnet) %>%
add_recipe(recipe_spec %>% step_rm(date)) %>%
fit(train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# xgboost
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost")
wflw_fit_xgboost <- workflow() %>%
add_model(model_fit_xgboost) %>%
add_recipe(recipe_spec %>% step_rm(date)) %>%
fit(train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# mtbl
m750_models <- modeltime_table(
wflw_fit_xgboost,
wflw_fit_glmnet
)
# mfit
ensemble_fit <- m750_models %>%
ensemble_average(type = "mean")
# mcalib
calibration_tbl <- modeltime_table(
ensemble_fit
) %>%
modeltime_calibrate(test_data)
# residuals
calib_out <- calibration_tbl$.calibration_data[[1]] %>%
left_join(test_data %>% select(id, date, value))
# Forecast ex post
ex_post_obj <-
calibration_tbl %>%
modeltime_forecast(
new_data = test_data,
actual_data = m750
)
# Forecast ex ante
data_prepared_tbl <- bind_rows(train_data, test_data)
future_tbl <- data_prepared_tbl %>%
group_by(id) %>%
future_frame(.length_out = "2 years") %>%
ungroup()
ex_ante_obj <-
calibration_tbl %>%
modeltime_forecast(
new_data = future_tbl,
actual_data = m750
)