在 R 中构建预测模型时,使用 step_normalize 函数 (tidymodels) 是否需要以相同方式处理测试数据?
In building a predictive model in R, does using step_normalize function (tidymodels) require test data to be processed in the same way?
如果这里不适合问这个问题,我很抱歉。我对 tidymodels 的一部分功能感到困惑。如果我有一个数据集(下面示例中的 ion_train),并在构建 SVM 的配方(recipe)中对所有预测变量应用 step_normalize 函数,它会标准化数值数据,使其标准差为 1,平均值为 0。这是否意味着当我将 SVM 应用于测试数据集(下面示例中的 ion_test)时,我需要先对测试数据集中的预测变量进行同样的标准化,然后才能使用 predict() 函数(在下面代码的底部)?
library(tidymodels)
library(mlbench)
data(Ionosphere)

# Preprocess the dataset: drop columns V1 and V2 before modelling.
Ionosphere <- Ionosphere %>% select(-V1, -V2)

# Split into training (3/5 of the rows) and test data.
# The split is random, so seed the RNG first to make it reproducible.
set.seed(123)
ion_split <- initial_split(Ionosphere, prop = 3 / 5)
ion_train <- training(ion_split)
ion_test <- testing(ion_split)

# Recipe: model Class on all other columns and normalize every predictor.
# The recipe is declared against the training data only.
iono_rec <-
  recipe(Class ~ ., data = ion_train) %>%
  step_normalize(all_predictors())

# Model specification: RBF-kernel SVM classifier (kernlab engine) whose
# cost and rbf_sigma are left as tuning placeholders.
svm_mod <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_mode("classification") %>%
  set_engine("kernlab")

# Bundle preprocessing and model so they are always applied together.
svm_workflow <-
  workflow() %>%
  add_recipe(iono_rec) %>%
  add_model(svm_mod)

# Run model tuning on bootstrap resamples of the training set, scoring
# candidates by ROC AUC. Seeded so the resamples are reproducible.
set.seed(35)
recipe_res <-
  svm_workflow %>%
  tune_grid(
    resamples = bootstraps(ion_train, times = 2),
    metrics = metric_set(roc_auc),
    control = control_grid(verbose = TRUE, save_pred = TRUE)
  )

# Choose the best model and finalise the workflow.
# `metric` is passed by name: recent tune releases reject a positional metric.
best_mod <- recipe_res %>% select_best(metric = "roc_auc")
final_wf <- finalize_workflow(svm_workflow, best_mod)
final_mod <- final_wf %>% fit(ion_train)

# Predict class probabilities for the held-out data. ion_test is passed as-is;
# the fitted workflow handles the recipe's preprocessing itself.
predict_res <- predict(
  final_mod,
  ion_test,
  type = "prob"
)
您可能想通读关于使用 recipe 进行数据预处理和特征工程的这一章。recipe 的思路是:从训练集中估计统计量,然后将相同的预处理应用于任何其他数据,例如测试集或预测时的新数据。
让我们来看看。
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
library(mlbench)
data(Ionosphere)

# Preprocess the dataset: remove columns V1 and V2.
Ionosphere <- select(Ionosphere, -V1, -V2)

# Randomly assign 3/5 of the rows to training; the remainder is the test set.
ion_split <- initial_split(Ionosphere, prop = 3/5)
ion_train <- training(ion_split)
ion_test <- testing(ion_split)

# Recipe: model Class on every other column and normalize all predictors.
# This only declares the preprocessing; prep() is what estimates it.
iono_rec <- step_normalize(
  recipe(Class ~ ., data = ion_train),
  all_predictors()
)
函数 prep() 负责从训练集中计算/估计统计量。您可以使用 bake() 得到处理结果;当您使用 new_data = NULL 时,得到的是训练数据的处理结果,也就是最初用来估计统计量(在本例中为均值和标准差)的那份数据。
# Estimate the recipe on its training data, then return that same training
# data after preprocessing (new_data = NULL).
bake(prep(iono_rec), new_data = NULL)
#> # A tibble: 211 x 33
#> V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.698 -0.163 0.452 -0.227 0.527 -0.912 0.908 -0.298 0.587 -0.700
#> 2 0.708 -0.477 0.604 -1.10 -1.41 -2.00 0.908 -0.477 -0.0234 -1.74
#> 3 0.708 -0.102 0.739 -0.268 0.869 -0.414 0.676 -0.353 0.371 -0.221
#> 4 0.708 -1.12 0.739 1.99 0.276 -2.12 -1.20 -0.379 -0.927 -0.332
#> 5 -1.41 -0.0342 -1.40 -0.551 -1.21 -0.410 -0.890 -0.236 -0.859 -0.463
#> 6 0.656 -0.277 0.634 -0.753 0.721 -0.730 0.613 -0.968 0.489 -1.33
#> 7 -1.46 -0.0198 -1.21 -0.279 0.869 -2.12 -1.20 -0.379 -2.70 -2.41
#> 8 0.708 -1.34 0.739 -2.55 0.869 -2.12 0.908 0.401 0.849 -1.18
#> 9 0.708 0.159 0.739 -0.202 0.869 -0.288 0.908 -0.190 0.849 0.0754
#> 10 -0.356 -2.30 0.739 0.328 -1.26 -2.12 0.908 -2.53 -0.151 -2.41
#> # … with 201 more rows, and 23 more variables: V13 <dbl>, V14 <dbl>, V15 <dbl>,
#> # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
#> # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
#> # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>,
#> # V34 <dbl>, Class <fct>
不过,您也可以对其他数据(例如测试数据或新数据)使用 bake()。这会把从训练集估计出的均值和标准差(在本例中供 step_normalize() 使用)应用到这些其他数据集上。这是有意为之,目的是防止数据泄漏:您应当把训练集中的统计量应用到其他数据(例如新数据或测试数据)上,而不是用这些数据重新估计。
# Estimate the recipe on its training data, then apply that preprocessing
# to the held-out test set.
bake(prep(iono_rec), new_data = ion_test)
#> # A tibble: 140 x 33
#> V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.708 -0.0781 0.625 -0.131 0.706 -0.632 0.427 -0.732 0.0107 -0.753
#> 2 0.629 -0.195 0.739 -0.605 0.869 -0.594 0.908 -1.16 0.717 -1.24
#> 3 -1.50 -0.225 -1.21 -0.279 -1.19 -0.180 -0.958 -0.956 -1.74 -1.12
#> 4 0.708 0.142 0.739 -0.698 0.869 -0.710 0.908 -1.31 0.849 -1.19
#> 5 0.708 -0.416 0.739 -0.511 0.869 -0.475 0.908 -0.794 0.743 -1.06
#> 6 0.708 -2.13 0.739 0.227 0.570 -0.955 0.908 -0.639 0.849 0.397
#> 7 -1.46 -0.0198 -3.16 -2.55 0.869 1.76 -3.31 1.77 -2.70 1.74
#> 8 0.708 2.41 -1.21 -0.279 -1.19 -0.180 -3.31 -2.53 -0.927 -0.332
#> 9 0.708 -0.231 0.739 -0.672 0.594 -1.77 0.799 0.936 0.768 -1.19
#> 10 0.708 0.184 0.739 0.116 0.869 -0.438 0.870 1.01 0.849 0.661
#> # … with 130 more rows, and 23 more variables: V13 <dbl>, V14 <dbl>, V15 <dbl>,
#> # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
#> # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
#> # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>,
#> # V34 <dbl>, Class <fct>
由 reprex 包 (v2.0.0) 于 2021-04-14 创建
现在,您的示例是把 recipe 放进了一个 workflow 中。当您使用 workflow() 时,有一些高层函数会自动处理这些阶段:fit() 会在训练数据上估计 recipe,predict() 会把同样的预处理应用到新数据上。因此您不必手动使用 prep() 和 bake(),也不需要在调用 predict() 之前自己对测试集做标准化。您可以在此处阅读有关使用模型 workflow() 的更多细节。
如果这里不适合问这个问题,我很抱歉。我对 tidymodels 的一部分功能感到困惑。如果我有一个数据集(下面示例中的 ion_train),并在构建 SVM 的配方(recipe)中对所有预测变量应用 step_normalize 函数,它会标准化数值数据,使其标准差为 1,平均值为 0。这是否意味着当我将 SVM 应用于测试数据集(下面示例中的 ion_test)时,我需要先对测试数据集中的预测变量进行同样的标准化,然后才能使用 predict() 函数(在下面代码的底部)?
library(tidymodels)
library(mlbench)
data(Ionosphere)

# Preprocess the dataset: drop columns V1 and V2 before modelling.
Ionosphere <- Ionosphere %>% select(-V1, -V2)

# Split into training (3/5 of the rows) and test data.
# The split is random, so seed the RNG first to make it reproducible.
set.seed(123)
ion_split <- initial_split(Ionosphere, prop = 3 / 5)
ion_train <- training(ion_split)
ion_test <- testing(ion_split)

# Recipe: model Class on all other columns and normalize every predictor.
# The recipe is declared against the training data only.
iono_rec <-
  recipe(Class ~ ., data = ion_train) %>%
  step_normalize(all_predictors())

# Model specification: RBF-kernel SVM classifier (kernlab engine) whose
# cost and rbf_sigma are left as tuning placeholders.
svm_mod <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_mode("classification") %>%
  set_engine("kernlab")

# Bundle preprocessing and model so they are always applied together.
svm_workflow <-
  workflow() %>%
  add_recipe(iono_rec) %>%
  add_model(svm_mod)

# Run model tuning on bootstrap resamples of the training set, scoring
# candidates by ROC AUC. Seeded so the resamples are reproducible.
set.seed(35)
recipe_res <-
  svm_workflow %>%
  tune_grid(
    resamples = bootstraps(ion_train, times = 2),
    metrics = metric_set(roc_auc),
    control = control_grid(verbose = TRUE, save_pred = TRUE)
  )

# Choose the best model and finalise the workflow.
# `metric` is passed by name: recent tune releases reject a positional metric.
best_mod <- recipe_res %>% select_best(metric = "roc_auc")
final_wf <- finalize_workflow(svm_workflow, best_mod)
final_mod <- final_wf %>% fit(ion_train)

# Predict class probabilities for the held-out data. ion_test is passed as-is;
# the fitted workflow handles the recipe's preprocessing itself.
predict_res <- predict(
  final_mod,
  ion_test,
  type = "prob"
)
您可能想通读关于使用 recipe 进行数据预处理和特征工程的这一章。recipe 的思路是:从训练集中估计统计量,然后将相同的预处理应用于任何其他数据,例如测试集或预测时的新数据。
让我们来看看。
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
library(mlbench)
data(Ionosphere)

# Preprocess the dataset: remove columns V1 and V2.
Ionosphere <- select(Ionosphere, -V1, -V2)

# Randomly assign 3/5 of the rows to training; the remainder is the test set.
ion_split <- initial_split(Ionosphere, prop = 3/5)
ion_train <- training(ion_split)
ion_test <- testing(ion_split)

# Recipe: model Class on every other column and normalize all predictors.
# This only declares the preprocessing; prep() is what estimates it.
iono_rec <- step_normalize(
  recipe(Class ~ ., data = ion_train),
  all_predictors()
)
函数 prep() 负责从训练集中计算/估计统计量。您可以使用 bake() 得到处理结果;当您使用 new_data = NULL 时,得到的是训练数据的处理结果,也就是最初用来估计统计量(在本例中为均值和标准差)的那份数据。
# Estimate the recipe on its training data, then return that same training
# data after preprocessing (new_data = NULL).
bake(prep(iono_rec), new_data = NULL)
#> # A tibble: 211 x 33
#> V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.698 -0.163 0.452 -0.227 0.527 -0.912 0.908 -0.298 0.587 -0.700
#> 2 0.708 -0.477 0.604 -1.10 -1.41 -2.00 0.908 -0.477 -0.0234 -1.74
#> 3 0.708 -0.102 0.739 -0.268 0.869 -0.414 0.676 -0.353 0.371 -0.221
#> 4 0.708 -1.12 0.739 1.99 0.276 -2.12 -1.20 -0.379 -0.927 -0.332
#> 5 -1.41 -0.0342 -1.40 -0.551 -1.21 -0.410 -0.890 -0.236 -0.859 -0.463
#> 6 0.656 -0.277 0.634 -0.753 0.721 -0.730 0.613 -0.968 0.489 -1.33
#> 7 -1.46 -0.0198 -1.21 -0.279 0.869 -2.12 -1.20 -0.379 -2.70 -2.41
#> 8 0.708 -1.34 0.739 -2.55 0.869 -2.12 0.908 0.401 0.849 -1.18
#> 9 0.708 0.159 0.739 -0.202 0.869 -0.288 0.908 -0.190 0.849 0.0754
#> 10 -0.356 -2.30 0.739 0.328 -1.26 -2.12 0.908 -2.53 -0.151 -2.41
#> # … with 201 more rows, and 23 more variables: V13 <dbl>, V14 <dbl>, V15 <dbl>,
#> # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
#> # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
#> # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>,
#> # V34 <dbl>, Class <fct>
不过,您也可以对其他数据(例如测试数据或新数据)使用 bake()。这会把从训练集估计出的均值和标准差(在本例中供 step_normalize() 使用)应用到这些其他数据集上。这是有意为之,目的是防止数据泄漏:您应当把训练集中的统计量应用到其他数据(例如新数据或测试数据)上,而不是用这些数据重新估计。
# Estimate the recipe on its training data, then apply that preprocessing
# to the held-out test set.
bake(prep(iono_rec), new_data = ion_test)
#> # A tibble: 140 x 33
#> V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.708 -0.0781 0.625 -0.131 0.706 -0.632 0.427 -0.732 0.0107 -0.753
#> 2 0.629 -0.195 0.739 -0.605 0.869 -0.594 0.908 -1.16 0.717 -1.24
#> 3 -1.50 -0.225 -1.21 -0.279 -1.19 -0.180 -0.958 -0.956 -1.74 -1.12
#> 4 0.708 0.142 0.739 -0.698 0.869 -0.710 0.908 -1.31 0.849 -1.19
#> 5 0.708 -0.416 0.739 -0.511 0.869 -0.475 0.908 -0.794 0.743 -1.06
#> 6 0.708 -2.13 0.739 0.227 0.570 -0.955 0.908 -0.639 0.849 0.397
#> 7 -1.46 -0.0198 -3.16 -2.55 0.869 1.76 -3.31 1.77 -2.70 1.74
#> 8 0.708 2.41 -1.21 -0.279 -1.19 -0.180 -3.31 -2.53 -0.927 -0.332
#> 9 0.708 -0.231 0.739 -0.672 0.594 -1.77 0.799 0.936 0.768 -1.19
#> 10 0.708 0.184 0.739 0.116 0.869 -0.438 0.870 1.01 0.849 0.661
#> # … with 130 more rows, and 23 more variables: V13 <dbl>, V14 <dbl>, V15 <dbl>,
#> # V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>, V21 <dbl>,
#> # V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>,
#> # V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>, V33 <dbl>,
#> # V34 <dbl>, Class <fct>
由 reprex 包 (v2.0.0) 于 2021-04-14 创建
现在,您的示例是把 recipe 放进了一个 workflow 中。当您使用 workflow() 时,有一些高层函数会自动处理这些阶段:fit() 会在训练数据上估计 recipe,predict() 会把同样的预处理应用到新数据上。因此您不必手动使用 prep() 和 bake(),也不需要在调用 predict() 之前自己对测试集做标准化。您可以在此处阅读有关使用模型 workflow() 的更多细节。