How to fix this error: recipes fail to load in caret::train?

How to fix this error: recipes fail to load in caret::train?

我在将 recipe(recipes 包创建的预处理对象)传入 caret::train 时遇到了这个问题。

NA 插补似乎有问题,但我不知道如何解决。如果去掉交叉验证(cross-validation),一切正常。

提前致谢,

# Reproduction script from the question: the train() call at the end is the
# one reported to fail with the quantile.default() error quoted below.
# NOTE(review): no library() calls are shown in this snippet; the functions
# used appear to come from rsample, recipes and caret -- confirm.

# Load the built-in airquality data and keep 70% of the rows for training.
data(airquality)
set.seed(33) # for reproducibility
air_split <- initial_split(airquality, prop = 0.7) 
air_train <- training(air_split)
air_test <- testing(air_split)

# Feature engineering - final recipe
# Ozone is the outcome; every other column is declared as a predictor.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>% 
  # Filter out zero-variance, then near-zero-variance, predictors.
  step_zv(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  # all_numeric() is not restricted to predictors, so this selector presumably
  # also covers the outcome Ozone -- TODO confirm.
  step_knnimpute(all_numeric(), neighbors = 6) %>% 
  # Log-transform the outcome (Ozone) and one predictor (Wind).
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  # Dummy-encode nominal columns, explicitly excluding the outcome.
  step_dummy(all_nominal(), -all_outcomes())


# Validation
# 5-fold cross-validation, repeated 5 times.
cv5 <- trainControl( method = "repeatedcv", 
                     number = 5,
                     repeats = 5, allowParallel = TRUE)

# Fit an lm model
# The unprepped recipe is handed straight to train(); the question reports
# that the same call works once cross-validation is removed.
set.seed(12) 
lm_fit <- train(
  air_recipe,
  data = air_train, 
  method = "lm", 
  trControl = cv5, 
  metric = "RMSE")

错误信息

Error in quantile.default(y, probs = seq(0, 1, length = cuts)) : missing values and NaN's not allowed if 'na.rm' is FALSE

R.version
               _
platform       x86_64-apple-darwin15.6.0
arch           x86_64
os             darwin15.6.0
system         x86_64, darwin15.6.0
status
major          3
minor          6.1
year           2019
month          07
day            05
svn rev        76782
language       R
version.string R version 3.6.1 (2019-07-05)
nickname       Action of the Toes

看起来重采样是在 recipe 的 NA 插补生效之前完成的(报错来自对仍含缺失值的结果变量 y 计算分位数)。

因此,您可以先对 recipe 执行 prep() 和 juice(),然后使用 train() 的公式(formula)方法:

library(recipes)
library(caret)
library(rsample)

# --- Data -------------------------------------------------------------------
# 70/30 train/test split of the built-in airquality data set.
data(airquality)
set.seed(33) # makes the random split repeatable
air_split <- initial_split(airquality, prop = 0.7)
air_train <- training(air_split)
air_test <- testing(air_split)

# --- Preprocessing ----------------------------------------------------------
# Ozone is the outcome; all remaining columns are predictors. Compared with
# the recipe in the question, a trailing step_naomit() is added on the
# outcome and the predictors.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_naomit(all_outcomes(), all_predictors())

# Estimate the recipe once, outside of train(), and retain the processed
# training data inside the prepped object.
air_prep <- air_recipe %>% prep(retain = TRUE)

# Overwrite the raw training set with its preprocessed version so that the
# formula interface of train() can be used below.
air_train <- air_prep %>% juice()

# --- Resampling scheme ------------------------------------------------------
# 5-fold cross-validation, repeated 5 times.
cv5 <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  allowParallel = TRUE
)

# --- Model ------------------------------------------------------------------
# Linear regression on the already-preprocessed data; the seed is set
# immediately before resampling starts.
set.seed(12)
lm_fit <- train(
  Ozone ~ .,
  data = air_train,
  method = "lm",
  trControl = cv5,
  metric = "RMSE"
)

lm_fit
#> Linear Regression 
#> 
#> 108 samples
#>   5 predictor
#> 
#> No pre-processing
#> Resampling: Cross-Validated (5 fold, repeated 5 times) 
#> Summary of sample sizes: 86, 88, 86, 86, 86, 86, ... 
#> Resampling results:
#> 
#>   RMSE       Rsquared   MAE      
#>   0.5091496  0.6568485  0.3793589
#> 
#> Tuning parameter 'intercept' was held constant at a value of TRUE

或者,您可以使用 {parsnip} 和 {tune},让整个流程都保持 tidymodels 的惯用写法:

library(recipes)
library(rsample)
library(parsnip)
library(tune)
library(yardstick)

# --- Data -------------------------------------------------------------------
# 70/30 train/test split of the built-in airquality data set.
data(airquality)
set.seed(33) # makes the random split repeatable
air_split <- initial_split(airquality, prop = 0.7)
air_train <- training(air_split)
air_test <- testing(air_split)

# --- Preprocessing ----------------------------------------------------------
# The recipe is left unprepped here: it is handed to fit_resamples() below
# together with the resampling object.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_naomit(all_outcomes(), all_predictors())

# --- Resampling scheme ------------------------------------------------------
# 5-fold cross-validation, repeated 5 times, on the raw training rows.
air_cv <- vfold_cv(air_train, v = 5, repeats = 5)

# --- Model specification ----------------------------------------------------
# Linear regression fitted with the "lm" engine.
lm_mod <- set_engine(linear_reg(), "lm")

# --- Fit across resamples and summarise -------------------------------------
lm_fits <- fit_resamples(air_recipe, lm_mod, air_cv)

# RMSE is an error measure, so the best result is the smallest one.
show_best(lm_fits, metric = "rmse", maximize = FALSE)
#> # A tibble: 1 x 5
#>   .metric .estimator  mean     n std_err
#>   <chr>   <chr>      <dbl> <int>   <dbl>
#> 1 rmse    standard   0.526    25  0.0256

由 reprex package (v0.3.0) 于 2020-04-05 创建