使用 add_formula() 或 add_variables() 但不使用 add_recipe() 的 Tidymodels 工作流
Tidymodels Workflow working with add_formula() or add_variables() but not with add_recipe()
我在使用配方(recipe)和工作流(workflow)配合 naiveBayes 分类器区分垃圾短信与正常短信时,遇到了一些奇怪的行为。我试图用 tidymodels 和工作流复现《Machine Learning with R》一书第 4 章的结果:https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/blob/master/Chapter%2004/MLwR_v2_04.r
虽然我能够使用 add_variables() 或 add_formula(),或者完全不使用工作流来重现该分析,但使用 add_recipe() 函数的工作流却不起作用。
library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(SnowballC)
library(discrim)
# Download the SMS spam CSV as one text string, then parse it into a tibble.
sms_csv_text <- getURL(
  "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv"
)
sms_raw <- read_csv(sms_csv_text)
# The outcome column is converted to a factor for classification.
sms_raw[["type"]] <- factor(sms_raw[["type"]])

# Reproducible 80/20 split, stratified on the outcome.
set.seed(123)
split <- sms_raw %>%
  initial_split(prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)

# Text preprocessing
# Recode a term indicator column as the strings "Yes" / "No".
to_yes_no <- function(x) {
  ifelse(x == TRUE, "Yes", "No")
}

# Untrained recipe: lower-case the text, strip numbers and punctuation,
# tokenize, drop stop words, stem, keep tokens seen at least 6 times
# (at most 1500 tokens), and build binary term-frequency columns.
sms_recipe_spec <- recipe(type ~ ., data = nb_train_sms) %>%
  step_mutate(text = str_to_lower(text)) %>%
  step_mutate(text = removeNumbers(text)) %>%
  step_mutate(text = removePunctuation(text)) %>%
  step_tokenize(text) %>%
  step_stopwords(text, custom_stopword_source = stopwords()) %>%
  step_stem(text) %>%
  step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
  step_tf(text, weight_scheme = "binary") %>%
  step_mutate_at(contains("tf"), fn = to_yes_no)

# NOTE: reci_sms is the *prepped* (already trained) recipe.
reci_sms <- prep(sms_recipe_spec)

# Processed training and test sets produced by the prepped recipe.
df_training <- juice(reci_sms)
df_testing <- reci_sms %>%
  bake(new_data = nb_test_sms)

# Naive Bayes model specification using the klaR engine.
nb_model <- set_engine(naive_Bayes(), "klaR")
以下是实际产生有效输出的三个代码示例
# --------- works but slow -----
# Workflow with a formula preprocessor, fitted on the already-processed
# training data (df_training) and used to predict on df_testing.
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_formula(type ~ .) %>%
  fit(df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
# --------- works -----
# Model specification fitted directly with a formula, without a workflow.
nb_fit <- nb_model %>% fit(type ~ ., df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
# --------- works -----
# Workflow with explicit variable roles: `type` is the outcome and every
# other column of df_training is a predictor.
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_variables(outcomes = type, predictors = everything()) %>%
  fit(df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
而下面的代码则不起作用:
# Workflow whose preprocessor is the recipe `reci_sms`; it is then fitted on
# the already-processed `df_training`. This is the call that does not work.
recipe_wf <- workflow() %>%
  add_model(nb_model) %>%
  add_recipe(reci_sms)
nb_fit <- fit(recipe_wf, data = df_training)
nb_tidy_pred <- predict(nb_fit, df_testing)
它还抛出了以下错误,而且使用 rlang::last_error() 时我也不太明白发生了什么:
Not all variables in the recipe are present in the supplied training set: 'text'.
Run `rlang::last_error()` to see where the error occurred.
谁能告诉我我错过了什么?
当您在工作流中使用配方时,您是将预处理步骤与模型拟合结合在一起。在拟合该工作流时,您需要使用配方所期望的数据(nb_train_sms),而不是 parsnip 模型所期望的数据。
此外,不建议将已经 prep() 过的配方传递给工作流,因此请注意:在使用 add_recipe() 将配方添加到工作流之前,我们没有对它调用 prep()。
library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(discrim)
# Download the SMS spam CSV as text and parse it into a tibble.
sms_raw <- getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
sms_raw <- read_csv(sms_raw)
sms_raw$type <- factor(sms_raw$type)

# Reproducible 80/20 split, stratified on the outcome.
set.seed(123)
split <- initial_split(sms_raw, prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)

# Text preprocessing
# The recipe is deliberately left un-prepped: the workflow trains it as part
# of fit(), so there is no prep() call at the end of this chain.
reci_sms <-
  recipe(type ~ ., data = nb_train_sms) %>%
  step_mutate(text = str_to_lower(text)) %>%
  step_mutate(text = removeNumbers(text)) %>%
  step_mutate(text = removePunctuation(text)) %>%
  step_tokenize(text) %>%
  step_stopwords(text, custom_stopword_source = stopwords()) %>%
  step_stem(text) %>%
  step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
  step_tf(text, weight_scheme = "binary") %>%
  step_mutate_at(
    contains("tf"),
    fn = function(x) {
      ifelse(x == TRUE, "Yes", "No")
    }
  )

# Naive Bayes model specification using the klaR engine.
nb_model <- naive_Bayes() %>%
  set_engine("klaR")

# Fit on the raw training data: the workflow applies the recipe itself, so it
# needs the columns the recipe expects (including `text`).
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_recipe(reci_sms) %>%
  fit(data = nb_train_sms)
#> Warning: max_features was set to '1500', but only 1141 was available and
#> selected.

# Predict on the held-out raw test set; the workflow preprocesses new data
# with the trained recipe before predicting.
nb_tidy_pred <- nb_fit %>% predict(nb_test_sms)
由 reprex package (v1.0.0)
于 2021-04-19 创建
我在使用配方(recipe)和工作流(workflow)配合 naiveBayes 分类器区分垃圾短信与正常短信时,遇到了一些奇怪的行为。我试图用 tidymodels 和工作流复现《Machine Learning with R》一书第 4 章的结果:https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/blob/master/Chapter%2004/MLwR_v2_04.r
虽然我能够使用 add_variables() 或 add_formula(),或者完全不使用工作流来重现该分析,但使用 add_recipe() 函数的工作流却不起作用。
library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(SnowballC)
library(discrim)
# Download the SMS spam CSV as one text string, then parse it into a tibble.
sms_csv_text <- getURL(
  "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv"
)
sms_raw <- read_csv(sms_csv_text)
# The outcome column is converted to a factor for classification.
sms_raw[["type"]] <- factor(sms_raw[["type"]])

# Reproducible 80/20 split, stratified on the outcome.
set.seed(123)
split <- sms_raw %>%
  initial_split(prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)

# Text preprocessing
# Recode a term indicator column as the strings "Yes" / "No".
to_yes_no <- function(x) {
  ifelse(x == TRUE, "Yes", "No")
}

# Untrained recipe: lower-case the text, strip numbers and punctuation,
# tokenize, drop stop words, stem, keep tokens seen at least 6 times
# (at most 1500 tokens), and build binary term-frequency columns.
sms_recipe_spec <- recipe(type ~ ., data = nb_train_sms) %>%
  step_mutate(text = str_to_lower(text)) %>%
  step_mutate(text = removeNumbers(text)) %>%
  step_mutate(text = removePunctuation(text)) %>%
  step_tokenize(text) %>%
  step_stopwords(text, custom_stopword_source = stopwords()) %>%
  step_stem(text) %>%
  step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
  step_tf(text, weight_scheme = "binary") %>%
  step_mutate_at(contains("tf"), fn = to_yes_no)

# NOTE: reci_sms is the *prepped* (already trained) recipe.
reci_sms <- prep(sms_recipe_spec)

# Processed training and test sets produced by the prepped recipe.
df_training <- juice(reci_sms)
df_testing <- reci_sms %>%
  bake(new_data = nb_test_sms)

# Naive Bayes model specification using the klaR engine.
nb_model <- set_engine(naive_Bayes(), "klaR")
以下是实际产生有效输出的三个代码示例
# --------- works but slow -----
# Workflow with a formula preprocessor, fitted on the already-processed
# training data (df_training) and used to predict on df_testing.
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_formula(type ~ .) %>%
  fit(df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
# --------- works -----
# Model specification fitted directly with a formula, without a workflow.
nb_fit <- nb_model %>% fit(type ~ ., df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
# --------- works -----
# Workflow with explicit variable roles: `type` is the outcome and every
# other column of df_training is a predictor.
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_variables(outcomes = type, predictors = everything()) %>%
  fit(df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
而下面的代码则不起作用:
# Workflow whose preprocessor is the recipe `reci_sms`; it is then fitted on
# the already-processed `df_training`. This is the call that does not work.
recipe_wf <- workflow() %>%
  add_model(nb_model) %>%
  add_recipe(reci_sms)
nb_fit <- fit(recipe_wf, data = df_training)
nb_tidy_pred <- predict(nb_fit, df_testing)
它还抛出了以下错误,而且使用 rlang::last_error() 时我也不太明白发生了什么:
Not all variables in the recipe are present in the supplied training set: 'text'.
Run `rlang::last_error()` to see where the error occurred.
谁能告诉我我错过了什么?
当您在工作流中使用配方时,您是将预处理步骤与模型拟合结合在一起。在拟合该工作流时,您需要使用配方所期望的数据(nb_train_sms),而不是 parsnip 模型所期望的数据。
此外,不建议将已经 prep() 过的配方传递给工作流,因此请注意:在使用 add_recipe() 将配方添加到工作流之前,我们没有对它调用 prep()。
library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(discrim)
# Download the SMS spam CSV as text and parse it into a tibble.
sms_raw <- getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
sms_raw <- read_csv(sms_raw)
sms_raw$type <- factor(sms_raw$type)

# Reproducible 80/20 split, stratified on the outcome.
set.seed(123)
split <- initial_split(sms_raw, prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)

# Text preprocessing
# The recipe is deliberately left un-prepped: the workflow trains it as part
# of fit(), so there is no prep() call at the end of this chain.
reci_sms <-
  recipe(type ~ ., data = nb_train_sms) %>%
  step_mutate(text = str_to_lower(text)) %>%
  step_mutate(text = removeNumbers(text)) %>%
  step_mutate(text = removePunctuation(text)) %>%
  step_tokenize(text) %>%
  step_stopwords(text, custom_stopword_source = stopwords()) %>%
  step_stem(text) %>%
  step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
  step_tf(text, weight_scheme = "binary") %>%
  step_mutate_at(
    contains("tf"),
    fn = function(x) {
      ifelse(x == TRUE, "Yes", "No")
    }
  )

# Naive Bayes model specification using the klaR engine.
nb_model <- naive_Bayes() %>%
  set_engine("klaR")

# Fit on the raw training data: the workflow applies the recipe itself, so it
# needs the columns the recipe expects (including `text`).
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_recipe(reci_sms) %>%
  fit(data = nb_train_sms)
#> Warning: max_features was set to '1500', but only 1141 was available and
#> selected.

# Predict on the held-out raw test set; the workflow preprocesses new data
# with the trained recipe before predicting.
nb_tidy_pred <- nb_fit %>% predict(nb_test_sms)
由 reprex package (v1.0.0)
于 2021-04-19 创建