在 R 中使用线性模型从配方创建工作流时出错
Error while creating workflow from a recipe using linear models in R
我正在训练一个线性回归模型,该模型使用 Stack Overflow 数据,根据公司规模 (company_size_number) 和国家 (country) 预测薪水。
我做的是:
- 读取数据。将数据拆分为训练集 (75%) 和测试集 (25%)。
- 创建一个配方 (recipe):先将 company_size_number 转换为因子变量,然后将两个预测变量转换为虚拟变量。
- 创建模型规范。
- 创建工作流对象并向其添加配方和模型规范,然后在训练集上拟合模型。
- 在测试集上计算 R²。
这是我的代码
# Question's original script, kept exactly as posted. The question reports
# that it stops with the `levels` error quoted after this code.
library(tidyverse)
library(tidymodels)
# Read the survey data from a local RDS file.
so <- read_rds("Whosebug.rds")
# Fix the RNG seed so the random train/test split is reproducible.
set.seed(123)
init_split <- initial_split(so)
so_training <- training(init_split)
so_testing <- testing(init_split)
# Recipe: salary modelled on the two selected predictors.
# NOTE(review): step_num2factor() is called here without a `levels` argument,
# and the error message quoted after the code asks for `levels`.
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_num2factor(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
# Linear regression specification using the "lm" engine.
model_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
# Bundle model and recipe into a workflow and fit it on the training set.
# The result is stored in a variable that is also named `fit`.
fit <- workflow() %>%
add_model(model_spec) %>%
add_recipe(rec) %>%
fit(data = so_training)
# Predict on the test set, attach the observed salary and compute the RMSE.
predict(fit, new_data = so_testing) %>%
mutate(truth = so_testing$salary) %>%
rmse(estimate = .pred, truth = truth)
但由于错误无法继续:
Error: Please provide a character vector of appropriate length for `levels`.
我想我在 spec_*()
中弄乱了一些东西
# Second attempt from the question: the same recipe with step_novel() in place
# of step_num2factor(). The author states they are unsure whether it is correct.
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_novel(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
但不确定这是否正确。任何建议都会有所帮助。
> dput(head(so))
structure(list(country = structure(c(5L, 5L, 4L, 4L, 5L, 5L), .Label = c("Canada",
"Germany", "India", "United Kingdom", "United States"), class = "factor"),
salary = c(63750, 93000, 40625, 45000, 1e+05, 170000), years_coded_job = c(4L,
9L, 8L, 3L, 8L, 12L), open_source = c(0, 1, 1, 1, 0, 1),
hobby = c(1, 1, 1, 0, 1, 1), company_size_number = c(20,
1000, 10000, 1, 10, 100), remote = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Remote", "Not remote"), class = "factor"),
career_satisfaction = c(8L, 8L, 5L, 10L, 8L, 10L), data_scientist = c(0,
0, 1, 0, 0, 0), database_administrator = c(1, 0, 1, 0, 0,
0), desktop_applications_developer = c(1, 0, 1, 0, 0, 0),
developer_with_stats_math_background = c(0, 0, 0, 0, 0, 0
), dev_ops = c(0, 0, 0, 0, 0, 1), embedded_developer = c(0,
0, 0, 0, 0, 0), graphic_designer = c(0, 0, 0, 0, 0, 0), graphics_programming = c(0,
0, 0, 0, 0, 0), machine_learning_specialist = c(0, 0, 0,
0, 0, 0), mobile_developer = c(0, 1, 0, 0, 1, 0), quality_assurance_engineer = c(0,
0, 0, 0, 0, 0), systems_administrator = c(1, 0, 1, 0, 0,
1), web_developer = c(0, 0, 0, 1, 1, 1)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
我有一些关于调整您正在做的事情的建议。
- 首先是在拆分之前选择变量,这样当你使用像 salary ~ . 这样的公式时,你和/或各个函数就不会对其中包含哪些变量感到困惑。
- 第二个是不要像你那样使用 step_num2factor();要让它正确工作需要费不少功夫,我认为你最好在拆分之前就把该变量转换为因子。请查看这个配方步骤的文档以了解更合适的用法,注意你必须给它提供 levels。这就是您看到错误的原因,但老实说,我不会尝试找出正确的水平 (levels) 并在那里输入它们;我会在拆分之前完成转换。
# Reproducible example: predict salary from company size and country with a
# linear model. company_size_number is converted to a factor BEFORE the split,
# so the recipe only needs step_dummy().
library(tidyverse)
library(tidymodels)

# The Stack Overflow survey data ships with the modeldata package under the
# name `stackoverflow`. clean_names() is applied so the columns can be referred
# to by the names used below (salary, company_size_number, country).
data("stackoverflow", package = "modeldata")
so <- janitor::clean_names(stackoverflow)

# Fix the RNG seed so the random train/test split is reproducible.
set.seed(123)

# Keep only the outcome and the two predictors, so that `salary ~ .` in the
# recipe refers to exactly these columns, then convert company size to a factor.
init_split <- so %>%
  select(salary, company_size_number, country) %>%
  mutate(company_size_number = factor(company_size_number)) %>%
  initial_split()
so_training <- training(init_split)
so_testing <- testing(init_split)

# Both predictors are factors at this point; turn them into dummy variables.
rec <- recipe(salary ~ ., data = so_training) %>%
  step_dummy(country, company_size_number)

# Linear regression specification using the "lm" engine.
model_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# Bundle model and recipe into a workflow and fit it on the training set.
# The result is named `lm_fit` so that it does not shadow the fit() generic.
lm_fit <- workflow() %>%
  add_model(model_spec) %>%
  add_recipe(rec) %>%
  fit(data = so_training)

# Predict on the test set, attach the observed salary and compute the RMSE.
predict(lm_fit, new_data = so_testing) %>%
  mutate(truth = so_testing$salary) %>%
  rmse(estimate = .pred, truth = truth)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 27822.
由 reprex package (v2.0.0)
于 2021-05-25 创建
我正在训练一个线性回归模型,该模型使用 Stack Overflow 数据,根据公司规模 (company_size_number) 和国家 (country) 预测薪水。
我做的是:
- 读取数据。将数据拆分为训练集 (75%) 和测试集 (25%)。
- 创建一个配方 (recipe):先将 company_size_number 转换为因子变量,然后将两个预测变量转换为虚拟变量。
- 创建模型规范。
- 创建工作流对象并向其添加配方和模型规范,然后在训练集上拟合模型。
- 在测试集上计算 R²。
这是我的代码
# Question's original script, kept exactly as posted. The question reports
# that it stops with the `levels` error quoted after this code.
library(tidyverse)
library(tidymodels)
# Read the survey data from a local RDS file.
so <- read_rds("Whosebug.rds")
# Fix the RNG seed so the random train/test split is reproducible.
set.seed(123)
init_split <- initial_split(so)
so_training <- training(init_split)
so_testing <- testing(init_split)
# Recipe: salary modelled on the two selected predictors.
# NOTE(review): step_num2factor() is called here without a `levels` argument,
# and the error message quoted after the code asks for `levels`.
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_num2factor(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
# Linear regression specification using the "lm" engine.
model_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
# Bundle model and recipe into a workflow and fit it on the training set.
# The result is stored in a variable that is also named `fit`.
fit <- workflow() %>%
add_model(model_spec) %>%
add_recipe(rec) %>%
fit(data = so_training)
# Predict on the test set, attach the observed salary and compute the RMSE.
predict(fit, new_data = so_testing) %>%
mutate(truth = so_testing$salary) %>%
rmse(estimate = .pred, truth = truth)
但由于错误无法继续:
Error: Please provide a character vector of appropriate length for `levels`.
我想我在 spec_*() 中弄乱了一些东西
# Second attempt from the question: the same recipe with step_novel() in place
# of step_num2factor(). The author states they are unsure whether it is correct.
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_novel(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
但不确定这是否正确。任何建议都会有所帮助。
> dput(head(so))
structure(list(country = structure(c(5L, 5L, 4L, 4L, 5L, 5L), .Label = c("Canada",
"Germany", "India", "United Kingdom", "United States"), class = "factor"),
salary = c(63750, 93000, 40625, 45000, 1e+05, 170000), years_coded_job = c(4L,
9L, 8L, 3L, 8L, 12L), open_source = c(0, 1, 1, 1, 0, 1),
hobby = c(1, 1, 1, 0, 1, 1), company_size_number = c(20,
1000, 10000, 1, 10, 100), remote = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Remote", "Not remote"), class = "factor"),
career_satisfaction = c(8L, 8L, 5L, 10L, 8L, 10L), data_scientist = c(0,
0, 1, 0, 0, 0), database_administrator = c(1, 0, 1, 0, 0,
0), desktop_applications_developer = c(1, 0, 1, 0, 0, 0),
developer_with_stats_math_background = c(0, 0, 0, 0, 0, 0
), dev_ops = c(0, 0, 0, 0, 0, 1), embedded_developer = c(0,
0, 0, 0, 0, 0), graphic_designer = c(0, 0, 0, 0, 0, 0), graphics_programming = c(0,
0, 0, 0, 0, 0), machine_learning_specialist = c(0, 0, 0,
0, 0, 0), mobile_developer = c(0, 1, 0, 0, 1, 0), quality_assurance_engineer = c(0,
0, 0, 0, 0, 0), systems_administrator = c(1, 0, 1, 0, 0,
1), web_developer = c(0, 0, 0, 1, 1, 1)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
我有一些关于调整您正在做的事情的建议。
- 首先是在拆分之前选择变量,这样当你使用像 salary ~ . 这样的公式时,你和/或各个函数就不会对其中包含哪些变量感到困惑。
- 第二个是不要像你那样使用 step_num2factor();要让它正确工作需要费不少功夫,我认为你最好在拆分之前就把该变量转换为因子。请查看这个配方步骤的文档以了解更合适的用法,注意你必须给它提供 levels。这就是您看到错误的原因,但老实说,我不会尝试找出正确的水平 (levels) 并在那里输入它们;我会在拆分之前完成转换。
# Reproducible example: predict salary from company size and country with a
# linear model. company_size_number is converted to a factor BEFORE the split,
# so the recipe only needs step_dummy().
library(tidyverse)
library(tidymodels)

# The Stack Overflow survey data ships with the modeldata package under the
# name `stackoverflow`. clean_names() is applied so the columns can be referred
# to by the names used below (salary, company_size_number, country).
data("stackoverflow", package = "modeldata")
so <- janitor::clean_names(stackoverflow)

# Fix the RNG seed so the random train/test split is reproducible.
set.seed(123)

# Keep only the outcome and the two predictors, so that `salary ~ .` in the
# recipe refers to exactly these columns, then convert company size to a factor.
init_split <- so %>%
  select(salary, company_size_number, country) %>%
  mutate(company_size_number = factor(company_size_number)) %>%
  initial_split()
so_training <- training(init_split)
so_testing <- testing(init_split)

# Both predictors are factors at this point; turn them into dummy variables.
rec <- recipe(salary ~ ., data = so_training) %>%
  step_dummy(country, company_size_number)

# Linear regression specification using the "lm" engine.
model_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# Bundle model and recipe into a workflow and fit it on the training set.
# The result is named `lm_fit` so that it does not shadow the fit() generic.
lm_fit <- workflow() %>%
  add_model(model_spec) %>%
  add_recipe(rec) %>%
  fit(data = so_training)

# Predict on the test set, attach the observed salary and compute the RMSE.
predict(lm_fit, new_data = so_testing) %>%
  mutate(truth = so_testing$salary) %>%
  rmse(estimate = .pred, truth = truth)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 27822.
由 reprex package (v2.0.0)
于 2021-05-25 创建