在每个组上拟合一个模型,并使用不在该组中的所有行的数据对其进行评估,R
Fit a model on each group and evaluate it using data from all rows not in this group, R
我想使用数据框中每个组的数据来拟合模型。然后我想使用这个模型来预测数据帧中不在组中的所有数据,并计算一个像 RMSE 这样的指标。我有一些问题需要思考如何在不执行许多手动步骤的情况下实现这样的目标。我有以下使用 mtcars
数据的玩具示例。
我想为每组 cyl
拟合模型 lm(mpg ~ wt, data=mtcars)
,并使用该模型和剩余点的数据来计算像 RMSE 这样的值。
我写了下面的代码,但是没有用,感觉也不是很好。我很乐意听到任何提示和技巧:)
library(tidyverse)
# 'global' model
lm(mpg ~ wt, data=mtcars)
# 1. fit one model for each class of cyl,
# 2. use it to predict the remaining ones,
# 3. get the RMSE for each class of cyl
res = mtcars %>%
group_by(cyl) %>%
mutate(
models = list(lm(mpg ~ wt, data = cur_data())), # why does this needs to be a list?
ref_data = list(mtcars %>% filter(!cyl %in% cur_data()$cyl[[1]])), # get all the data minus the current group at put it in a list column
predict = map(models, ~predict(.x, newdata=mtcars %>% filter(!cyl %in% cur_data()$cyl[[1]]))), # predict it on all others -> will store one df per row...
rmse = map2_dbl(predict, ref_data, ~sqrt((sum(.y - .y)^2))/length(predict))
)
这是一个基于tidyverse
的解决方案。
library(tidyverse)
res = mtcars %>%
group_by(cyl) %>%
summarize(
data1 = list(cur_data_all()),
data2 = list(subset(mtcars, cyl!=unique(data1[[1]]$cyl) )),
models = list( lm(mpg ~ wt, data=data1[[1]]) ),
pred = list(predict(models[[1]], newdata=data2[[1]]) ),
rmse = sqrt(mean((data2[[1]]$mpg - pred[[1]])^2))
)
print(res)
# A tibble: 3 x 6
cyl data1 data2 models pred rmse2
<dbl> <list> <list> <list> <list> <dbl>
1 4 <tibble [11 x 11]> <df [21 x 11]> <lm> <dbl [21]> 3.35
2 6 <tibble [7 x 11]> <df [25 x 11]> <lm> <dbl [25]> 4.38
3 8 <tibble [14 x 11]> <df [18 x 11]> <lm> <dbl [18]> 6.94
这里有两个基本的 R 解决方案,一个带有 for
循环,另一个带有 sapply
循环。
rmse <- numeric(length(unique(mtcars$cyl)))
ucyl <- unique(mtcars$cyl)
for(i in seq_along(ucyl)){
cc <- ucyl[i]
fit <- lm(mpg ~ wt, data = mtcars, subset = cyl == cc)
new_wt <- mtcars$wt[mtcars$cyl != cc]
new_mpg <- mtcars$mpg[mtcars$cyl != cc]
ypred <- predict(fit, newdata = data.frame(wt = new_wt))
rmse[i] <- sqrt(mean((new_mpg - ypred)^2))
}
rmse
#[1] 4.378324 3.354043 6.942228
rmse2 <- sapply(seq_along(ucyl), function(i){
cc <- ucyl[i]
fit <- lm(mpg ~ wt, data = mtcars, subset = cyl == cc)
#
new_wt <- mtcars$wt[mtcars$cyl != cc]
new_mpg <- mtcars$mpg[mtcars$cyl != cc]
#
ypred <- predict(fit, newdata = data.frame(wt = new_wt))
sqrt(mean((new_mpg - ypred)^2))
})
identical(rmse, rmse2)
#[1] TRUE
您可以考虑为此使用 tidymodels 函数,因为这种情况正是 rsample::rsplit
对象的用途。
例如这里,“分析”集有 cyl == 6
,“评估”集有 cyl
等于其他值:
library(rsample)
ind <- list(analysis = which(mtcars$cyl == 6),
assessment = which(mtcars$cyl != 6))
make_splits(ind, mtcars)
#> <Analysis/Assess/Total>
#> <7/25/32>
由 reprex package (v2.0.1)
于 2021-08-11 创建
要进行建模分析,您将创建一个函数来根据您的参数(此处为 cyl
)生成 split
,然后使用 purrr::map()
映射值该参数和:
- 劈叉
- 将模型拟合到每个拆分
- 在每次拆分时使用每个模型进行预测(注意您在 评估 集上进行预测)
- 计算 RMSE
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
manual_split_from_cyl <- function(cyl_value) {
ind <- list(analysis = which(mtcars$cyl == cyl_value),
assessment = which(mtcars$cyl != cyl_value))
make_splits(ind, mtcars)
}
tibble(cyl = unique(mtcars$cyl)) %>%
mutate(splits = map(cyl, manual_split_from_cyl),
model = map(splits, ~ lm(mpg ~ wt, data = analysis(.))),
preds = map2(model, splits, ~ predict(.x, newdata = assessment(.y))),
rmse = map2_dbl(splits, preds, ~ rmse_vec(assessment(.x)$mpg, .y)))
#> # A tibble: 3 × 5
#> cyl splits model preds rmse
#> <dbl> <list> <list> <list> <dbl>
#> 1 6 <split [7/25]> <lm> <dbl [25]> 4.38
#> 2 4 <split [11/21]> <lm> <dbl [21]> 3.35
#> 3 8 <split [14/18]> <lm> <dbl [18]> 6.94
由 reprex package (v2.0.1)
于 2021-08-11 创建
我想使用数据框中每个组的数据来拟合模型。然后我想使用这个模型来预测数据帧中不在组中的所有数据,并计算一个像 RMSE 这样的指标。我有一些问题需要思考如何在不执行许多手动步骤的情况下实现这样的目标。我有以下使用 mtcars
数据的玩具示例。
我想为每组 cyl
拟合模型 lm(mpg ~ wt, data=mtcars)
,并使用该模型和剩余点的数据来计算像 RMSE 这样的值。
我写了下面的代码,但是没有用,感觉也不是很好。我很乐意听到任何提示和技巧:)
library(tidyverse)
# 'global' model
lm(mpg ~ wt, data=mtcars)
# 1. fit one model for each class of cyl,
# 2. use it to predict the remaining ones,
# 3. get the RMSE for each class of cyl
res = mtcars %>%
group_by(cyl) %>%
mutate(
models = list(lm(mpg ~ wt, data = cur_data())), # why does this needs to be a list?
ref_data = list(mtcars %>% filter(!cyl %in% cur_data()$cyl[[1]])), # get all the data minus the current group at put it in a list column
predict = map(models, ~predict(.x, newdata=mtcars %>% filter(!cyl %in% cur_data()$cyl[[1]]))), # predict it on all others -> will store one df per row...
rmse = map2_dbl(predict, ref_data, ~sqrt((sum(.y - .y)^2))/length(predict))
)
这是一个基于tidyverse
的解决方案。
library(tidyverse)
res = mtcars %>%
group_by(cyl) %>%
summarize(
data1 = list(cur_data_all()),
data2 = list(subset(mtcars, cyl!=unique(data1[[1]]$cyl) )),
models = list( lm(mpg ~ wt, data=data1[[1]]) ),
pred = list(predict(models[[1]], newdata=data2[[1]]) ),
rmse = sqrt(mean((data2[[1]]$mpg - pred[[1]])^2))
)
print(res)
# A tibble: 3 x 6
cyl data1 data2 models pred rmse2
<dbl> <list> <list> <list> <list> <dbl>
1 4 <tibble [11 x 11]> <df [21 x 11]> <lm> <dbl [21]> 3.35
2 6 <tibble [7 x 11]> <df [25 x 11]> <lm> <dbl [25]> 4.38
3 8 <tibble [14 x 11]> <df [18 x 11]> <lm> <dbl [18]> 6.94
这里有两个基本的 R 解决方案,一个带有 for
循环,另一个带有 sapply
循环。
rmse <- numeric(length(unique(mtcars$cyl)))
ucyl <- unique(mtcars$cyl)
for(i in seq_along(ucyl)){
cc <- ucyl[i]
fit <- lm(mpg ~ wt, data = mtcars, subset = cyl == cc)
new_wt <- mtcars$wt[mtcars$cyl != cc]
new_mpg <- mtcars$mpg[mtcars$cyl != cc]
ypred <- predict(fit, newdata = data.frame(wt = new_wt))
rmse[i] <- sqrt(mean((new_mpg - ypred)^2))
}
rmse
#[1] 4.378324 3.354043 6.942228
rmse2 <- sapply(seq_along(ucyl), function(i){
cc <- ucyl[i]
fit <- lm(mpg ~ wt, data = mtcars, subset = cyl == cc)
#
new_wt <- mtcars$wt[mtcars$cyl != cc]
new_mpg <- mtcars$mpg[mtcars$cyl != cc]
#
ypred <- predict(fit, newdata = data.frame(wt = new_wt))
sqrt(mean((new_mpg - ypred)^2))
})
identical(rmse, rmse2)
#[1] TRUE
您可以考虑为此使用 tidymodels 函数,因为这种情况正是 rsample::rsplit
对象的用途。
例如这里,“分析”集有 cyl == 6
,“评估”集有 cyl
等于其他值:
library(rsample)
ind <- list(analysis = which(mtcars$cyl == 6),
assessment = which(mtcars$cyl != 6))
make_splits(ind, mtcars)
#> <Analysis/Assess/Total>
#> <7/25/32>
由 reprex package (v2.0.1)
于 2021-08-11 创建要进行建模分析,您将创建一个函数来根据您的参数(此处为 cyl
)生成 split
,然后使用 purrr::map()
映射值该参数和:
- 劈叉
- 将模型拟合到每个拆分
- 在每次拆分时使用每个模型进行预测(注意您在 评估 集上进行预测)
- 计算 RMSE
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
manual_split_from_cyl <- function(cyl_value) {
ind <- list(analysis = which(mtcars$cyl == cyl_value),
assessment = which(mtcars$cyl != cyl_value))
make_splits(ind, mtcars)
}
tibble(cyl = unique(mtcars$cyl)) %>%
mutate(splits = map(cyl, manual_split_from_cyl),
model = map(splits, ~ lm(mpg ~ wt, data = analysis(.))),
preds = map2(model, splits, ~ predict(.x, newdata = assessment(.y))),
rmse = map2_dbl(splits, preds, ~ rmse_vec(assessment(.x)$mpg, .y)))
#> # A tibble: 3 × 5
#> cyl splits model preds rmse
#> <dbl> <list> <list> <list> <dbl>
#> 1 6 <split [7/25]> <lm> <dbl [25]> 4.38
#> 2 4 <split [11/21]> <lm> <dbl [21]> 3.35
#> 3 8 <split [14/18]> <lm> <dbl [18]> 6.94
由 reprex package (v2.0.1)
于 2021-08-11 创建