使用每个预测变量列的拟合模型分别将结果存储在数据框中
Fit model using each predictor columns indiviually store results in dataframe
我有一个包含一列响应变量和几列预测变量的数据框。我想分别使用每个预测变量来拟合响应变量的模型,最后创建一个包含模型系数的数据框。以前,我会这样做:
data(iris)
iris_vars <- c("Sepal.Width", "Petal.Length", "Petal.Width")
fits.iris <- lapply(iris_vars, function(x) {lm(substitute(Sepal.Length ~ i, list(i = as.name(x))), data = iris)})
# extract model coeffs, so forth and so on, eventually combining into a result dataframe
iris.p <- as.data.frame(lapply(fits.iris, function(f) summary(f)$coefficients[,4]))
iris.r <- as.data.frame(lapply(fits.iris, function(f) summary(f)$r.squared))
然而,这似乎有点麻烦,因为我已经开始使用 dplyr
、broom
等。使用 purrr::map
我或多或少可以重新创建这个模型列表:
# using purrr, still uses the Response variable "Sepal.Length" as a predictor of itself
iris %>%
select(1:4) %>%
# names(select(., 2:4)) %>% this does not work
names() %>%
paste('Sepal.Length ~', .) %>%
map(~lm(as.formula(.x), data = iris))
但是,我不确定如何将此列表转换为适合 broom::tidy
的形式。如果我正在使用分组行而不是列,我会存储模型拟合并使用 broom::tidy
来做这样的事情:
iris.fits <- group_by(Species) %>% do(modfit1 = lm(Sepal.Length~Sepal.Width,data=.))
tidy(iris.fits, modfit1)
当然这不是我正在做的,但我希望在使用数据列时有类似的过程。有没有办法,也许使用 purrr::nest
或类似的东西来创建所需的输出?
1) 这给出了模型拟合的 glance
和 tidy
输出:
library(broom)
make_model <- function(nm) lm(iris[c("Sepal.Length", nm)])
fits <- Map(make_model, iris_vars)
glance_tidy <- function(x) c(unlist(glance(x)), unlist(tidy(x)[, -1]))
out <- sapply(fits, glance_tidy)
1a) 或作为 magrittr 管道:
library(magrittr)
out <- iris_vars %>% Map(f = make_model) %>% sapply(glance_tidy)
其中一个给出了以下矩阵:
> out
Sepal.Width Petal.Length Petal.Width
r.squared 1.382265e-02 7.599546e-01 6.690277e-01
adj.r.squared 7.159294e-03 7.583327e-01 6.667914e-01
sigma 8.250966e-01 4.070745e-01 4.779948e-01
statistic 2.074427e+00 4.685502e+02 2.991673e+02
p.value 1.518983e-01 1.038667e-47 2.325498e-37
df 2.000000e+00 2.000000e+00 2.000000e+00
logLik -1.829958e+02 -7.702021e+01 -1.011107e+02
AIC 3.719917e+02 1.600404e+02 2.082215e+02
BIC 3.810236e+02 1.690723e+02 2.172534e+02
deviance 1.007561e+02 2.452503e+01 3.381489e+01
df.residual 1.480000e+02 1.480000e+02 1.480000e+02
estimate1 6.526223e+00 4.306603e+00 4.777629e+00
estimate2 -2.233611e-01 4.089223e-01 8.885803e-01
std.error1 4.788963e-01 7.838896e-02 7.293476e-02
std.error2 1.550809e-01 1.889134e-02 5.137355e-02
statistic1 1.362763e+01 5.493890e+01 6.550552e+01
statistic2 -1.440287e+00 2.164602e+01 1.729645e+01
p.value1 6.469702e-28 2.426713e-100 3.340431e-111
p.value2 1.518983e-01 1.038667e-47 2.325498e-37
或转置:
> t(out)
r.squared adj.r.squared sigma statistic p.value df
Sepal.Width 0.01382265 0.007159294 0.8250966 2.074427 1.518983e-01 2
Petal.Length 0.75995465 0.758332718 0.4070745 468.550154 1.038667e-47 2
Petal.Width 0.66902769 0.666791387 0.4779948 299.167312 2.325498e-37 2
logLik AIC BIC deviance df.residual estimate1
Sepal.Width -182.99584 371.9917 381.0236 100.75610 148 6.526223
Petal.Length -77.02021 160.0404 169.0723 24.52503 148 4.306603
Petal.Width -101.11073 208.2215 217.2534 33.81489 148 4.777629
estimate2 std.error1 std.error2 statistic1 statistic2
Sepal.Width -0.2233611 0.47889634 0.15508093 13.62763 -1.440287
Petal.Length 0.4089223 0.07838896 0.01889134 54.93890 21.646019
Petal.Width 0.8885803 0.07293476 0.05137355 65.50552 17.296454
p.value1 p.value2
Sepal.Width 6.469702e-28 1.518983e-01
Petal.Length 2.426713e-100 1.038667e-47
Petal.Width 3.340431e-111 2.325498e-37
2) 如果我们从 glance_tidy
函数定义中删除第一个 unlist 然后我们得到一个二维列表(而不是二维数字矩阵):
glance_tidy_l <- function(x) c(glance(x), unlist(tidy(x)[, -1]))
iris_vars %>% Map(f = make_model) %>% sapply(glance_tidy_l)
Sepal.Width Petal.Length Petal.Width
r.squared 0.01382265 0.7599546 0.6690277
adj.r.squared 0.007159294 0.7583327 0.6667914
sigma 0.8250966 0.4070745 0.4779948
statistic 2.074427 468.5502 299.1673
p.value 0.1518983 1.038667e-47 2.325498e-37
df 2 2 2
logLik -182.9958 -77.02021 -101.1107
AIC 371.9917 160.0404 208.2215
BIC 381.0236 169.0723 217.2534
deviance 100.7561 24.52503 33.81489
df.residual 148 148 148
estimate1 6.526223 4.306603 4.777629
estimate2 -0.2233611 0.4089223 0.8885803
std.error1 0.4788963 0.07838896 0.07293476
std.error2 0.1550809 0.01889134 0.05137355
statistic1 13.62763 54.9389 65.50552
statistic2 -1.440287 21.64602 17.29645
p.value1 6.469702e-28 2.426713e-100 3.340431e-111
p.value2 0.1518983 1.038667e-47 2.325498e-37
如果您准备开始使用列表列设置准嵌套数据框,那么 map/model/unnest/tidy 这一步会非常顺利。
首先,设置您的数据框:
> library(dplyr)
>
> nested_df <- data_frame(data = list(iris %>%
select(response = Sepal.Length,
predictor = Sepal.Width),
iris %>%
select(response = Sepal.Length,
predictor = Petal.Length),
iris %>%
select(response = Sepal.Length,
predictor = Petal.Width)))
>
> nested_df
# A tibble: 3 × 1
data
<list>
1 <data.frame [150 × 2]>
2 <data.frame [150 × 2]>
3 <data.frame [150 × 2]>
现在使用 purrr、tidyr 和 broom 得到建模的结果。
> library(tidyr)
> library(purrr)
> library(broom)
>
> nested_df %>%
mutate(models = map(data, ~ lm(response ~ predictor, .))) %>%
unnest(map(models, tidy))
# A tibble: 6 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 6.5262226 0.47889634 13.627631 6.469702e-28
2 predictor -0.2233611 0.15508093 -1.440287 1.518983e-01
3 (Intercept) 4.3066034 0.07838896 54.938900 2.426713e-100
4 predictor 0.4089223 0.01889134 21.646019 1.038667e-47
5 (Intercept) 4.7776294 0.07293476 65.505517 3.340431e-111
6 predictor 0.8885803 0.05137355 17.296454 2.325498e-37
您可以使用 filter
仅提取斜率 (term == "predictor"
),或者您可以在最后一行代码中使用 glance
而不是 tidy
来获取这些结果。
quasi-nested 数据框的另一个选项是使用 purrr::map_df 来快速处理大量预测变量。这也可以为您提供预测器作为封闭数据框中的一列。然后按照 Julia 的示例来拟合模型。
> library(dplyr)
> library(purrr)
>
> def_nested_df <- function(x) {
data_frame("covariate" = x,
"data" = list(iris %>% tbl_df %>%
select_("response" = "Sepal.Length",
"predictor" = x)))
}
>
> nested_df <-
c("Sepal.Width", "Petal.Length", "Petal.Width") %>%
map_df(def_nested_df)
>
> nested_df
# A tibble: 3 × 2
covariate data
<chr> <list>
1 Sepal.Width <tibble [150 × 2]>
2 Petal.Length <tibble [150 × 2]>
3 Petal.Width <tibble [150 × 2]>
>
> nested_df[[1, "data"]]
# A tibble: 150 × 2
response predictor
<dbl> <dbl>
1 5.1 3.5
2 4.9 3.0
3 4.7 3.2
4 4.6 3.1
5 5.0 3.6
6 5.4 3.9
7 4.6 3.4
8 5.0 3.4
9 4.4 2.9
10 4.9 3.1
# ... with 140 more rows
我的回答在本质上与 Julia Silge 和所见即所得的相似,但我想避免手动输入变量名称,并在模型对象的公式中保留响应和预测变量的名称:
require(tibble)
require(dplyr)
require(tidyr)
require(purrr)
require(broom)
df <- iris
response_var <- "Sepal.Length"
vars <- tibble(response=response_var,
predictor=setdiff(names(df), response_var))
compose_formula <- function(x, y)
as.formula(paste0("~lm(", y, "~", x, ", data=.)"))
models <- tibble(data=list(df)) %>%
crossing(vars) %>%
mutate(fmla = map2(predictor, response, compose_formula),
model = map2(data, fmla, ~at_depth(.x, 0, .y)))
models %>% unnest(map(model, tidy))
models %>% unnest(map(model, glance), .drop=T)
输出:
# A tibble: 9 x 7
response predictor term estimate std.error statistic
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Sepal.Length Sepal.Width (Intercept) 6.5262226 0.47889634 13.627631
2 Sepal.Length Sepal.Width Sepal.Width -0.2233611 0.15508093 -1.440287
3 Sepal.Length Petal.Length (Intercept) 4.3066034 0.07838896 54.938900
4 Sepal.Length Petal.Length Petal.Length 0.4089223 0.01889134 21.646019
5 Sepal.Length Petal.Width (Intercept) 4.7776294 0.07293476 65.505517
6 Sepal.Length Petal.Width Petal.Width 0.8885803 0.05137355 17.296454
7 Sepal.Length Species (Intercept) 5.0060000 0.07280222 68.761639
8 Sepal.Length Species Speciesversicolor 0.9300000 0.10295789 9.032819
9 Sepal.Length Species Speciesvirginica 1.5820000 0.10295789 15.365506
# ... with 1 more variables: p.value <dbl>
和
# A tibble: 4 x 13
response predictor r.squared adj.r.squared sigma statistic
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Sepal.Length Sepal.Width 0.01382265 0.007159294 0.8250966 2.074427
2 Sepal.Length Petal.Length 0.75995465 0.758332718 0.4070745 468.550154
3 Sepal.Length Petal.Width 0.66902769 0.666791387 0.4779948 299.167312
4 Sepal.Length Species 0.61870573 0.613518054 0.5147894 119.264502
# ... with 7 more variables: p.value <dbl>, df <int>, logLik <dbl>, AIC <dbl>,
# BIC <dbl>, deviance <dbl>, df.residual <int>
我有一个包含一列响应变量和几列预测变量的数据框。我想分别使用每个预测变量来拟合响应变量的模型,最后创建一个包含模型系数的数据框。以前,我会这样做:
data(iris)
iris_vars <- c("Sepal.Width", "Petal.Length", "Petal.Width")
fits.iris <- lapply(iris_vars, function(x) {lm(substitute(Sepal.Length ~ i, list(i = as.name(x))), data = iris)})
# extract model coeffs, so forth and so on, eventually combining into a result dataframe
iris.p <- as.data.frame(lapply(fits.iris, function(f) summary(f)$coefficients[,4]))
iris.r <- as.data.frame(lapply(fits.iris, function(f) summary(f)$r.squared))
然而,这似乎有点麻烦,因为我已经开始使用 dplyr
、broom
等。使用 purrr::map
我或多或少可以重新创建这个模型列表:
# using purrr, still uses the Response variable "Sepal.Length" as a predictor of itself
iris %>%
select(1:4) %>%
# names(select(., 2:4)) %>% this does not work
names() %>%
paste('Sepal.Length ~', .) %>%
map(~lm(as.formula(.x), data = iris))
但是,我不确定如何将此列表转换为适合 broom::tidy
的形式。如果我正在使用分组行而不是列,我会存储模型拟合并使用 broom::tidy
来做这样的事情:
iris.fits <- group_by(Species) %>% do(modfit1 = lm(Sepal.Length~Sepal.Width,data=.))
tidy(iris.fits, modfit1)
当然这不是我正在做的,但我希望在使用数据列时有类似的过程。有没有办法,也许使用 purrr::nest
或类似的东西来创建所需的输出?
1) 这给出了模型拟合的 glance
和 tidy
输出:
library(broom)
make_model <- function(nm) lm(iris[c("Sepal.Length", nm)])
fits <- Map(make_model, iris_vars)
glance_tidy <- function(x) c(unlist(glance(x)), unlist(tidy(x)[, -1]))
out <- sapply(fits, glance_tidy)
1a) 或作为 magrittr 管道:
library(magrittr)
out <- iris_vars %>% Map(f = make_model) %>% sapply(glance_tidy)
其中一个给出了以下矩阵:
> out
Sepal.Width Petal.Length Petal.Width
r.squared 1.382265e-02 7.599546e-01 6.690277e-01
adj.r.squared 7.159294e-03 7.583327e-01 6.667914e-01
sigma 8.250966e-01 4.070745e-01 4.779948e-01
statistic 2.074427e+00 4.685502e+02 2.991673e+02
p.value 1.518983e-01 1.038667e-47 2.325498e-37
df 2.000000e+00 2.000000e+00 2.000000e+00
logLik -1.829958e+02 -7.702021e+01 -1.011107e+02
AIC 3.719917e+02 1.600404e+02 2.082215e+02
BIC 3.810236e+02 1.690723e+02 2.172534e+02
deviance 1.007561e+02 2.452503e+01 3.381489e+01
df.residual 1.480000e+02 1.480000e+02 1.480000e+02
estimate1 6.526223e+00 4.306603e+00 4.777629e+00
estimate2 -2.233611e-01 4.089223e-01 8.885803e-01
std.error1 4.788963e-01 7.838896e-02 7.293476e-02
std.error2 1.550809e-01 1.889134e-02 5.137355e-02
statistic1 1.362763e+01 5.493890e+01 6.550552e+01
statistic2 -1.440287e+00 2.164602e+01 1.729645e+01
p.value1 6.469702e-28 2.426713e-100 3.340431e-111
p.value2 1.518983e-01 1.038667e-47 2.325498e-37
或转置:
> t(out)
r.squared adj.r.squared sigma statistic p.value df
Sepal.Width 0.01382265 0.007159294 0.8250966 2.074427 1.518983e-01 2
Petal.Length 0.75995465 0.758332718 0.4070745 468.550154 1.038667e-47 2
Petal.Width 0.66902769 0.666791387 0.4779948 299.167312 2.325498e-37 2
logLik AIC BIC deviance df.residual estimate1
Sepal.Width -182.99584 371.9917 381.0236 100.75610 148 6.526223
Petal.Length -77.02021 160.0404 169.0723 24.52503 148 4.306603
Petal.Width -101.11073 208.2215 217.2534 33.81489 148 4.777629
estimate2 std.error1 std.error2 statistic1 statistic2
Sepal.Width -0.2233611 0.47889634 0.15508093 13.62763 -1.440287
Petal.Length 0.4089223 0.07838896 0.01889134 54.93890 21.646019
Petal.Width 0.8885803 0.07293476 0.05137355 65.50552 17.296454
p.value1 p.value2
Sepal.Width 6.469702e-28 1.518983e-01
Petal.Length 2.426713e-100 1.038667e-47
Petal.Width 3.340431e-111 2.325498e-37
2) 如果我们从 glance_tidy
函数定义中删除第一个 unlist 然后我们得到一个二维列表(而不是二维数字矩阵):
glance_tidy_l <- function(x) c(glance(x), unlist(tidy(x)[, -1]))
iris_vars %>% Map(f = make_model) %>% sapply(glance_tidy_l)
Sepal.Width Petal.Length Petal.Width
r.squared 0.01382265 0.7599546 0.6690277
adj.r.squared 0.007159294 0.7583327 0.6667914
sigma 0.8250966 0.4070745 0.4779948
statistic 2.074427 468.5502 299.1673
p.value 0.1518983 1.038667e-47 2.325498e-37
df 2 2 2
logLik -182.9958 -77.02021 -101.1107
AIC 371.9917 160.0404 208.2215
BIC 381.0236 169.0723 217.2534
deviance 100.7561 24.52503 33.81489
df.residual 148 148 148
estimate1 6.526223 4.306603 4.777629
estimate2 -0.2233611 0.4089223 0.8885803
std.error1 0.4788963 0.07838896 0.07293476
std.error2 0.1550809 0.01889134 0.05137355
statistic1 13.62763 54.9389 65.50552
statistic2 -1.440287 21.64602 17.29645
p.value1 6.469702e-28 2.426713e-100 3.340431e-111
p.value2 0.1518983 1.038667e-47 2.325498e-37
如果您准备开始使用列表列设置准嵌套数据框,那么 map/model/unnest/tidy 这一步会非常顺利。
首先,设置您的数据框:
> library(dplyr)
>
> nested_df <- data_frame(data = list(iris %>%
select(response = Sepal.Length,
predictor = Sepal.Width),
iris %>%
select(response = Sepal.Length,
predictor = Petal.Length),
iris %>%
select(response = Sepal.Length,
predictor = Petal.Width)))
>
> nested_df
# A tibble: 3 × 1
data
<list>
1 <data.frame [150 × 2]>
2 <data.frame [150 × 2]>
3 <data.frame [150 × 2]>
现在使用 purrr、tidyr 和 broom 得到建模的结果。
> library(tidyr)
> library(purrr)
> library(broom)
>
> nested_df %>%
mutate(models = map(data, ~ lm(response ~ predictor, .))) %>%
unnest(map(models, tidy))
# A tibble: 6 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 6.5262226 0.47889634 13.627631 6.469702e-28
2 predictor -0.2233611 0.15508093 -1.440287 1.518983e-01
3 (Intercept) 4.3066034 0.07838896 54.938900 2.426713e-100
4 predictor 0.4089223 0.01889134 21.646019 1.038667e-47
5 (Intercept) 4.7776294 0.07293476 65.505517 3.340431e-111
6 predictor 0.8885803 0.05137355 17.296454 2.325498e-37
您可以使用 filter
仅提取斜率 (term == "predictor"
),或者您可以在最后一行代码中使用 glance
而不是 tidy
来获取这些结果。
quasi-nested 数据框的另一个选项是使用 purrr::map_df 来快速处理大量预测变量。这也可以为您提供预测器作为封闭数据框中的一列。然后按照 Julia 的示例来拟合模型。
> library(dplyr)
> library(purrr)
>
> def_nested_df <- function(x) {
data_frame("covariate" = x,
"data" = list(iris %>% tbl_df %>%
select_("response" = "Sepal.Length",
"predictor" = x)))
}
>
> nested_df <-
c("Sepal.Width", "Petal.Length", "Petal.Width") %>%
map_df(def_nested_df)
>
> nested_df
# A tibble: 3 × 2
covariate data
<chr> <list>
1 Sepal.Width <tibble [150 × 2]>
2 Petal.Length <tibble [150 × 2]>
3 Petal.Width <tibble [150 × 2]>
>
> nested_df[[1, "data"]]
# A tibble: 150 × 2
response predictor
<dbl> <dbl>
1 5.1 3.5
2 4.9 3.0
3 4.7 3.2
4 4.6 3.1
5 5.0 3.6
6 5.4 3.9
7 4.6 3.4
8 5.0 3.4
9 4.4 2.9
10 4.9 3.1
# ... with 140 more rows
我的回答在本质上与 Julia Silge 和所见即所得的相似,但我想避免手动输入变量名称,并在模型对象的公式中保留响应和预测变量的名称:
require(tibble)
require(dplyr)
require(tidyr)
require(purrr)
require(broom)
df <- iris
response_var <- "Sepal.Length"
vars <- tibble(response=response_var,
predictor=setdiff(names(df), response_var))
compose_formula <- function(x, y)
as.formula(paste0("~lm(", y, "~", x, ", data=.)"))
models <- tibble(data=list(df)) %>%
crossing(vars) %>%
mutate(fmla = map2(predictor, response, compose_formula),
model = map2(data, fmla, ~at_depth(.x, 0, .y)))
models %>% unnest(map(model, tidy))
models %>% unnest(map(model, glance), .drop=T)
输出:
# A tibble: 9 x 7
response predictor term estimate std.error statistic
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Sepal.Length Sepal.Width (Intercept) 6.5262226 0.47889634 13.627631
2 Sepal.Length Sepal.Width Sepal.Width -0.2233611 0.15508093 -1.440287
3 Sepal.Length Petal.Length (Intercept) 4.3066034 0.07838896 54.938900
4 Sepal.Length Petal.Length Petal.Length 0.4089223 0.01889134 21.646019
5 Sepal.Length Petal.Width (Intercept) 4.7776294 0.07293476 65.505517
6 Sepal.Length Petal.Width Petal.Width 0.8885803 0.05137355 17.296454
7 Sepal.Length Species (Intercept) 5.0060000 0.07280222 68.761639
8 Sepal.Length Species Speciesversicolor 0.9300000 0.10295789 9.032819
9 Sepal.Length Species Speciesvirginica 1.5820000 0.10295789 15.365506
# ... with 1 more variables: p.value <dbl>
和
# A tibble: 4 x 13
response predictor r.squared adj.r.squared sigma statistic
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Sepal.Length Sepal.Width 0.01382265 0.007159294 0.8250966 2.074427
2 Sepal.Length Petal.Length 0.75995465 0.758332718 0.4070745 468.550154
3 Sepal.Length Petal.Width 0.66902769 0.666791387 0.4779948 299.167312
4 Sepal.Length Species 0.61870573 0.613518054 0.5147894 119.264502
# ... with 7 more variables: p.value <dbl>, df <int>, logLik <dbl>, AIC <dbl>,
# BIC <dbl>, deviance <dbl>, df.residual <int>