提取 R 中多个不变自变量的系数和 p 值列表
Extract lists of coefficients and p-values for multiple invariant independent variables in R
我想用同一个模型做 1104 次线性回归。自变量保持不变,而因变量每次都不同——事实上,我有 1104 个因变量。我不知道如何提取所有系数(包括截距)和 p 值,以便计算每一项的系数均值和 p 值均值。有没有简单的方法做到这一点?这是我的模型:
# Fit one linear model per dependent variable: columns 101 to 1204 of
# `recexp` are used in turn as the response, always regressed on the same
# eleven predictor columns. The result is a list of fitted lm objects.
testMCFG1 <- lapply(
  101:1204,
  function(i) {
    lm(
      recexp[,i]~recexp[,"rm"] + recexp[,"zdy"] + recexp[,"ztbl"] +
        recexp[,"ztms"] + recexp[,"zdfy"] + recexp[,"rm_zdy"] +
        recexp[,"rm_ztbl"] + recexp[,"rm_ztms"] + recexp[,"rm_zdfy"] +
        recexp[,"contexte"] + recexp[,"rm_contexte"]
    )
  }
)
不过,这里已经有人向我演示了在只有一个固定自变量的情况下如何做到这一点,而且确实可行。该情形的代码如下:
# Single fixed predictor placed on the right-hand side of every model.
y <- 'rm'
# Names of columns 101 to 1204 of `recexp`; each one is used as a response.
x <- names(recexp[101:1204])
# Fit one lm() per response name; the resulting list is named by response.
models <- map(setNames(x, x),
~ lm(as.formula(paste(.x, y, sep="~")),
data=recexp))
# Despite its name, `pvalues` holds R-squared and sigma from each model's
# summary, one row per model; the row names are moved into `which_dependent`.
pvalues <-
data.frame(rsquared = unlist(map(models, ~ summary(.)$r.squared)),
RSE = unlist(map(models, ~ summary(.)$sigma))) %>%
rownames_to_column(var = "which_dependent")
# NOTE(review): `basic_information` is not defined anywhere in this snippet —
# presumably the per-term coefficient table; confirm before running.
results <- full_join(basic_information, pvalues)
# Average the coefficient estimates and the p-values within each model term.
results %>% group_by(term) %>% summarise(mean_estimate = mean(estimate))
results %>% group_by(term) %>% summarise(mean_p = mean(p.value))
这是一个使用多个 tidyverse 包的解决方案。您没有提供数据,因此我将使用 mtcars
作为示例。将您的自变量放入一个名为 independents
的固定字符串中,然后通过列切片获取因变量的名称,就像您在代码中生成字符向量时
一样
#####
# Right-hand side shared by every model, kept as one formula fragment.
independents <- "mpg + vs + am + gear"
# Response variables: the names of columns 2 through 7 of mtcars.
dependent <- colnames(mtcars)[2:7]
加载库
library(dplyr)
library(purrr)
library(broom)
library(tidyr)
library(tibble)
使用 purrr::map
列出所有模型
# Fit one lm() per dependent variable against the shared predictors.
# The input vector is named by itself so the resulting list of models is
# named by dependent variable.
models <- map(
  setNames(dependent, dependent),
  function(response) {
    lm(as.formula(paste0(response, "~", independents)), data = mtcars)
  }
)
获取 lm
模型列表并将其传给 broom::tidy
以提取 beta 估计值、p 值等基本信息。为了保持整洁,请使用列表项的名称(即因变量)并将其添加为一列。去掉截距项 (Intercept) 两边的括号并在前面加一个零,这样它排序时总是排在第一位,也表明它是 beta0
# Tidy every fitted model with broom::tidy() and stack the per-model tables
# into one long tibble. `.id` stores each list name (the dependent variable)
# in a leading `which_dependent` column.
basics <-
  map(models, broom::tidy) %>%
  bind_rows(.id = "which_dependent") %>%
  # Relabel "(Intercept)" as "0Intercept" so it sorts ahead of the alphabetic
  # term names. fixed = TRUE matches the parentheses literally; the earlier
  # pattern "\(Intercept\)" is not a valid R string literal because "\(" is
  # an unrecognized escape, so the code did not even parse.
  mutate(term = gsub("(Intercept)", "0Intercept", term, fixed = TRUE))
再次对模型列表进行处理,这次提取 R 平方和 sigma(即“残差标准误差”)
# One row per model: R-squared and sigma (residual standard error) taken
# from summary(). The model names arrive as row names and are moved into
# the `which_dependent` column.
model_summary <-
  data.frame(
    rsquared = map_dbl(models, function(fit) summary(fit)$r.squared),
    RSE = map_dbl(models, function(fit) summary(fit)$sigma)
  ) %>%
  rownames_to_column(var = "which_dependent")
按因变量(which_dependent 列)将两者连接起来
# Attach the per-model statistics to every coefficient row; the join key is
# chosen automatically from the shared column, as the message below reports.
results <- basics %>% full_join(model_summary)
#> Joining, by = "which_dependent"
results
#> # A tibble: 30 x 8
#> which_dependent term estimate std.error statistic p.value rsquared RSE
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 cyl 0Inter… 10.4 1.14 9.13 9.58e-10 0.861 0.714
#> 2 cyl mpg -0.117 0.0382 -3.06 4.98e- 3 0.861 0.714
#> 3 cyl vs -1.80 0.374 -4.81 5.09e- 5 0.861 0.714
#> 4 cyl am -0.414 0.502 -0.826 4.16e- 1 0.861 0.714
#> 5 cyl gear -0.258 0.290 -0.891 3.81e- 1 0.861 0.714
#> 6 disp 0Inter… 571. 94.1 6.07 1.76e- 6 0.804 58.8
#> 7 disp mpg -9.50 3.14 -3.02 5.47e- 3 0.804 58.8
#> 8 disp vs -85.9 30.8 -2.79 9.49e- 3 0.804 58.8
#> 9 disp am -31.9 41.3 -0.774 4.45e- 1 0.804 58.8
#> 10 disp gear -26.8 23.9 -1.12 2.71e- 1 0.804 58.8
#> # … with 20 more rows
它是长格式的,因此您可以执行按 term
分组的汇总等操作
# Mean p-value for each model term across all fitted models, sorted by term.
arrange(
  summarise(group_by(results, term), mean_p = mean(p.value)),
  term
)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 5 x 2
#> term mean_p
#> <chr> <dbl>
#> 1 0Intercept 0.000168
#> 2 am 0.359
#> 3 gear 0.287
#> 4 mpg 0.0538
#> 5 vs 0.159
或者如果你愿意,你可以把它加宽...
# Reshape to one row per dependent variable: every statistic column from
# `estimate` through `p.value` is spread into one column per term.
wide_results <- pivot_wider(
  results,
  names_from = term,
  values_from = estimate:p.value
)
wide_results
#> # A tibble: 6 x 23
#> which_dependent rsquared RSE estimate_0Inter… estimate_mpg estimate_vs
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 cyl 0.861 0.714 10.4 -0.117 -1.80
#> 2 disp 0.804 58.8 571. -9.50 -85.9
#> 3 hp 0.736 37.7 241. -8.17 -41.4
#> 4 drat 0.667 0.331 2.07 0.0228 0.166
#> 5 wt 0.804 0.464 5.90 -0.104 -0.146
#> 6 qsec 0.734 0.988 17.5 0.0894 2.29
#> # … with 17 more variables: estimate_am <dbl>, estimate_gear <dbl>,
#> # std.error_0Intercept <dbl>, std.error_mpg <dbl>, std.error_vs <dbl>,
#> # std.error_am <dbl>, std.error_gear <dbl>, statistic_0Intercept <dbl>,
#> # statistic_mpg <dbl>, statistic_vs <dbl>, statistic_am <dbl>,
#> # statistic_gear <dbl>, p.value_0Intercept <dbl>, p.value_mpg <dbl>,
#> # p.value_vs <dbl>, p.value_am <dbl>, p.value_gear <dbl>
names(wide_results)
#> [1] "which_dependent" "rsquared" "RSE"
#> [4] "estimate_0Intercept" "estimate_mpg" "estimate_vs"
#> [7] "estimate_am" "estimate_gear" "std.error_0Intercept"
#> [10] "std.error_mpg" "std.error_vs" "std.error_am"
#> [13] "std.error_gear" "statistic_0Intercept" "statistic_mpg"
#> [16] "statistic_vs" "statistic_am" "statistic_gear"
#> [19] "p.value_0Intercept" "p.value_mpg" "p.value_vs"
#> [22] "p.value_am" "p.value_gear"
我想用同一个模型做 1104 次线性回归。自变量保持不变,而因变量每次都不同——事实上,我有 1104 个因变量。我不知道如何提取所有系数(包括截距)和 p 值,以便计算每一项的系数均值和 p 值均值。有没有简单的方法做到这一点?这是我的模型:
# Fit one linear model per dependent variable: columns 101 to 1204 of
# `recexp` are used in turn as the response, always regressed on the same
# eleven predictor columns. The result is a list of fitted lm objects.
testMCFG1 <- lapply(
  101:1204,
  function(i) {
    lm(
      recexp[,i]~recexp[,"rm"] + recexp[,"zdy"] + recexp[,"ztbl"] +
        recexp[,"ztms"] + recexp[,"zdfy"] + recexp[,"rm_zdy"] +
        recexp[,"rm_ztbl"] + recexp[,"rm_ztms"] + recexp[,"rm_zdfy"] +
        recexp[,"contexte"] + recexp[,"rm_contexte"]
    )
  }
)
不过,这里已经有人向我演示了在只有一个固定自变量的情况下如何做到这一点,而且确实可行。该情形的代码如下:
# Single fixed predictor placed on the right-hand side of every model.
y <- 'rm'
# Names of columns 101 to 1204 of `recexp`; each one is used as a response.
x <- names(recexp[101:1204])
# Fit one lm() per response name; the resulting list is named by response.
models <- map(setNames(x, x),
~ lm(as.formula(paste(.x, y, sep="~")),
data=recexp))
# Despite its name, `pvalues` holds R-squared and sigma from each model's
# summary, one row per model; the row names are moved into `which_dependent`.
pvalues <-
data.frame(rsquared = unlist(map(models, ~ summary(.)$r.squared)),
RSE = unlist(map(models, ~ summary(.)$sigma))) %>%
rownames_to_column(var = "which_dependent")
# NOTE(review): `basic_information` is not defined anywhere in this snippet —
# presumably the per-term coefficient table; confirm before running.
results <- full_join(basic_information, pvalues)
# Average the coefficient estimates and the p-values within each model term.
results %>% group_by(term) %>% summarise(mean_estimate = mean(estimate))
results %>% group_by(term) %>% summarise(mean_p = mean(p.value))
这是一个使用多个 tidyverse 包的解决方案。您没有提供数据,因此我将使用 mtcars
作为示例。将您的自变量放入一个名为 independents
的固定字符串中,然后通过列切片获取因变量的名称,就像您在代码中生成字符向量时一样
#####
# Right-hand side shared by every model, kept as one formula fragment.
independents <- "mpg + vs + am + gear"
# Response variables: the names of columns 2 through 7 of mtcars.
dependent <- colnames(mtcars)[2:7]
加载库
library(dplyr)
library(purrr)
library(broom)
library(tidyr)
library(tibble)
使用 purrr::map 列出所有模型
# Fit one lm() per dependent variable against the shared predictors.
# The input vector is named by itself so the resulting list of models is
# named by dependent variable.
models <- map(
  setNames(dependent, dependent),
  function(response) {
    lm(as.formula(paste0(response, "~", independents)), data = mtcars)
  }
)
获取 lm
模型列表并将其传给 broom::tidy
以提取 beta 估计值、p 值等基本信息。为了保持整洁,请使用列表项的名称(即因变量)并将其添加为一列。去掉截距项 (Intercept) 两边的括号并在前面加一个零,这样它排序时总是排在第一位,也表明它是 beta0
# Tidy every fitted model with broom::tidy() and stack the per-model tables
# into one long tibble. `.id` stores each list name (the dependent variable)
# in a leading `which_dependent` column.
basics <-
  map(models, broom::tidy) %>%
  bind_rows(.id = "which_dependent") %>%
  # Relabel "(Intercept)" as "0Intercept" so it sorts ahead of the alphabetic
  # term names. fixed = TRUE matches the parentheses literally; the earlier
  # pattern "\(Intercept\)" is not a valid R string literal because "\(" is
  # an unrecognized escape, so the code did not even parse.
  mutate(term = gsub("(Intercept)", "0Intercept", term, fixed = TRUE))
再次对模型列表进行处理,这次提取 R 平方和 sigma(即“残差标准误差”)
# One row per model: R-squared and sigma (residual standard error) taken
# from summary(). The model names arrive as row names and are moved into
# the `which_dependent` column.
model_summary <-
  data.frame(
    rsquared = map_dbl(models, function(fit) summary(fit)$r.squared),
    RSE = map_dbl(models, function(fit) summary(fit)$sigma)
  ) %>%
  rownames_to_column(var = "which_dependent")
按因变量(which_dependent 列)将两者连接起来
# Attach the per-model statistics to every coefficient row; the join key is
# chosen automatically from the shared column, as the message below reports.
results <- basics %>% full_join(model_summary)
#> Joining, by = "which_dependent"
results
#> # A tibble: 30 x 8
#> which_dependent term estimate std.error statistic p.value rsquared RSE
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 cyl 0Inter… 10.4 1.14 9.13 9.58e-10 0.861 0.714
#> 2 cyl mpg -0.117 0.0382 -3.06 4.98e- 3 0.861 0.714
#> 3 cyl vs -1.80 0.374 -4.81 5.09e- 5 0.861 0.714
#> 4 cyl am -0.414 0.502 -0.826 4.16e- 1 0.861 0.714
#> 5 cyl gear -0.258 0.290 -0.891 3.81e- 1 0.861 0.714
#> 6 disp 0Inter… 571. 94.1 6.07 1.76e- 6 0.804 58.8
#> 7 disp mpg -9.50 3.14 -3.02 5.47e- 3 0.804 58.8
#> 8 disp vs -85.9 30.8 -2.79 9.49e- 3 0.804 58.8
#> 9 disp am -31.9 41.3 -0.774 4.45e- 1 0.804 58.8
#> 10 disp gear -26.8 23.9 -1.12 2.71e- 1 0.804 58.8
#> # … with 20 more rows
它是长格式的,因此您可以执行按 term 分组的汇总等操作
# Mean p-value for each model term across all fitted models, sorted by term.
arrange(
  summarise(group_by(results, term), mean_p = mean(p.value)),
  term
)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 5 x 2
#> term mean_p
#> <chr> <dbl>
#> 1 0Intercept 0.000168
#> 2 am 0.359
#> 3 gear 0.287
#> 4 mpg 0.0538
#> 5 vs 0.159
或者如果你愿意,你可以把它加宽...
# Reshape to one row per dependent variable: every statistic column from
# `estimate` through `p.value` is spread into one column per term.
wide_results <- pivot_wider(
  results,
  names_from = term,
  values_from = estimate:p.value
)
wide_results
#> # A tibble: 6 x 23
#> which_dependent rsquared RSE estimate_0Inter… estimate_mpg estimate_vs
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 cyl 0.861 0.714 10.4 -0.117 -1.80
#> 2 disp 0.804 58.8 571. -9.50 -85.9
#> 3 hp 0.736 37.7 241. -8.17 -41.4
#> 4 drat 0.667 0.331 2.07 0.0228 0.166
#> 5 wt 0.804 0.464 5.90 -0.104 -0.146
#> 6 qsec 0.734 0.988 17.5 0.0894 2.29
#> # … with 17 more variables: estimate_am <dbl>, estimate_gear <dbl>,
#> # std.error_0Intercept <dbl>, std.error_mpg <dbl>, std.error_vs <dbl>,
#> # std.error_am <dbl>, std.error_gear <dbl>, statistic_0Intercept <dbl>,
#> # statistic_mpg <dbl>, statistic_vs <dbl>, statistic_am <dbl>,
#> # statistic_gear <dbl>, p.value_0Intercept <dbl>, p.value_mpg <dbl>,
#> # p.value_vs <dbl>, p.value_am <dbl>, p.value_gear <dbl>
names(wide_results)
#> [1] "which_dependent" "rsquared" "RSE"
#> [4] "estimate_0Intercept" "estimate_mpg" "estimate_vs"
#> [7] "estimate_am" "estimate_gear" "std.error_0Intercept"
#> [10] "std.error_mpg" "std.error_vs" "std.error_am"
#> [13] "std.error_gear" "statistic_0Intercept" "statistic_mpg"
#> [16] "statistic_vs" "statistic_am" "statistic_gear"
#> [19] "p.value_0Intercept" "p.value_mpg" "p.value_vs"
#> [22] "p.value_am" "p.value_gear"