提取 R 中多个不变自变量的系数和 p 值列表

Question

我想用同一个模型做 1104 次线性回归。自变量始终不变，但因变量每次都不同——事实上，我有 1104 个因变量。我不知道如何提取所有系数（包括截距）和 p 值，以便计算每个系数的估计值和 p 值的均值。有没有简单的方法可以做到这一点？这是我的模型：

# Fit one lm per response column (columns 101 to 1204 of recexp, i.e. 1104
# models), each against the same eleven predictor columns; the result is a
# list of fitted models in column order.
testMCFG1 <- lapply(101:1204, function(i) lm(recexp[,i]~recexp[,"rm"] + recexp[,"zdy"] + recexp[,"ztbl"] + recexp[,"ztms"] + recexp[,"zdfy"] + recexp[,"rm_zdy"] + recexp[,"rm_ztbl"] + recexp[,"rm_ztms"] + recexp[,"rm_zdfy"] + recexp[,"contexte"] + recexp[,"rm_contexte"]))

不过，这里已经有人向我展示了在只有一个不变自变量的情况下如何做到这一点，而且这种方法可行。该情况的代码如下：

# Name of the single predictor used on the right-hand side of every model.
y <- 'rm'


# Names of the response columns (positions 101 to 1204 of recexp).
# assumes recexp is a data frame, so `[` selects columns — TODO confirm
x <- names(recexp[101:1204])

# Fit one lm per response name; setNames(x, x) makes the resulting list
# carry the response names.
models <- map(setNames(x, x),
              ~ lm(as.formula(paste(.x, y, sep="~")),
                   data=recexp))

# Per-model R-squared and residual standard error (sigma), one row per model,
# with the row names moved into a `which_dependent` column.
# NOTE(review): despite its name, `pvalues` contains no p-values.
pvalues <-
  data.frame(rsquared = unlist(map(models, ~ summary(.)$r.squared)),
             RSE = unlist(map(models, ~ summary(.)$sigma))) %>%
  rownames_to_column(var = "which_dependent")

# NOTE(review): `basic_information` is not defined anywhere in this snippet —
# presumably a per-term coefficient table (e.g. from broom::tidy); confirm.
results <- full_join(basic_information, pvalues)

# Mean coefficient estimate per model term.
results %>% group_by(term) %>% summarise(mean_estimate = mean(estimate))

# Mean p-value per model term.
results %>% group_by(term) %>% summarise(mean_p = mean(p.value))

Answer 1

这是一个使用多个 tidyverse 包的解决方案。您没有提供数据，因此我将使用 mtcars 作为示例。将您的自变量放入一个名为 independents 的固定字符串中，因变量则像您的代码那样通过对列名切片得到一个字符向量。

#####
# Right-hand side shared by every model, kept as a single formula fragment.
independents <- "mpg + vs + am + gear"
# Response variables: the names of columns 2 through 7 of mtcars.
dependent <- names(mtcars)[2:7]

加载库

library(dplyr)
library(purrr)
library(broom)
library(tidyr)
library(tibble)

使用 purrr::map 构建一个包含所有模型的列表

# Fit one linear model per response. Naming the vector by its own values first
# means the resulting list of fits is keyed by response name.
models <-
   dependent %>%
   purrr::set_names() %>%
   map(~ lm(as.formula(paste(.x, independents, sep="~")),
            data=mtcars))

将 lm 模型列表传给 broom::tidy，以提取 beta 估计值、p 值等基本信息。为了保持整洁，使用列表元素的名称（即因变量）并将其添加为一列。去掉截距项名称中的括号并在前面加一个零，这样它排序时总是排在第一位，而且您一眼就知道它是 beta0。

# Build one long coefficient table for all models: one row per
# (response, term) with estimate, std.error, statistic and p.value.
# The names of the `models` list become the leading `which_dependent` column.
basics <-
   map(models, broom::tidy) %>%
   bind_rows(.id = "which_dependent") %>%
   # Relabel the intercept as "0Intercept" so it sorts ahead of the other
   # terms. The match is literal (fixed = TRUE): writing "\(" inside an R
   # string is an invalid escape and fails to parse.
   mutate(term = sub("(Intercept)", "0Intercept", term, fixed = TRUE))

这次再次传入模型列表，提取 R 平方和 sigma（即“残差标准误差”）

# One row per model with its R-squared and residual standard error (sigma).
# map_dbl() returns a named numeric vector directly and errors if any model
# yields something other than a single number, unlike unlist(map(...)).
# The vector names become row names, which are then moved into the
# `which_dependent` column used as the join key.
model_summary <-
   data.frame(
      rsquared = map_dbl(models, ~ summary(.x)$r.squared),
      RSE = map_dbl(models, ~ summary(.x)$sigma)
   ) %>%
   rownames_to_column(var = "which_dependent")

按因变量（which_dependent 列）将两者连接起来

# Attach each model's fit statistics to all of its coefficient rows. No `by`
# is given, so the join uses the shared `which_dependent` column.
results <- basics %>% full_join(model_summary)
#> Joining, by = "which_dependent"
results
#> # A tibble: 30 x 8
#>    which_dependent term    estimate std.error statistic  p.value rsquared    RSE
#>    <chr>           <chr>      <dbl>     <dbl>     <dbl>    <dbl>    <dbl>  <dbl>
#>  1 cyl             0Inter…   10.4      1.14       9.13  9.58e-10    0.861  0.714
#>  2 cyl             mpg       -0.117    0.0382    -3.06  4.98e- 3    0.861  0.714
#>  3 cyl             vs        -1.80     0.374     -4.81  5.09e- 5    0.861  0.714
#>  4 cyl             am        -0.414    0.502     -0.826 4.16e- 1    0.861  0.714
#>  5 cyl             gear      -0.258    0.290     -0.891 3.81e- 1    0.861  0.714
#>  6 disp            0Inter…  571.      94.1        6.07  1.76e- 6    0.804 58.8  
#>  7 disp            mpg       -9.50     3.14      -3.02  5.47e- 3    0.804 58.8  
#>  8 disp            vs       -85.9     30.8       -2.79  9.49e- 3    0.804 58.8  
#>  9 disp            am       -31.9     41.3       -0.774 4.45e- 1    0.804 58.8  
#> 10 disp            gear     -26.8     23.9       -1.12  2.71e- 1    0.804 58.8  
#> # … with 20 more rows

结果是长格式的，因此您可以执行按 term 分组汇总之类的操作

# Average p-value of each term across all fitted models, sorted by term name
# (the "0Intercept" label sorts first).
results %>%
   group_by(term) %>%
   summarise(mean_p = mean(p.value)) %>%
   arrange(term)
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 5 x 2
#>   term         mean_p
#>   <chr>         <dbl>
#> 1 0Intercept 0.000168
#> 2 am         0.359   
#> 3 gear       0.287   
#> 4 mpg        0.0538  
#> 5 vs         0.159

或者，如果您愿意，也可以将其转换为宽格式……

# Reshape to one row per response: each term's statistics fan out into
# columns named <statistic>_<term>.
wide_results <- pivot_wider(
   results,
   names_from = term,
   values_from = c(estimate, std.error, statistic, p.value)
)
wide_results
#> # A tibble: 6 x 23
#>   which_dependent rsquared    RSE estimate_0Inter… estimate_mpg estimate_vs
#>   <chr>              <dbl>  <dbl>            <dbl>        <dbl>       <dbl>
#> 1 cyl                0.861  0.714            10.4       -0.117       -1.80 
#> 2 disp               0.804 58.8             571.        -9.50       -85.9  
#> 3 hp                 0.736 37.7             241.        -8.17       -41.4  
#> 4 drat               0.667  0.331             2.07       0.0228       0.166
#> 5 wt                 0.804  0.464             5.90      -0.104       -0.146
#> 6 qsec               0.734  0.988            17.5        0.0894       2.29 
#> # … with 17 more variables: estimate_am <dbl>, estimate_gear <dbl>,
#> #   std.error_0Intercept <dbl>, std.error_mpg <dbl>, std.error_vs <dbl>,
#> #   std.error_am <dbl>, std.error_gear <dbl>, statistic_0Intercept <dbl>,
#> #   statistic_mpg <dbl>, statistic_vs <dbl>, statistic_am <dbl>,
#> #   statistic_gear <dbl>, p.value_0Intercept <dbl>, p.value_mpg <dbl>,
#> #   p.value_vs <dbl>, p.value_am <dbl>, p.value_gear <dbl>
# List every column of the wide table, including those truncated in the
# printed tibble.
names(wide_results)
#>  [1] "which_dependent"      "rsquared"             "RSE"                 
#>  [4] "estimate_0Intercept"  "estimate_mpg"         "estimate_vs"         
#>  [7] "estimate_am"          "estimate_gear"        "std.error_0Intercept"
#> [10] "std.error_mpg"        "std.error_vs"         "std.error_am"        
#> [13] "std.error_gear"       "statistic_0Intercept" "statistic_mpg"       
#> [16] "statistic_vs"         "statistic_am"         "statistic_gear"      
#> [19] "p.value_0Intercept"   "p.value_mpg"          "p.value_vs"          
#> [22] "p.value_am"           "p.value_gear"

提取 R 中多个不变自变量的系数和 p 值列表

Extract lists of coefficients and p-values for multiple invariant independent variables in R

r

linear-regression

p-value