运行 n 个线性回归,只需几行代码并将结果存储在矩阵中
Running n linear regressions with few lines of code and storing results in a matrix
我有 25 个因变量 (db[,2:26]
) 的小标题 db
和一个解释变量的向量 rmrf
。我想要做的就是 运行 对同一个共同解释变量的 25 个因变量中的每一个进行回归。
我想获得一个 table 的 alpha、betas,t.stat 的 alpha 和 R2,因此是一个 25 行(每个因变量一行)和 4 列的矩阵。
然而,尽管这似乎是一个非常简单的问题(我是R的新手),但我不明白:
- 如何在几行代码中巧妙地 运行 所有 25 个回归 [
loop
, apply
?]
- 如何提取所需的 4 个数量。
虽然对于第一个问题我可能有一个解决方案(虽然不确定!):
varlist <- names(db)[2:26] #the 25 dependent variables
models <- lapply(varlist, function(x) {
lm(substitute(i ~ rmrf, list(i = as.name(x))), data = db)
})
第二个我还不知道(除了用了lm
class的函数coefficient()
,但还是不能把其他2个量合起来)
你能帮我解决这个问题吗?
lm
跨因变量向量化:
随心所欲
lm(as.matrix(db[,-1]) ~ rmrf, data = db)
例如。让我们以 iris 数据集为例,如果我们取 Petal.Width
是自变量而前 3 个变量是因变量,那么我们可以这样做:
dat <- iris[-5]
library(tidyverse)
library(broom)
lm(as.matrix(dat[-4]) ~ Petal.Width, dat) %>%
{cbind.data.frame(tidy(.)%>%
pivot_wider(response, names_from = term,
values_from = c(estimate, statistic)),
R.sq = map_dbl(summary(.),~.x$r.squared))}%>%
`rownames<-`(NULL)
response estimate_(Intercept) estimate_Petal.Width statistic_(Intercept) statistic_Petal.Width R.sq
1 Sepal.Length 4.777629 0.8885803 65.50552 17.296454 0.6690277
2 Sepal.Width 3.308426 -0.2093598 53.27795 -4.786461 0.1340482
3 Petal.Length 1.083558 2.2299405 14.84998 43.387237 0.9271098
如果我没看错,你想为数据集中的每一对独立~依赖应用 LM。您可以像这样使用 pivot/nest/broom 策略:
library(tidyverse)
library(broom)
# creating some dataset
db <- tibble(
y = rnorm(5),
x1 = rnorm(5),
x2 = rnorm(5),
x3 = rnorm(5)
)
# lets see
head(db)
# A tibble: 5 x 4
y x1 x2 x3
<dbl> <dbl> <dbl> <dbl>
1 -0.994 0.139 -0.935 0.0134
2 1.09 0.960 1.23 1.45
3 1.03 0.374 1.06 -0.900
4 1.63 -0.162 -0.498 -0.740
5 -0.0941 1.47 0.312 0.933
# pivot to long format by "independend var"
db_pivot <- db %>%
gather(key = "var_name", value = "value", -y)
head(db_pivot)
# A tibble: 6 x 3
y var_name value
<dbl> <chr> <dbl>
1 -0.368 x1 -1.29
2 -1.48 x1 -0.0813
3 -2.61 x1 0.477
4 0.602 x1 -0.525
5 -0.264 x1 0.0598
6 -0.368 x2 -0.573
# pipeline
resp <- db_pivot %>%
group_by(var_name) %>% # for each var group
nest() %>% # nest the dataset
mutate(lm_model=map(data,function(.x){ # apply lm for each dataset
lm(y~., data=.x)
})) %>%
mutate( # for each lm model fitted
coef_stats = map(lm_model, tidy), # use broom to extract coef statistics from lm model
model_stats = map(lm_model, glance) # use broom to extract regression stats from lm model
)
head(resp)
# A tibble: 3 x 5
# Groups: var_name [3]
var_name data lm_model coef_stats model_stats
<chr> <list> <list> <list> <list>
1 x1 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
2 x2 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
3 x3 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
# coefs
resp %>%
unnest(coef_stats) %>%
select(-data,-lm_model, -model_stats)
# A tibble: 6 x 6
# Groups: var_name [3]
var_name term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 x1 (Intercept) -1.14 0.548 -2.08 0.129
2 x1 value -1.16 0.829 -1.40 0.257
3 x2 (Intercept) -0.404 0.372 -1.09 0.356
4 x2 value -0.985 0.355 -2.77 0.0694
5 x3 (Intercept) -0.707 0.755 -0.936 0.418
6 x3 value -0.206 0.725 -0.284 0.795
# R2
resp %>%
unnest(model_stats) %>%
select(-data,-lm_model, -coef_stats)
# A tibble: 3 x 12
# Groups: var_name [3]
var_name r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
1 x1 0.394 0.192 1.12 1.95 0.257 2 -6.37 18.7 17.6 3.74 3
2 x2 0.719 0.626 0.760 7.69 0.0694 2 -4.44 14.9 13.7 1.73 3
3 x3 0.0261 -0.298 1.42 0.0805 0.795 2 -7.55 21.1 19.9 6.01 3
我有 25 个因变量 (db[,2:26]
) 的小标题 db
和一个解释变量的向量 rmrf
。我想要做的就是 运行 对同一个共同解释变量的 25 个因变量中的每一个进行回归。
我想获得一个 table 的 alpha、betas,t.stat 的 alpha 和 R2,因此是一个 25 行(每个因变量一行)和 4 列的矩阵。
然而,尽管这似乎是一个非常简单的问题(我是R的新手),但我不明白:
- 如何在几行代码中巧妙地 运行 所有 25 个回归 [
loop
,apply
?] - 如何提取所需的 4 个数量。
虽然对于第一个问题我可能有一个解决方案(虽然不确定!):
varlist <- names(db)[2:26] #the 25 dependent variables
models <- lapply(varlist, function(x) {
lm(substitute(i ~ rmrf, list(i = as.name(x))), data = db)
})
第二个我还不知道(除了用了lm
class的函数coefficient()
,但还是不能把其他2个量合起来)
你能帮我解决这个问题吗?
lm
跨因变量向量化:
随心所欲
lm(as.matrix(db[,-1]) ~ rmrf, data = db)
例如。让我们以 iris 数据集为例,如果我们取 Petal.Width
是自变量而前 3 个变量是因变量,那么我们可以这样做:
dat <- iris[-5]
library(tidyverse)
library(broom)
lm(as.matrix(dat[-4]) ~ Petal.Width, dat) %>%
{cbind.data.frame(tidy(.)%>%
pivot_wider(response, names_from = term,
values_from = c(estimate, statistic)),
R.sq = map_dbl(summary(.),~.x$r.squared))}%>%
`rownames<-`(NULL)
response estimate_(Intercept) estimate_Petal.Width statistic_(Intercept) statistic_Petal.Width R.sq
1 Sepal.Length 4.777629 0.8885803 65.50552 17.296454 0.6690277
2 Sepal.Width 3.308426 -0.2093598 53.27795 -4.786461 0.1340482
3 Petal.Length 1.083558 2.2299405 14.84998 43.387237 0.9271098
如果我没看错,你想为数据集中的每一对独立~依赖应用 LM。您可以像这样使用 pivot/nest/broom 策略:
library(tidyverse)
library(broom)
# creating some dataset
db <- tibble(
y = rnorm(5),
x1 = rnorm(5),
x2 = rnorm(5),
x3 = rnorm(5)
)
# lets see
head(db)
# A tibble: 5 x 4
y x1 x2 x3
<dbl> <dbl> <dbl> <dbl>
1 -0.994 0.139 -0.935 0.0134
2 1.09 0.960 1.23 1.45
3 1.03 0.374 1.06 -0.900
4 1.63 -0.162 -0.498 -0.740
5 -0.0941 1.47 0.312 0.933
# pivot to long format by "independend var"
db_pivot <- db %>%
gather(key = "var_name", value = "value", -y)
head(db_pivot)
# A tibble: 6 x 3
y var_name value
<dbl> <chr> <dbl>
1 -0.368 x1 -1.29
2 -1.48 x1 -0.0813
3 -2.61 x1 0.477
4 0.602 x1 -0.525
5 -0.264 x1 0.0598
6 -0.368 x2 -0.573
# pipeline
resp <- db_pivot %>%
group_by(var_name) %>% # for each var group
nest() %>% # nest the dataset
mutate(lm_model=map(data,function(.x){ # apply lm for each dataset
lm(y~., data=.x)
})) %>%
mutate( # for each lm model fitted
coef_stats = map(lm_model, tidy), # use broom to extract coef statistics from lm model
model_stats = map(lm_model, glance) # use broom to extract regression stats from lm model
)
head(resp)
# A tibble: 3 x 5
# Groups: var_name [3]
var_name data lm_model coef_stats model_stats
<chr> <list> <list> <list> <list>
1 x1 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
2 x2 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
3 x3 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
# coefs
resp %>%
unnest(coef_stats) %>%
select(-data,-lm_model, -model_stats)
# A tibble: 6 x 6
# Groups: var_name [3]
var_name term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 x1 (Intercept) -1.14 0.548 -2.08 0.129
2 x1 value -1.16 0.829 -1.40 0.257
3 x2 (Intercept) -0.404 0.372 -1.09 0.356
4 x2 value -0.985 0.355 -2.77 0.0694
5 x3 (Intercept) -0.707 0.755 -0.936 0.418
6 x3 value -0.206 0.725 -0.284 0.795
# R2
resp %>%
unnest(model_stats) %>%
select(-data,-lm_model, -coef_stats)
# A tibble: 3 x 12
# Groups: var_name [3]
var_name r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
1 x1 0.394 0.192 1.12 1.95 0.257 2 -6.37 18.7 17.6 3.74 3
2 x2 0.719 0.626 0.760 7.69 0.0694 2 -4.44 14.9 13.7 1.73 3
3 x3 0.0261 -0.298 1.42 0.0805 0.795 2 -7.55 21.1 19.9 6.01 3