如何先 group_by(),然后对各列迭代调用 lm()?
How to first group_by() then iterate lm() through columns?
假设我们有一个数据框,其中包含一组由分组变量标记的 3 个因变量和 6 个自变量。使用以下示例代码生成此格式的示例:
library(tidyverse)
library(broom)
# Build a small example data set: one two-level grouping column, three
# responses (y1-y3) on different scales, and six predictors (x1-x6).
# The random draws happen in the same order as the columns are listed.
n <- 15
df <- data.frame(
  groupingvar = sample(c("a", "b"), size = n, replace = TRUE),
  y1 = rnorm(n, mean = 10, sd = 1),
  y2 = rnorm(n, mean = 100, sd = 10),
  y3 = rnorm(n, mean = 1000, sd = 100),
  x1 = rnorm(n, mean = 10, sd = 1),
  x2 = rnorm(n, mean = 10, sd = 1),
  x3 = rnorm(n, mean = 10, sd = 1),
  x4 = rnorm(n, mean = 10, sd = 1),
  x5 = rnorm(n, mean = 10, sd = 1),
  x6 = rnorm(n, mean = 10, sd = 1)
)
# Sort the rows so that each group's observations are contiguous.
df <- df %>% arrange(groupingvar)
如果我想将 y1、y2、y3 中的每一个分别对 x1 到 x6 这组自变量进行回归,可以使用类似下面的方法:
# Multi-response regression on the whole data set: the three response columns
# and the six predictor columns are each converted to a matrix and passed to a
# single lm() call.
y <- df %>% select(y1:y3) %>% as.matrix()
x <- df %>% select(x1:x6) %>% as.matrix()
regs <- lm(y ~ x)
# Tidy the coefficients and order them by response, then by term.
coeffs <- regs %>%
  tidy() %>%
  arrange(response, term)
(通过使用 lm() 帮助中的以下行:"If response is a matrix, a linear model is fitted separately by least-squares to each column of the matrix.")
但是,如果我需要先按分组变量进行分组,然后再应用 lm 函数,那么我不太确定该怎么做。我尝试了以下方法,但它为两组生成了相同的系数集。
# NOTE: this is the attempt that does NOT work as intended. Inside do(), both
# sides of the formula are built from `df` (the full, ungrouped data frame)
# instead of `.` (the rows of the group currently being processed), so the
# identical model is refitted for every group -- which is why every group ends
# up with the same set of coefficients.
regs2 <- df %>% group_by(groupingvar) %>%
do(fit2 = lm(as.matrix(select(df,y1:y3)) ~ as.matrix(select(df,x1:x6))))
# Tidy the per-group fits and order the rows by group, then by response.
coeffs2 <- tidy(regs2,fit2)
coeffs2 <- arrange(coeffs2,groupingvar, response)
在 data.table 中,您可以先 melt(重塑为长格式——将结果变量堆叠到同一列中,而不是分别存储在三列中),然后按 groupingvar 和结果变量分组运行 lm:
library(data.table)
# Convert df to a data.table by reference (no copy is made).
setDT(df)
# Reshape to long format: every column whose name does not contain "y" is kept
# as an identifier, and the remaining columns are stacked into
# variable/value pairs.
# Alternatively, set id.vars = c("groupingvar", paste0("x", 1:6)), etc.
longDT <- melt(
  df,
  id.vars = which(!grepl("y", names(df)))
)
# Helper: take a fitted model, pull out its named coefficient vector, and
# return the two parts separately as a list -- `var` holds the coefficient
# names and `coef` holds the estimates.
coefsplit <- function(reg) {
  estimates <- coef(reg)
  list(var = names(estimates), coef = estimates)
}
# Storing longDT is optional; this step could just as well be chained directly
# onto the output of melt().
# For each (groupingvar, variable) combination, regress `value` on all other
# columns of that subset (.SD) and return the coefficient names and estimates.
longDT[
  ,
  coefsplit(lm(value ~ ., data = .SD)),
  by = c("groupingvar", "variable")
]
# groupingvar variable var coef
# 1: a y1 (Intercept) -3.595564e+03
# 2: a y1 x1 -3.796627e+01
# 3: a y1 x2 -1.557268e+02
# 4: a y1 x3 2.862738e+02
# 5: a y1 x4 1.579548e+02
# ...
# 38: b y3 x2 2.136253e+01
# 39: b y3 x3 -3.810176e+01
# 40: b y3 x4 4.187719e+01
# 41: b y3 x5 -2.586184e+02
# 42: b y3 x6 1.181879e+02
# groupingvar variable var coef
我还找到了一种使用 cbind() 实现此目的的方法,如下所示:
library(tidyverse)
library(broom)
# Build the example data set for the cbind() approach: an integer grouping
# column (1 or 2), three responses (y1-y3) and six predictors (x1-x6).
# The random draws happen in the same order as the columns are listed.
n <- 20
df4 <- data.frame(
  groupingvar = sample(seq_len(2L), size = n, replace = TRUE),
  y1 = rnorm(n, mean = 10, sd = 1),
  y2 = rnorm(n, mean = 100, sd = 10),
  y3 = rnorm(n, mean = 1000, sd = 100),
  x1 = rnorm(n, mean = 10, sd = 1),
  x2 = rnorm(n, mean = 10, sd = 1),
  x3 = rnorm(n, mean = 10, sd = 1),
  x4 = rnorm(n, mean = 10, sd = 1),
  x5 = rnorm(n, mean = 10, sd = 1),
  x6 = rnorm(n, mean = 10, sd = 1)
)
# Sort the rows so that each group's observations are contiguous.
df4 <- df4 %>% arrange(groupingvar)
# Fit one multi-response model per group. cbind() binds the three outcomes
# into a matrix response; `. - groupingvar` uses every remaining column of the
# current group (`.`) as a predictor, excluding the grouping column itself.
regs <- df4 %>%
  group_by(groupingvar) %>%
  do(fit = lm(cbind(y1, y2, y3) ~ . - groupingvar, data = .))
# NOTE(review): tidy() applied to the result of do() relies on broom's tidiers
# for rowwise data frames; newer broom releases appear to have dropped them --
# confirm against the installed broom version.
coeffs <- regs %>% tidy(fit)
假设我们有一个数据框,其中包含一组由分组变量标记的 3 个因变量和 6 个自变量。使用以下示例代码生成此格式的示例:
library(tidyverse)
library(broom)
# Build a small example data set: one two-level grouping column, three
# responses (y1-y3) on different scales, and six predictors (x1-x6).
# The random draws happen in the same order as the columns are listed.
n <- 15
df <- data.frame(
  groupingvar = sample(c("a", "b"), size = n, replace = TRUE),
  y1 = rnorm(n, mean = 10, sd = 1),
  y2 = rnorm(n, mean = 100, sd = 10),
  y3 = rnorm(n, mean = 1000, sd = 100),
  x1 = rnorm(n, mean = 10, sd = 1),
  x2 = rnorm(n, mean = 10, sd = 1),
  x3 = rnorm(n, mean = 10, sd = 1),
  x4 = rnorm(n, mean = 10, sd = 1),
  x5 = rnorm(n, mean = 10, sd = 1),
  x6 = rnorm(n, mean = 10, sd = 1)
)
# Sort the rows so that each group's observations are contiguous.
df <- df %>% arrange(groupingvar)
如果我想将 y1、y2、y3 中的每一个分别对 x1 到 x6 这组自变量进行回归,可以使用类似下面的方法:
# Multi-response regression on the whole data set: the three response columns
# and the six predictor columns are each converted to a matrix and passed to a
# single lm() call.
y <- df %>% select(y1:y3) %>% as.matrix()
x <- df %>% select(x1:x6) %>% as.matrix()
regs <- lm(y ~ x)
# Tidy the coefficients and order them by response, then by term.
coeffs <- regs %>%
  tidy() %>%
  arrange(response, term)
(通过使用 lm() 帮助中的以下行:"If response is a matrix, a linear model is fitted separately by least-squares to each column of the matrix.")
但是,如果我需要先按分组变量进行分组,然后再应用 lm 函数,那么我不太确定该怎么做。我尝试了以下方法,但它为两组生成了相同的系数集。
# NOTE: this is the attempt that does NOT work as intended. Inside do(), both
# sides of the formula are built from `df` (the full, ungrouped data frame)
# instead of `.` (the rows of the group currently being processed), so the
# identical model is refitted for every group -- which is why every group ends
# up with the same set of coefficients.
regs2 <- df %>% group_by(groupingvar) %>%
do(fit2 = lm(as.matrix(select(df,y1:y3)) ~ as.matrix(select(df,x1:x6))))
# Tidy the per-group fits and order the rows by group, then by response.
coeffs2 <- tidy(regs2,fit2)
coeffs2 <- arrange(coeffs2,groupingvar, response)
在 data.table 中,您可以先 melt(重塑为长格式——将结果变量堆叠到同一列中,而不是分别存储在三列中),然后按 groupingvar 和结果变量分组运行 lm:
library(data.table)
# Convert df to a data.table by reference (no copy is made).
setDT(df)
# Reshape to long format: every column whose name does not contain "y" is kept
# as an identifier, and the remaining columns are stacked into
# variable/value pairs.
# Alternatively, set id.vars = c("groupingvar", paste0("x", 1:6)), etc.
longDT <- melt(
  df,
  id.vars = which(!grepl("y", names(df)))
)
# Helper: take a fitted model, pull out its named coefficient vector, and
# return the two parts separately as a list -- `var` holds the coefficient
# names and `coef` holds the estimates.
coefsplit <- function(reg) {
  estimates <- coef(reg)
  list(var = names(estimates), coef = estimates)
}
# Storing longDT is optional; this step could just as well be chained directly
# onto the output of melt().
# For each (groupingvar, variable) combination, regress `value` on all other
# columns of that subset (.SD) and return the coefficient names and estimates.
longDT[
  ,
  coefsplit(lm(value ~ ., data = .SD)),
  by = c("groupingvar", "variable")
]
# groupingvar variable var coef
# 1: a y1 (Intercept) -3.595564e+03
# 2: a y1 x1 -3.796627e+01
# 3: a y1 x2 -1.557268e+02
# 4: a y1 x3 2.862738e+02
# 5: a y1 x4 1.579548e+02
# ...
# 38: b y3 x2 2.136253e+01
# 39: b y3 x3 -3.810176e+01
# 40: b y3 x4 4.187719e+01
# 41: b y3 x5 -2.586184e+02
# 42: b y3 x6 1.181879e+02
# groupingvar variable var coef
我还找到了一种使用 cbind() 实现此目的的方法,如下所示:
library(tidyverse)
library(broom)
# Build the example data set for the cbind() approach: an integer grouping
# column (1 or 2), three responses (y1-y3) and six predictors (x1-x6).
# The random draws happen in the same order as the columns are listed.
n <- 20
df4 <- data.frame(
  groupingvar = sample(seq_len(2L), size = n, replace = TRUE),
  y1 = rnorm(n, mean = 10, sd = 1),
  y2 = rnorm(n, mean = 100, sd = 10),
  y3 = rnorm(n, mean = 1000, sd = 100),
  x1 = rnorm(n, mean = 10, sd = 1),
  x2 = rnorm(n, mean = 10, sd = 1),
  x3 = rnorm(n, mean = 10, sd = 1),
  x4 = rnorm(n, mean = 10, sd = 1),
  x5 = rnorm(n, mean = 10, sd = 1),
  x6 = rnorm(n, mean = 10, sd = 1)
)
# Sort the rows so that each group's observations are contiguous.
df4 <- df4 %>% arrange(groupingvar)
# Fit one multi-response model per group. cbind() binds the three outcomes
# into a matrix response; `. - groupingvar` uses every remaining column of the
# current group (`.`) as a predictor, excluding the grouping column itself.
regs <- df4 %>%
  group_by(groupingvar) %>%
  do(fit = lm(cbind(y1, y2, y3) ~ . - groupingvar, data = .))
# NOTE(review): tidy() applied to the result of do() relies on broom's tidiers
# for rowwise data frames; newer broom releases appear to have dropped them --
# confirm against the installed broom version.
coeffs <- regs %>% tidy(fit)