将回归模型拟合到多个自变量和因变量,并通过分组变量获得单独的拟合

Fit regression model to multiple independent and dependent variables and obtain separate fits by grouping variable

让我们再试一次....

我想使用 mtcars 数据集将非线性回归模型拟合到使用同一模型的多个因变量和自变量。假设我想使用变量 disp、hp 和 wt 来解释 mpg 和 drat。拟合模型后,我想计算总平方和和残差平方和并将它们存储在矩阵中。这可以通过以下方式长期完成...

dt <- data.frame(mtcars)
m1 <- nls(mpg ~ B0*(disp^B1)*exp(B2*disp), data=dt, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m2 <- nls(mpg ~ B0*(hp^B1)*exp(B2*hp), data=dt, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m3 <- nls(mpg ~ B0*(wt^B1)*exp(B2*wt), data=dt, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m4 <- nls(drat ~ B0*(disp^B1)*exp(B2*disp), data=dt, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m5 <- nls(drat ~ B0*(hp^B1)*exp(B2*hp), data=dt, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m6 <- nls(drat ~ B0*(wt^B1)*exp(B2*wt), data=dt, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
TSS.mpg <- sum((dt$mpg - mean(dt$mpg))^2)
TSS.drat <- sum((dt$drat - mean(dt$drat))^2)
RSS.m1 <- sum(residuals(m1)^2)
RSS.m2 <- sum(residuals(m2)^2)
RSS.m3 <- sum(residuals(m3)^2)
RSS.m4 <- sum(residuals(m4)^2)
RSS.m5 <- sum(residuals(m5)^2)
RSS.m6 <- sum(residuals(m6)^2)

sumsqu <- matrix(0,6,2)
sumsqu[1:3,1] <- TSS.mpg
sumsqu[4:6,1] <- TSS.drat
sumsqu[1,2] <- RSS.m1
sumsqu[2,2] <- RSS.m2
sumsqu[3,2] <- RSS.m3
sumsqu[4,2] <- RSS.m4
sumsqu[5,2] <- RSS.m5
sumsqu[6,2] <- RSS.m6

所以,最后的结果是一个矩阵,第1列是总平方和,第2列是残差平方和。现在,让我们通过包含分组因素来使其变得更加复杂。我想做相同的模型拟合和 SS 提取,但是对于基于变量 "am" 的两组,其中 am=0 或 1。最终结果将是一个类似于第 1 部分但有四列的矩阵, am=0 的前 2 列和 am=1 的后 2 列。同样,这可以通过以下方式长期完成...

#subset the data (am = 0) and refit models
dt0 <- subset(dt, am == 0)
m1.0 <- nls(mpg ~ B0*(disp^B1)*exp(B2*disp), data=dt0, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m2.0 <- nls(mpg ~ B0*(hp^B1)*exp(B2*hp), data=dt0, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m3.0 <- nls(mpg ~ B0*(wt^B1)*exp(B2*wt), data=dt0, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m4.0 <- nls(drat ~ B0*(disp^B1)*exp(B2*disp), data=dt0, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m5.0 <- nls(drat ~ B0*(hp^B1)*exp(B2*hp), data=dt0, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m6.0 <- nls(drat ~ B0*(wt^B1)*exp(B2*wt), data=dt0, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
TSS.mpg.0 <- sum((dt0$mpg - mean(dt0$mpg))^2)
TSS.drat.0 <- sum((dt0$drat - mean(dt0$drat))^2)
RSS.m1.0 <- sum(residuals(m1.0)^2)
RSS.m2.0 <- sum(residuals(m2.0)^2)
RSS.m3.0 <- sum(residuals(m3.0)^2)
RSS.m4.0 <- sum(residuals(m4.0)^2)
RSS.m5.0 <- sum(residuals(m5.0)^2)
RSS.m6.0 <- sum(residuals(m6.0)^2)

sumsqu.0 <- matrix(0,6,2)
sumsqu.0[1:3,1] <- TSS.mpg.0
sumsqu.0[4:6,1] <- TSS.drat.0
sumsqu.0[1,2] <- RSS.m1.0
sumsqu.0[2,2] <- RSS.m2.0
sumsqu.0[3,2] <- RSS.m3.0
sumsqu.0[4,2] <- RSS.m4.0
sumsqu.0[5,2] <- RSS.m5.0
sumsqu.0[6,2] <- RSS.m6.0

#subset the data (am=1) and refit models
dt1 <- subset(dt, am == 1)
m1.1 <- nls(mpg ~ B0*(disp^B1)*exp(B2*disp), data=dt1, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m2.1 <- nls(mpg ~ B0*(hp^B1)*exp(B2*hp), data=dt1, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m3.1 <- nls(mpg ~ B0*(wt^B1)*exp(B2*wt), data=dt1, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m4.1 <- nls(drat ~ B0*(disp^B1)*exp(B2*disp), data=dt1, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m5.1 <- nls(drat ~ B0*(hp^B1)*exp(B2*hp), data=dt1, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
m6.1 <- nls(drat ~ B0*(wt^B1)*exp(B2*wt), data=dt1, start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
TSS.mpg.1 <- sum((dt1$mpg - mean(dt1$mpg))^2)
TSS.drat.1 <- sum((dt1$drat - mean(dt1$drat))^2)
RSS.m1.1 <- sum(residuals(m1.1)^2)
RSS.m2.1 <- sum(residuals(m2.1)^2)
RSS.m3.1 <- sum(residuals(m3.1)^2)
RSS.m4.1 <- sum(residuals(m4.1)^2)
RSS.m5.1 <- sum(residuals(m5.1)^2)
RSS.m6.1 <- sum(residuals(m6.1)^2)

sumsqu.1 <- matrix(0,6,2)
sumsqu.1[1:3,1] <- TSS.mpg.1
sumsqu.1[4:6,1] <- TSS.drat.1
sumsqu.1[1,2] <- RSS.m1.1
sumsqu.1[2,2] <- RSS.m2.1
sumsqu.1[3,2] <- RSS.m3.1
sumsqu.1[4,2] <- RSS.m4.1
sumsqu.1[5,2] <- RSS.m5.1
sumsqu.1[6,2] <- RSS.m6.1

#combine sumsqu.1 and sumsqu.0
allSS <- cbind(sumsqu.0,sumsqu.1)
allSS

如您所见,这个过程变得相当冗长,我知道该怎么做。现在想象一下我的真实问题有 6 个因变量、7 个自变量、5 个组,并从每个拟合中提取 10 个左右的数字。从我的代码中你可以看出我不是程序员,因为我的方法是高度 inefficient.I 认为我可以包含某种功能然后使用一些应用功能,例如..

nls1 <- function(x,y){
m1 <- nls( y ~ B0*(x^B1)*exp(B2*x), data=dt0, start=c(B0 = 3.5, B1 = 0.2, B2 = 0.0007))
RSS <- sum(residuals(m1)^2)
TSS <- sum((y - mean(y))^2)
RSS
TSS
}

非常感谢任何帮助提高此过程效率的帮助。

这里我使用了 2 个因变量(drat、mpg)、3 个自变量(disp、hp、wt)和 1 个分组变量 2 levels/classes(am 为 1/0)。

library(dplyr)
library(tidyr)

# example dataset (picking useful columns)
dt <- data.frame(mtcars) %>% select(drat, mpg, disp, hp, wt, am)

# specify which columns we want as y (dependent) and x (independent)
# grouping variable is specified within the dependent variables
ynames <- c("drat","mpg","am")
xnames <- c("disp","hp","wt")

# create and reshape datasets
dt1 <- dt[,ynames]
dt1 <- gather(dt1, y, yvalue, -am)

dt2 <- dt[,xnames]
dt2 <- gather(dt2, x, xvalue)


dt1 %>% 
  group_by(y) %>%                 
  do(data.frame(.,dt2)) %>%
  group_by(y,x,am) %>% 
  do({ m1 <- nls( yvalue ~ B0*(xvalue^B1)*exp(B2*xvalue), data=., start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
  RSS <- sum(residuals(m1)^2)
  TSS <- sum((.$yvalue - mean(.$yvalue))^2)
  data.frame(RSS,TSS) })

#       y    x am         RSS        TSS
# 1  drat disp  0   1.3090406   2.770242
# 2  drat disp  1   1.1155372   1.590400
# 3  drat   hp  0   2.1707337   2.770242
# 4  drat   hp  1   0.8342527   1.590400
# 5  drat   wt  0   2.2100162   2.770242
# 6  drat   wt  1   1.1885811   1.590400
# 7   mpg disp  0  98.4815286 264.587368
# 8   mpg disp  1  46.8674036 456.309231
# 9   mpg   hp  0  74.9295161 264.587368
# 10  mpg   hp  1 112.5548955 456.309231
# 11  mpg   wt  0 104.2894519 264.587368
# 12  mpg   wt  1  71.1402536 456.309231

如您所见,上面的方法重塑了数据并创建了一个更大的数据集,其中包含所有需要的 y 和 x 变量组合。如果你最终拥有一个庞大的数据集,你可能会遇到问题。或者可能遇到类似问题的其他人需要处理长度较大的变量并创建该大数据集会产生问题。

最好为每个模型拟合创建我们需要的公式,而不是创建变量组合。这种方法类似于@BondedDust 在下面建议的方法。

library(dplyr)

# example dataset (picking useful columns)
dt <- data.frame(mtcars) %>% select(drat, mpg, disp, hp, wt, am)

# specify which columns we want as y (dependent) and x (independent)
ynames <- c("drat","mpg")
xnames <- c("disp","hp","wt")

# get unique values of the grouping variable am
groupvalues = unique(dt$am)


expand.grid(ynames,xnames,groupvalues) %>%
  data.frame() %>%
  select(y=Var1, x=Var2, group=Var3) %>%
  mutate(formula = paste0(y," ~ B0*(",x,"^B1)*exp(B2*",x,")")) %>%
  group_by(y,x,group,formula) %>%
  do({ m1 <- nls( .$formula, data=dt[dt$am==.$group,], 
                  start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
  RSS <- sum(residuals(m1)^2)
  TSS <- sum((dt[dt$am==.$group,][,.$y]- mean(dt[dt$am==.$group,][,.$y]))^2)
  data.frame(RSS,TSS) })


#       y    x group                          formula         RSS        TSS
# 1  drat disp     0 drat ~ B0*(disp^B1)*exp(B2*disp)   1.3090406   2.770242
# 2  drat disp     1 drat ~ B0*(disp^B1)*exp(B2*disp)   1.1155372   1.590400
# 3  drat   hp     0     drat ~ B0*(hp^B1)*exp(B2*hp)   2.1707337   2.770242
# 4  drat   hp     1     drat ~ B0*(hp^B1)*exp(B2*hp)   0.8342527   1.590400
# 5  drat   wt     0     drat ~ B0*(wt^B1)*exp(B2*wt)   2.2100162   2.770242
# 6  drat   wt     1     drat ~ B0*(wt^B1)*exp(B2*wt)   1.1885811   1.590400
# 7   mpg disp     0  mpg ~ B0*(disp^B1)*exp(B2*disp)  98.4815286 264.587368
# 8   mpg disp     1  mpg ~ B0*(disp^B1)*exp(B2*disp)  46.8674036 456.309231
# 9   mpg   hp     0      mpg ~ B0*(hp^B1)*exp(B2*hp)  74.9295161 264.587368
# 10  mpg   hp     1      mpg ~ B0*(hp^B1)*exp(B2*hp) 112.5548955 456.309231
# 11  mpg   wt     0      mpg ~ B0*(wt^B1)*exp(B2*wt) 104.2894519 264.587368
# 12  mpg   wt     1      mpg ~ B0*(wt^B1)*exp(B2*wt)  71.1402536 456.309231

您可以尝试类似的方法:

vars <- expand.grid( Y = c('a1','a2','a3'), X=c('b1','b2','b3','b4'))
models_list <-  lapply( apply(vars, 1, 
                                 function(x) as.formula(paste(x[1], x[2], sep= "~") ) ),   
                        function(form) summary(lm(form=form, data= your_df) )
                        )

aov 代替 lm 可能会给您带来更多您喜欢的东西。请参阅 ?summary.aov 并尝试其中的示例,以了解无论您的最终目的是什么,都需要哪些组件。