将 lm 的系数制表
Tabulate coefficients from lm
我有 10 个线性模型,我只需要一些信息,即:r 平方、p 值、斜率和截距系数。我设法提取了这些值(通过可笑地重复代码)。现在,我需要将这些值制成表格(列中的信息;列出线性模型 1-10 的结果的行)。谁能帮帮我吗?我还有数百个线性模型要做。我相信一定有办法。
代码:
d<-read.csv("example.csv",header=T)
# Subset data
A3G1 <- subset(d, CatChro=="A3G1"); A4G1 <- subset(d, CatChro=="A4G1")
A3G2 <- subset(d, CatChro=="A3G2"); A4G2 <- subset(d, CatChro=="A4G2")
A3G3 <- subset(d, CatChro=="A3G3"); A4G3 <- subset(d, CatChro=="A4G3")
A3G4 <- subset(d, CatChro=="A3G4"); A4G4 <- subset(d, CatChro=="A4G4")
A3G5 <- subset(d, CatChro=="A3G5"); A4G5 <- subset(d, CatChro=="A4G5")
A3D1 <- subset(d, CatChro=="A3D1"); A4D1 <- subset(d, CatChro=="A4D1")
A3D2 <- subset(d, CatChro=="A3D2"); A4D2 <- subset(d, CatChro=="A4D2")
A3D3 <- subset(d, CatChro=="A3D3"); A4D3 <- subset(d, CatChro=="A4D3")
A3D4 <- subset(d, CatChro=="A3D4"); A4D4 <- subset(d, CatChro=="A4D4")
A3D5 <- subset(d, CatChro=="A3D5"); A4D5 <- subset(d, CatChro=="A4D5")
# Fit individual lines
rA3G1 <- lm(Qend~Rainfall, data=A3G1); summary(rA3G1)
rA3D1 <- lm(Qend~Rainfall, data=A3D1); summary(rA3D1)
rA3G2 <- lm(Qend~Rainfall, data=A3G2); summary(rA3G2)
rA3D2 <- lm(Qend~Rainfall, data=A3D2); summary(rA3D2)
rA3G3 <- lm(Qend~Rainfall, data=A3G3); summary(rA3G3)
rA3D3 <- lm(Qend~Rainfall, data=A3D3); summary(rA3D3)
rA3G4 <- lm(Qend~Rainfall, data=A3G4); summary(rA3G4)
rA3D4 <- lm(Qend~Rainfall, data=A3D4); summary(rA3D4)
rA3G5 <- lm(Qend~Rainfall, data=A3G5); summary(rA3G5)
rA3D5 <- lm(Qend~Rainfall, data=A3D5); summary(rA3D5)
rA4G1 <- lm(Qend~Rainfall, data=A4G1); summary(rA4G1)
rA4D1 <- lm(Qend~Rainfall, data=A4D1); summary(rA4D1)
rA4G2 <- lm(Qend~Rainfall, data=A4G2); summary(rA4G2)
rA4D2 <- lm(Qend~Rainfall, data=A4D2); summary(rA4D2)
rA4G3 <- lm(Qend~Rainfall, data=A4G3); summary(rA4G3)
rA4D3 <- lm(Qend~Rainfall, data=A4D3); summary(rA4D3)
rA4G4 <- lm(Qend~Rainfall, data=A4G4); summary(rA4G4)
rA4D4 <- lm(Qend~Rainfall, data=A4D4); summary(rA4D4)
rA4G5 <- lm(Qend~Rainfall, data=A4G5); summary(rA4G5)
rA4D5 <- lm(Qend~Rainfall, data=A4D5); summary(rA4D5)
# Gradient
summary(rA3G1)$coefficients[2,1]
summary(rA3D1)$coefficients[2,1]
summary(rA3G2)$coefficients[2,1]
summary(rA3D2)$coefficients[2,1]
summary(rA3G3)$coefficients[2,1]
summary(rA3D3)$coefficients[2,1]
summary(rA3G4)$coefficients[2,1]
summary(rA3D4)$coefficients[2,1]
summary(rA3G5)$coefficients[2,1]
summary(rA3D5)$coefficients[2,1]
# Intercept
summary(rA3G1)$coefficients[2,2]
summary(rA3D1)$coefficients[2,2]
summary(rA3G2)$coefficients[2,2]
summary(rA3D2)$coefficients[2,2]
summary(rA3G3)$coefficients[2,2]
summary(rA3D3)$coefficients[2,2]
summary(rA3G4)$coefficients[2,2]
summary(rA3D4)$coefficients[2,2]
summary(rA3G5)$coefficients[2,2]
summary(rA3D5)$coefficients[2,2]
# r-sq
summary(rA3G1)$r.squared
summary(rA3D1)$r.squared
summary(rA3G2)$r.squared
summary(rA3D2)$r.squared
summary(rA3G3)$r.squared
summary(rA3D3)$r.squared
summary(rA3G4)$r.squared
summary(rA3D4)$r.squared
summary(rA3G5)$r.squared
summary(rA3D5)$r.squared
# adj r-sq
summary(rA3G1)$adj.r.squared
summary(rA3D1)$adj.r.squared
summary(rA3G2)$adj.r.squared
summary(rA3D2)$adj.r.squared
summary(rA3G3)$adj.r.squared
summary(rA3D3)$adj.r.squared
summary(rA3G4)$adj.r.squared
summary(rA3D4)$adj.r.squared
summary(rA3G5)$adj.r.squared
summary(rA3D5)$adj.r.squared
# p-level
p <- summary(rA3G1)$fstatistic
pf(p[1], p[2], p[3], lower.tail=FALSE)
p2 <- summary(rA3D1)$fstatistic
pf(p2[1], p2[2], p2[3], lower.tail=FALSE)
p3 <- summary(rA3G2)$fstatistic
pf(p3[1], p3[2], p3[3], lower.tail=FALSE)
p4 <- summary(rA3D2)$fstatistic
pf(p4[1], p4[2], p4[3], lower.tail=FALSE)
p5 <- summary(rA3G3)$fstatistic
pf(p5[1], p5[2], p5[3], lower.tail=FALSE)
p6 <- summary(rA3D3)$fstatistic
pf(p6[1], p6[2], p6[3], lower.tail=FALSE)
p7 <- summary(rA3G4)$fstatistic
pf(p7[1], p7[2], p7[3], lower.tail=FALSE)
p8 <- summary(rA3D4)$fstatistic
pf(p8[1], p8[2], p8[3], lower.tail=FALSE)
p9 <- summary(rA3G5)$fstatistic
pf(p9[1], p9[2], p9[3], lower.tail=FALSE)
p10 <- summary(rA3D5)$fstatistic
pf(p10[1], p10[2], p10[3], lower.tail=FALSE)
这是我预期结果的结构:
有什么办法可以实现吗?
使用library(data.table)
你可以做到
d <- fread("example.csv")
d[, .(
r2 = (fit <- summary(lm(Qend~Rainfall)))$r.squared,
adj.r2 = fit$adj.r.squared,
intercept = fit$coefficients[1,1],
gradient = fit$coefficients[2,1],
p.value = {p <- fit$fstatistic; pf(p[1], p[2], p[3], lower.tail=FALSE)}),
by = CatChro]
# CatChro r2 adj.r2 intercept gradient p.value
# 1: A3G1 0.03627553 0.011564648 0.024432020 0.0001147645 0.2329519751
# 2: A3D1 0.28069553 0.254054622 0.011876543 0.0004085644 0.0031181110
# 3: A3G2 0.06449971 0.041112205 0.026079409 0.0004583538 0.1045970987
# 4: A3D2 0.03384173 0.005425311 0.023601325 0.0005431693 0.2828170556
# 5: A3G3 0.07587433 0.054383038 0.043537869 0.0006964512 0.0670399684
# 6: A3D3 0.04285322 0.002972105 0.022106960 0.0001451185 0.3102578215
# 7: A3G4 0.17337420 0.155404076 0.023706652 0.0006442175 0.0032431299
# 8: A3D4 0.37219027 0.349768492 0.009301843 0.0006614213 0.0003442445
# 9: A3G5 0.17227383 0.150491566 0.025994831 0.0006658466 0.0077413595
#10: A3D5 0.04411669 -0.008987936 0.014341399 0.0001084626 0.3741011769
考虑构建一个 lm
结果矩阵。首先创建一个定义的函数来处理带有结果提取的广义数据框模型构建。然后,调用 by
可以按因子列对数据框进行子集化,并将每个子集传递给定义的方法。最后,rbind
将所有分组矩阵组合在一起以获得单一输出
lm_results <- function(df) {
model <- lm(Qend ~ Rainfall, data = df)
res <- summary(model)
p <- res$fstatistic
c(gradient = res$coefficients[2,1],
intercept = res$coefficients[2,2],
r_sq = res$r.squared,
adj_r_sq = res$adj.r.squared,
f_stat = p[['value']],
p_value = unname(pf(p[1], p[2], p[3], lower.tail=FALSE))
)
}
matrix_list <- by(d, d$group, lm_results)
final_matrix <- do.call(rbind, matrix_list)
在随机的种子数据上进行演示
set.seed(12262018)
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
d <- data.frame(
group = sample(data_tools, 500, replace=TRUE),
int = sample(1:15, 500, replace=TRUE),
Qend = rnorm(500) / 100,
Rainfall = rnorm(500) * 10
)
结果
mat_list <- by(d, d$group, lm_results)
final_matrix <- do.call(rbind, mat_list)
final_matrix
# gradient intercept r_sq adj_r_sq f_stat p_value
# julia -1.407313e-04 1.203832e-04 0.017219149 0.004619395 1.3666258 0.24595273
# python -1.438116e-04 1.125170e-04 0.018641512 0.007230367 1.6336233 0.20464162
# r 2.031717e-04 1.168037e-04 0.041432175 0.027738349 3.0256098 0.08635510
# sas -1.549510e-04 9.067337e-05 0.032476668 0.021355710 2.9203121 0.09103619
# spss 9.326656e-05 1.068516e-04 0.008583473 -0.002682623 0.7618853 0.38511469
# stata -7.079514e-05 1.024010e-04 0.006013841 -0.006568262 0.4779679 0.49137093
这是一个基本的 R 解决方案:
data <- read.csv("./data/so53933238.csv",header=TRUE)
# split by value of CatChro into a list of datasets
dataList <- split(data,data$CatChro)
# process the list with lm(), extract results to a data frame, write to a list
lmResults <- lapply(dataList,function(x){
y <- summary(lm(Qend ~ Rainfall,data = x))
Intercept <- y$coefficients[1,1]
Slope <- y$coefficients[2,1]
rSquared <- y$r.squared
adjRSquared <- y$adj.r.squared
f <- y$fstatistic[1]
pValue <- pf(y$fstatistic[1],y$fstatistic[2],y$fstatistic[3],lower.tail = FALSE)
data.frame(Slope,Intercept,rSquared,adjRSquared,pValue)
})
lmResultTable <- do.call(rbind,lmResults)
# add CatChro indicators
lmResultTable$catChro <- names(dataList)
lmResultTable
...输出:
> lmResultTable
Slope Intercept rSquared adjRSquared pValue catChro
A3D1 0.0004085644 0.011876543 0.28069553 0.254054622 0.0031181110 A3D1
A3D2 0.0005431693 0.023601325 0.03384173 0.005425311 0.2828170556 A3D2
A3D3 0.0001451185 0.022106960 0.04285322 0.002972105 0.3102578215 A3D3
A3D4 0.0006614213 0.009301843 0.37219027 0.349768492 0.0003442445 A3D4
A3D5 0.0001084626 0.014341399 0.04411669 -0.008987936 0.3741011769 A3D5
A3G1 0.0001147645 0.024432020 0.03627553 0.011564648 0.2329519751 A3G1
A3G2 0.0004583538 0.026079409 0.06449971 0.041112205 0.1045970987 A3G2
A3G3 0.0006964512 0.043537869 0.07587433 0.054383038 0.0670399684 A3G3
A3G4 0.0006442175 0.023706652 0.17337420 0.155404076 0.0032431299 A3G4
A3G5 0.0006658466 0.025994831 0.17227383 0.150491566 0.0077413595 A3G5
>
要在 HTML 中以表格格式呈现输出,可以使用 knitr::kable()
。
library(knitr)
kable(lmResultTable[1:5],row.names=TRUE,digits=5)
...在渲染 Markdown 后产生以下输出:
这里只有几行:
library(tidyverse)
library(broom)
# create grouped dataframe:
df_g <- df %>% group_by(CatChro)
df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>%
select(CatChro, term, estimate) %>% spread(term, estimate) %>%
left_join(df_g %>% do(glance(lm(Qend ~ Rainfall, data = .))) %>%
select(CatChro, r.squared, adj.r.squared, p.value), by = "CatChro")
结果将是:
# A tibble: 10 x 6
# Groups: CatChro [?]
CatChro `(Intercept)` Rainfall r.squared adj.r.squared p.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A3D1 0.0119 0.000409 0.281 0.254 0.00312
2 A3D2 0.0236 0.000543 0.0338 0.00543 0.283
3 A3D3 0.0221 0.000145 0.0429 0.00297 0.310
4 A3D4 0.00930 0.000661 0.372 0.350 0.000344
5 A3D5 0.0143 0.000108 0.0441 -0.00899 0.374
6 A3G1 0.0244 0.000115 0.0363 0.0116 0.233
7 A3G2 0.0261 0.000458 0.0645 0.0411 0.105
8 A3G3 0.0435 0.000696 0.0759 0.0544 0.0670
9 A3G4 0.0237 0.000644 0.173 0.155 0.00324
10 A3G5 0.0260 0.000666 0.172 0.150 0.00774
那么,这是如何工作的?
下面创建一个包含所有系数和相应统计信息的数据帧(tidy 将 lm 的结果转换为数据帧):
df_g %>%
do(tidy(lm(Qend ~ Rainfall, data = .)))
A tibble: 20 x 6
Groups: CatChro [10]
CatChro term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A3D1 (Intercept) 0.0119 0.00358 3.32 0.00258
2 A3D1 Rainfall 0.000409 0.000126 3.25 0.00312
3 A3D2 (Intercept) 0.0236 0.00928 2.54 0.0157
4 A3D2 Rainfall 0.000543 0.000498 1.09 0.283
我知道您想将 Rainfall 的截距和系数作为单独的列,所以让我们 "spread" 把它们拿出来。这是通过首先选择相关列,然后调用 tidyr::spread
来实现的,如
select(CatChro, term, estimate) %>% spread(term, estimate)
这给你:
df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>%
select(CatChro, term, estimate) %>% spread(term, estimate)
A tibble: 10 x 3
Groups: CatChro [10]
CatChro `(Intercept)` Rainfall
<chr> <dbl> <dbl>
1 A3D1 0.0119 0.000409
2 A3D2 0.0236 0.000543
3 A3D3 0.0221 0.000145
4 A3D4 0.00930 0.000661
Glance 为您提供了您正在寻找的每个模型的汇总统计数据。这些模型按组索引,这里是 CatChro,因此很容易将它们合并到前面的数据帧中,这就是其余代码的内容。
另一个解决方案,lme4::lmList
。由 lmList
生成的对象的 summary()
方法几乎可以满足您的所有需求(尽管它不存储 p 值,但这是我必须在下面添加的内容)。
m <- lme4::lmList(Qend~Rainfall|CatChro,data=d)
s <- summary(m)
pvals <- apply(s$fstatistic,1,function(x) pf(x[1],x[2],x[3],lower.tail=FALSE))
data.frame(intercept=coef(s)[,"Estimate","(Intercept)"],
slope=coef(s)[,"Estimate","Rainfall"],
r.squared=s$r.squared,
adj.r.squared=unlist(s$adj.r.squared),
p.value=pvals)
我有 10 个线性模型,我只需要一些信息,即:r 平方、p 值、斜率和截距系数。我设法提取了这些值(通过可笑地重复代码)。现在,我需要将这些值制成表格(列中的信息;列出线性模型 1-10 的结果的行)。谁能帮帮我吗?我还有数百个线性模型要做。我相信一定有办法。
代码:
d<-read.csv("example.csv",header=T)
# Subset data
A3G1 <- subset(d, CatChro=="A3G1"); A4G1 <- subset(d, CatChro=="A4G1")
A3G2 <- subset(d, CatChro=="A3G2"); A4G2 <- subset(d, CatChro=="A4G2")
A3G3 <- subset(d, CatChro=="A3G3"); A4G3 <- subset(d, CatChro=="A4G3")
A3G4 <- subset(d, CatChro=="A3G4"); A4G4 <- subset(d, CatChro=="A4G4")
A3G5 <- subset(d, CatChro=="A3G5"); A4G5 <- subset(d, CatChro=="A4G5")
A3D1 <- subset(d, CatChro=="A3D1"); A4D1 <- subset(d, CatChro=="A4D1")
A3D2 <- subset(d, CatChro=="A3D2"); A4D2 <- subset(d, CatChro=="A4D2")
A3D3 <- subset(d, CatChro=="A3D3"); A4D3 <- subset(d, CatChro=="A4D3")
A3D4 <- subset(d, CatChro=="A3D4"); A4D4 <- subset(d, CatChro=="A4D4")
A3D5 <- subset(d, CatChro=="A3D5"); A4D5 <- subset(d, CatChro=="A4D5")
# Fit individual lines
rA3G1 <- lm(Qend~Rainfall, data=A3G1); summary(rA3G1)
rA3D1 <- lm(Qend~Rainfall, data=A3D1); summary(rA3D1)
rA3G2 <- lm(Qend~Rainfall, data=A3G2); summary(rA3G2)
rA3D2 <- lm(Qend~Rainfall, data=A3D2); summary(rA3D2)
rA3G3 <- lm(Qend~Rainfall, data=A3G3); summary(rA3G3)
rA3D3 <- lm(Qend~Rainfall, data=A3D3); summary(rA3D3)
rA3G4 <- lm(Qend~Rainfall, data=A3G4); summary(rA3G4)
rA3D4 <- lm(Qend~Rainfall, data=A3D4); summary(rA3D4)
rA3G5 <- lm(Qend~Rainfall, data=A3G5); summary(rA3G5)
rA3D5 <- lm(Qend~Rainfall, data=A3D5); summary(rA3D5)
rA4G1 <- lm(Qend~Rainfall, data=A4G1); summary(rA4G1)
rA4D1 <- lm(Qend~Rainfall, data=A4D1); summary(rA4D1)
rA4G2 <- lm(Qend~Rainfall, data=A4G2); summary(rA4G2)
rA4D2 <- lm(Qend~Rainfall, data=A4D2); summary(rA4D2)
rA4G3 <- lm(Qend~Rainfall, data=A4G3); summary(rA4G3)
rA4D3 <- lm(Qend~Rainfall, data=A4D3); summary(rA4D3)
rA4G4 <- lm(Qend~Rainfall, data=A4G4); summary(rA4G4)
rA4D4 <- lm(Qend~Rainfall, data=A4D4); summary(rA4D4)
rA4G5 <- lm(Qend~Rainfall, data=A4G5); summary(rA4G5)
rA4D5 <- lm(Qend~Rainfall, data=A4D5); summary(rA4D5)
# Gradient
summary(rA3G1)$coefficients[2,1]
summary(rA3D1)$coefficients[2,1]
summary(rA3G2)$coefficients[2,1]
summary(rA3D2)$coefficients[2,1]
summary(rA3G3)$coefficients[2,1]
summary(rA3D3)$coefficients[2,1]
summary(rA3G4)$coefficients[2,1]
summary(rA3D4)$coefficients[2,1]
summary(rA3G5)$coefficients[2,1]
summary(rA3D5)$coefficients[2,1]
# Intercept
summary(rA3G1)$coefficients[2,2]
summary(rA3D1)$coefficients[2,2]
summary(rA3G2)$coefficients[2,2]
summary(rA3D2)$coefficients[2,2]
summary(rA3G3)$coefficients[2,2]
summary(rA3D3)$coefficients[2,2]
summary(rA3G4)$coefficients[2,2]
summary(rA3D4)$coefficients[2,2]
summary(rA3G5)$coefficients[2,2]
summary(rA3D5)$coefficients[2,2]
# r-sq
summary(rA3G1)$r.squared
summary(rA3D1)$r.squared
summary(rA3G2)$r.squared
summary(rA3D2)$r.squared
summary(rA3G3)$r.squared
summary(rA3D3)$r.squared
summary(rA3G4)$r.squared
summary(rA3D4)$r.squared
summary(rA3G5)$r.squared
summary(rA3D5)$r.squared
# adj r-sq
summary(rA3G1)$adj.r.squared
summary(rA3D1)$adj.r.squared
summary(rA3G2)$adj.r.squared
summary(rA3D2)$adj.r.squared
summary(rA3G3)$adj.r.squared
summary(rA3D3)$adj.r.squared
summary(rA3G4)$adj.r.squared
summary(rA3D4)$adj.r.squared
summary(rA3G5)$adj.r.squared
summary(rA3D5)$adj.r.squared
# p-level
p <- summary(rA3G1)$fstatistic
pf(p[1], p[2], p[3], lower.tail=FALSE)
p2 <- summary(rA3D1)$fstatistic
pf(p2[1], p2[2], p2[3], lower.tail=FALSE)
p3 <- summary(rA3G2)$fstatistic
pf(p3[1], p3[2], p3[3], lower.tail=FALSE)
p4 <- summary(rA3D2)$fstatistic
pf(p4[1], p4[2], p4[3], lower.tail=FALSE)
p5 <- summary(rA3G3)$fstatistic
pf(p5[1], p5[2], p5[3], lower.tail=FALSE)
p6 <- summary(rA3D3)$fstatistic
pf(p6[1], p6[2], p6[3], lower.tail=FALSE)
p7 <- summary(rA3G4)$fstatistic
pf(p7[1], p7[2], p7[3], lower.tail=FALSE)
p8 <- summary(rA3D4)$fstatistic
pf(p8[1], p8[2], p8[3], lower.tail=FALSE)
p9 <- summary(rA3G5)$fstatistic
pf(p9[1], p9[2], p9[3], lower.tail=FALSE)
p10 <- summary(rA3D5)$fstatistic
pf(p10[1], p10[2], p10[3], lower.tail=FALSE)
这是我预期结果的结构:
有什么办法可以实现吗?
使用library(data.table)
你可以做到
d <- fread("example.csv")
d[, .(
r2 = (fit <- summary(lm(Qend~Rainfall)))$r.squared,
adj.r2 = fit$adj.r.squared,
intercept = fit$coefficients[1,1],
gradient = fit$coefficients[2,1],
p.value = {p <- fit$fstatistic; pf(p[1], p[2], p[3], lower.tail=FALSE)}),
by = CatChro]
# CatChro r2 adj.r2 intercept gradient p.value
# 1: A3G1 0.03627553 0.011564648 0.024432020 0.0001147645 0.2329519751
# 2: A3D1 0.28069553 0.254054622 0.011876543 0.0004085644 0.0031181110
# 3: A3G2 0.06449971 0.041112205 0.026079409 0.0004583538 0.1045970987
# 4: A3D2 0.03384173 0.005425311 0.023601325 0.0005431693 0.2828170556
# 5: A3G3 0.07587433 0.054383038 0.043537869 0.0006964512 0.0670399684
# 6: A3D3 0.04285322 0.002972105 0.022106960 0.0001451185 0.3102578215
# 7: A3G4 0.17337420 0.155404076 0.023706652 0.0006442175 0.0032431299
# 8: A3D4 0.37219027 0.349768492 0.009301843 0.0006614213 0.0003442445
# 9: A3G5 0.17227383 0.150491566 0.025994831 0.0006658466 0.0077413595
#10: A3D5 0.04411669 -0.008987936 0.014341399 0.0001084626 0.3741011769
考虑构建一个 lm
结果矩阵。首先创建一个定义的函数来处理带有结果提取的广义数据框模型构建。然后,调用 by
可以按因子列对数据框进行子集化,并将每个子集传递给定义的方法。最后,rbind
将所有分组矩阵组合在一起以获得单一输出
lm_results <- function(df) {
model <- lm(Qend ~ Rainfall, data = df)
res <- summary(model)
p <- res$fstatistic
c(gradient = res$coefficients[2,1],
intercept = res$coefficients[2,2],
r_sq = res$r.squared,
adj_r_sq = res$adj.r.squared,
f_stat = p[['value']],
p_value = unname(pf(p[1], p[2], p[3], lower.tail=FALSE))
)
}
matrix_list <- by(d, d$group, lm_results)
final_matrix <- do.call(rbind, matrix_list)
在随机的种子数据上进行演示
set.seed(12262018)
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
d <- data.frame(
group = sample(data_tools, 500, replace=TRUE),
int = sample(1:15, 500, replace=TRUE),
Qend = rnorm(500) / 100,
Rainfall = rnorm(500) * 10
)
结果
mat_list <- by(d, d$group, lm_results)
final_matrix <- do.call(rbind, mat_list)
final_matrix
# gradient intercept r_sq adj_r_sq f_stat p_value
# julia -1.407313e-04 1.203832e-04 0.017219149 0.004619395 1.3666258 0.24595273
# python -1.438116e-04 1.125170e-04 0.018641512 0.007230367 1.6336233 0.20464162
# r 2.031717e-04 1.168037e-04 0.041432175 0.027738349 3.0256098 0.08635510
# sas -1.549510e-04 9.067337e-05 0.032476668 0.021355710 2.9203121 0.09103619
# spss 9.326656e-05 1.068516e-04 0.008583473 -0.002682623 0.7618853 0.38511469
# stata -7.079514e-05 1.024010e-04 0.006013841 -0.006568262 0.4779679 0.49137093
这是一个基本的 R 解决方案:
data <- read.csv("./data/so53933238.csv",header=TRUE)
# split by value of CatChro into a list of datasets
dataList <- split(data,data$CatChro)
# process the list with lm(), extract results to a data frame, write to a list
lmResults <- lapply(dataList,function(x){
y <- summary(lm(Qend ~ Rainfall,data = x))
Intercept <- y$coefficients[1,1]
Slope <- y$coefficients[2,1]
rSquared <- y$r.squared
adjRSquared <- y$adj.r.squared
f <- y$fstatistic[1]
pValue <- pf(y$fstatistic[1],y$fstatistic[2],y$fstatistic[3],lower.tail = FALSE)
data.frame(Slope,Intercept,rSquared,adjRSquared,pValue)
})
lmResultTable <- do.call(rbind,lmResults)
# add CatChro indicators
lmResultTable$catChro <- names(dataList)
lmResultTable
...输出:
> lmResultTable
Slope Intercept rSquared adjRSquared pValue catChro
A3D1 0.0004085644 0.011876543 0.28069553 0.254054622 0.0031181110 A3D1
A3D2 0.0005431693 0.023601325 0.03384173 0.005425311 0.2828170556 A3D2
A3D3 0.0001451185 0.022106960 0.04285322 0.002972105 0.3102578215 A3D3
A3D4 0.0006614213 0.009301843 0.37219027 0.349768492 0.0003442445 A3D4
A3D5 0.0001084626 0.014341399 0.04411669 -0.008987936 0.3741011769 A3D5
A3G1 0.0001147645 0.024432020 0.03627553 0.011564648 0.2329519751 A3G1
A3G2 0.0004583538 0.026079409 0.06449971 0.041112205 0.1045970987 A3G2
A3G3 0.0006964512 0.043537869 0.07587433 0.054383038 0.0670399684 A3G3
A3G4 0.0006442175 0.023706652 0.17337420 0.155404076 0.0032431299 A3G4
A3G5 0.0006658466 0.025994831 0.17227383 0.150491566 0.0077413595 A3G5
>
要在 HTML 中以表格格式呈现输出,可以使用 knitr::kable()
。
library(knitr)
kable(lmResultTable[1:5],row.names=TRUE,digits=5)
...在渲染 Markdown 后产生以下输出:
这里只有几行:
library(tidyverse)
library(broom)
# create grouped dataframe:
df_g <- df %>% group_by(CatChro)
df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>%
select(CatChro, term, estimate) %>% spread(term, estimate) %>%
left_join(df_g %>% do(glance(lm(Qend ~ Rainfall, data = .))) %>%
select(CatChro, r.squared, adj.r.squared, p.value), by = "CatChro")
结果将是:
# A tibble: 10 x 6
# Groups: CatChro [?]
CatChro `(Intercept)` Rainfall r.squared adj.r.squared p.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A3D1 0.0119 0.000409 0.281 0.254 0.00312
2 A3D2 0.0236 0.000543 0.0338 0.00543 0.283
3 A3D3 0.0221 0.000145 0.0429 0.00297 0.310
4 A3D4 0.00930 0.000661 0.372 0.350 0.000344
5 A3D5 0.0143 0.000108 0.0441 -0.00899 0.374
6 A3G1 0.0244 0.000115 0.0363 0.0116 0.233
7 A3G2 0.0261 0.000458 0.0645 0.0411 0.105
8 A3G3 0.0435 0.000696 0.0759 0.0544 0.0670
9 A3G4 0.0237 0.000644 0.173 0.155 0.00324
10 A3G5 0.0260 0.000666 0.172 0.150 0.00774
那么,这是如何工作的?
下面创建一个包含所有系数和相应统计信息的数据帧(tidy 将 lm 的结果转换为数据帧):
df_g %>%
do(tidy(lm(Qend ~ Rainfall, data = .)))
A tibble: 20 x 6
Groups: CatChro [10]
CatChro term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A3D1 (Intercept) 0.0119 0.00358 3.32 0.00258
2 A3D1 Rainfall 0.000409 0.000126 3.25 0.00312
3 A3D2 (Intercept) 0.0236 0.00928 2.54 0.0157
4 A3D2 Rainfall 0.000543 0.000498 1.09 0.283
我知道您想将 Rainfall 的截距和系数作为单独的列,所以让我们 "spread" 把它们拿出来。这是通过首先选择相关列,然后调用 tidyr::spread
来实现的,如
select(CatChro, term, estimate) %>% spread(term, estimate)
这给你:
df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>%
select(CatChro, term, estimate) %>% spread(term, estimate)
A tibble: 10 x 3
Groups: CatChro [10]
CatChro `(Intercept)` Rainfall
<chr> <dbl> <dbl>
1 A3D1 0.0119 0.000409
2 A3D2 0.0236 0.000543
3 A3D3 0.0221 0.000145
4 A3D4 0.00930 0.000661
Glance 为您提供了您正在寻找的每个模型的汇总统计数据。这些模型按组索引,这里是 CatChro,因此很容易将它们合并到前面的数据帧中,这就是其余代码的内容。
另一个解决方案,lme4::lmList
。由 lmList
生成的对象的 summary()
方法几乎可以满足您的所有需求(尽管它不存储 p 值,但这是我必须在下面添加的内容)。
m <- lme4::lmList(Qend~Rainfall|CatChro,data=d)
s <- summary(m)
pvals <- apply(s$fstatistic,1,function(x) pf(x[1],x[2],x[3],lower.tail=FALSE))
data.frame(intercept=coef(s)[,"Estimate","(Intercept)"],
slope=coef(s)[,"Estimate","Rainfall"],
r.squared=s$r.squared,
adj.r.squared=unlist(s$adj.r.squared),
p.value=pvals)