为什么 ggplot2 95%CI 和手动计算的预测 95%CI 不同?
why do ggplot2 95%CI and prediction 95%CI calculated manually differ?
我想知道为什么从线性混合效应模型计算 95% 置信带时 ggplot2 会产生比手动计算时更窄的带,例如在这里遵循 Ben Bolker 的方法 confidence intervals on predictions。也就是说,ggplot2 是否给出了不准确的模型表示?
这是一个使用 sleepstudy 数据集的可重现示例(修改为在结构上类似于我正在处理的 df):
data("sleepstudy") # load dataset
height <- seq(165, 185, length.out = 18) # create vector called height
Treatment <- rep(c("Control", "Drug"), 9) # create vector called treatment
Subject <- levels(sleepstudy$Subject) # get vector of Subject
ht.subject <- data.frame(height, Subject, Treatment)
sleepstudy <- dplyr::left_join(sleepstudy, ht.subject, by="Subject") # Append df so that each subject has its own height and treatment
sleepstudy$Treatment <- as.factor(sleepstudy$Treatment)
生成模型,将预测添加到原始 df,然后绘图
m.sleep <- lmer(Reaction ~ Treatment*height + (1 + Days|Subject), data=sleepstudy)
sleepstudy$pred <- predict(m.sleep)
ggplot(sleepstudy, aes(height, pred, col=Treatment)) + geom_smooth(method="lm")[2]
按照 Bolker 方法计算置信区间
newdf <- expand.grid(height=seq(165, 185, 1),
Treatment=c("Control","Drug"))
newdf$Reaction <- predict(m.sleep, newdf, re.form=NA)
modmat <- model.matrix(terms(m.sleep), newdf)
pvar1 <- diag(modmat %*% tcrossprod(vcov(m.sleep), modmat))
tvar1 <- pvar1+VarCorr(m.sleep)$Subject[1]
cmult <- 1.96
newdf <- data.frame(newdf
,plo = newdf$Reaction-cmult*sqrt(pvar1)
,phi = newdf$Reaction+cmult*sqrt(pvar1)
,tlo = newdf$Reaction-cmult*sqrt(tvar1)
,thi = newdf$Reaction+cmult*sqrt(tvar1))
# plot confidence intervals
ggplot(newdf, aes(x=height, y=Reaction, colour=Treatment)) +
geom_point() +
geom_ribbon(aes(ymin=plo, ymax=phi, fill=Treatment), alpha=0.4)[2]
经过一些调整,这似乎是一致的。置信区间确实更大,但不会大很多。请记住,ggplot 适合 非常 不同的模型;它通过忽略 (1) 重复测量和 (2) 日效应的处理来拟合单独的线性(非线性混合)模型。
拟合随机斜率但没有人口水平斜率(e.g.see here)的模型似乎很奇怪,所以我添加了Days
的固定效应:
m.sleep <- lmer(Reaction ~ Treatment*height + Days +
(1 + Days|Subject),
data=sleepstudy)
我稍微重新组织了绘图代码:
theme_set(theme_bw())
gg0 <- ggplot(sleepstudy, aes(height, colour=Treatment)) +
geom_point(aes(y=Reaction))+
geom_smooth(aes(y=pred), method="lm")
- 如果你想计算置信区间(这与
lm()
/ggplot2
所做的相当),那么你可能应该 而不是 添加VarCorr(m.sleep)$Subject[1]
到方差(来自 FAQ example 的 tvar1
变量用于创建 预测区间 而不是置信区间 ...)
- 因为我在上面的模型中有
Days
,所以我将 mean(sleepstudy$Days)
添加到预测数据框。
newdf <- expand.grid(height=seq(165, 185, 1),
Treatment=c("Control","Drug"),
Days=mean(sleepstudy$Days))
newdf$Reaction <- newdf$pred <- predict(m.sleep, newdf, re.form=NA)
modmat <- model.matrix(terms(m.sleep), newdf)
pvar1 <- diag(modmat %*% tcrossprod(vcov(m.sleep), modmat))
tvar1 <- pvar1
cmult <- 1.96
newdf <- data.frame(newdf
,plo = newdf$Reaction-cmult*sqrt(pvar1)
,phi = newdf$Reaction+cmult*sqrt(pvar1)
,tlo = newdf$Reaction-cmult*sqrt(tvar1)
,thi = newdf$Reaction+cmult*sqrt(tvar1))
gg0 +
geom_point(data=newdf,aes(y=Reaction)) +
geom_ribbon(data=newdf,
aes(ymin=plo, ymax=phi, fill=Treatment), alpha=0.4,
colour=NA)
与估计的斜率和标准误差进行比较:
m0 <- lm(Reaction~height*Treatment,sleepstudy)
ff <- function(m) {
print(coef(summary(m))[-1,c("Estimate","Std. Error")],digits=2)
}
> ff(m0)
## Estimate Std. Error
## height -0.3 0.94
## TreatmentDrug -602.2 234.01
## height:TreatmentDrug 3.5 1.34
ff(m.sleep)
## Estimate Std. Error
## TreatmentDrug -55.03 425.3
## height 0.41 1.7
## Days 10.47 1.5
## TreatmentDrug:height 0.33 2.4
这看起来 consistent/about 正确:混合模型给出了相对于高度和 height:treatment 相互作用的斜率更大的标准误差。 (TreatmentDrug
的主要效果看起来很疯狂,因为它们是 height==0
治疗的预期效果 ...)
作为交叉检查,我可以从 sjPlot::plot_model()
...
得到类似的答案
library(sjPlot)
plot_model(m.sleep, type="pred", terms=c("height","Treatment"))
我想知道为什么从线性混合效应模型计算 95% 置信带时 ggplot2 会产生比手动计算时更窄的带,例如在这里遵循 Ben Bolker 的方法 confidence intervals on predictions。也就是说,ggplot2 是否给出了不准确的模型表示?
这是一个使用 sleepstudy 数据集的可重现示例(修改为在结构上类似于我正在处理的 df):
data("sleepstudy") # load dataset
height <- seq(165, 185, length.out = 18) # create vector called height
Treatment <- rep(c("Control", "Drug"), 9) # create vector called treatment
Subject <- levels(sleepstudy$Subject) # get vector of Subject
ht.subject <- data.frame(height, Subject, Treatment)
sleepstudy <- dplyr::left_join(sleepstudy, ht.subject, by="Subject") # Append df so that each subject has its own height and treatment
sleepstudy$Treatment <- as.factor(sleepstudy$Treatment)
生成模型,将预测添加到原始 df,然后绘图
m.sleep <- lmer(Reaction ~ Treatment*height + (1 + Days|Subject), data=sleepstudy)
sleepstudy$pred <- predict(m.sleep)
ggplot(sleepstudy, aes(height, pred, col=Treatment)) + geom_smooth(method="lm")[2]
按照 Bolker 方法计算置信区间
newdf <- expand.grid(height=seq(165, 185, 1),
Treatment=c("Control","Drug"))
newdf$Reaction <- predict(m.sleep, newdf, re.form=NA)
modmat <- model.matrix(terms(m.sleep), newdf)
pvar1 <- diag(modmat %*% tcrossprod(vcov(m.sleep), modmat))
tvar1 <- pvar1+VarCorr(m.sleep)$Subject[1]
cmult <- 1.96
newdf <- data.frame(newdf
,plo = newdf$Reaction-cmult*sqrt(pvar1)
,phi = newdf$Reaction+cmult*sqrt(pvar1)
,tlo = newdf$Reaction-cmult*sqrt(tvar1)
,thi = newdf$Reaction+cmult*sqrt(tvar1))
# plot confidence intervals
ggplot(newdf, aes(x=height, y=Reaction, colour=Treatment)) +
geom_point() +
geom_ribbon(aes(ymin=plo, ymax=phi, fill=Treatment), alpha=0.4)[2]
经过一些调整,这似乎是一致的。置信区间确实更大,但不会大很多。请记住,ggplot 适合 非常 不同的模型;它通过忽略 (1) 重复测量和 (2) 日效应的处理来拟合单独的线性(非线性混合)模型。
拟合随机斜率但没有人口水平斜率(e.g.see here)的模型似乎很奇怪,所以我添加了Days
的固定效应:
m.sleep <- lmer(Reaction ~ Treatment*height + Days +
(1 + Days|Subject),
data=sleepstudy)
我稍微重新组织了绘图代码:
theme_set(theme_bw())
gg0 <- ggplot(sleepstudy, aes(height, colour=Treatment)) +
geom_point(aes(y=Reaction))+
geom_smooth(aes(y=pred), method="lm")
- 如果你想计算置信区间(这与
lm()
/ggplot2
所做的相当),那么你可能应该 而不是 添加VarCorr(m.sleep)$Subject[1]
到方差(来自 FAQ example 的tvar1
变量用于创建 预测区间 而不是置信区间 ...) - 因为我在上面的模型中有
Days
,所以我将mean(sleepstudy$Days)
添加到预测数据框。
newdf <- expand.grid(height=seq(165, 185, 1),
Treatment=c("Control","Drug"),
Days=mean(sleepstudy$Days))
newdf$Reaction <- newdf$pred <- predict(m.sleep, newdf, re.form=NA)
modmat <- model.matrix(terms(m.sleep), newdf)
pvar1 <- diag(modmat %*% tcrossprod(vcov(m.sleep), modmat))
tvar1 <- pvar1
cmult <- 1.96
newdf <- data.frame(newdf
,plo = newdf$Reaction-cmult*sqrt(pvar1)
,phi = newdf$Reaction+cmult*sqrt(pvar1)
,tlo = newdf$Reaction-cmult*sqrt(tvar1)
,thi = newdf$Reaction+cmult*sqrt(tvar1))
gg0 +
geom_point(data=newdf,aes(y=Reaction)) +
geom_ribbon(data=newdf,
aes(ymin=plo, ymax=phi, fill=Treatment), alpha=0.4,
colour=NA)
与估计的斜率和标准误差进行比较:
m0 <- lm(Reaction~height*Treatment,sleepstudy)
ff <- function(m) {
print(coef(summary(m))[-1,c("Estimate","Std. Error")],digits=2)
}
> ff(m0)
## Estimate Std. Error
## height -0.3 0.94
## TreatmentDrug -602.2 234.01
## height:TreatmentDrug 3.5 1.34
ff(m.sleep)
## Estimate Std. Error
## TreatmentDrug -55.03 425.3
## height 0.41 1.7
## Days 10.47 1.5
## TreatmentDrug:height 0.33 2.4
这看起来 consistent/about 正确:混合模型给出了相对于高度和 height:treatment 相互作用的斜率更大的标准误差。 (TreatmentDrug
的主要效果看起来很疯狂,因为它们是 height==0
治疗的预期效果 ...)
作为交叉检查,我可以从 sjPlot::plot_model()
...
library(sjPlot)
plot_model(m.sleep, type="pred", terms=c("height","Treatment"))