mgcv:标准误差在 predict.bam() 中与 discrete = true 不同
mgcv: standard errors differ in predict.bam() with discrete = true
我正在使用 bam()
和 discrete=TRUE
拟合大型多级模型以加快计算速度。然后我想根据该模型进行预测,同时包括或忽略某些随机效应项,我知道这可以通过 predict.bam()
的 terms
参数来完成。但是当我更改 predict.bam()
中的 discrete
选项时,我发现结果不一致,我不确定哪个是正确的。
使用所有平滑术语时,一切看起来都很好。但是当只选择一些带有 discrete = TRUE
的平滑项时,预测值与装有 gam()
的模型相同,但标准误差不同。这有时会使它们膨胀,有时会降低标准误差。使用 discrete=FALSE
产生的结果与装有 gam()
的模型一致。那么 predict.bam()
某处有错误吗?哪种方法计算正确?
这是一个可重现的输出示例:
library(lme4)
library(mgcv)
data(sleepstudy)
model <- gam(Reaction ~ Days + s(Subject, bs = "re") + s(Days, Subject, bs = "re"),
data = sleepstudy,
method = "fREML"
)
model_d <- bam(Reaction ~ Days + s(Subject, bs = "re") + s(Days, Subject, bs = "re"),
data = sleepstudy,
method = "fREML"
,discrete=TRUE
)
## including all smooth terms is fine
head(
data.frame(
gam1 = predict(model),
gam2 = predict(model_d, discrete=TRUE),
gam3 = predict(model_d, discrete=FALSE)
)
)
# gam1 gam2 gam3
# 1 252.9178 252.9178 252.9178
# 2 272.7086 272.7086 272.7086
# 3 292.4994 292.4994 292.4994
# 4 312.2901 312.2901 312.2901
# 5 332.0809 332.0809 332.0809
# 6 351.8717 351.8717 351.8717
head(
data.frame(
gam1 = predict(model,se.fit=TRUE)$se.fit,
gam2 = predict(model_d, discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, discrete=FALSE, se.fit=TRUE)$se.fit
)
)
# gam1 gam2 gam3
# 1 12.410215 12.410215 12.410215
# 2 10.660886 10.660886 10.660886
# 3 9.191220 9.191220 9.191220
# 4 8.153867 8.153867 8.153867
# 5 7.724996 7.724996 7.724996
# 6 8.003034 8.003034 8.003034
## ---- selecting only some smooth terms
## with discrete = TRUE, predicted values are the same but
## standard errors returned are the same as those with all smooths included.
## This sometimes inflates them and sometimes reduces them.
head(
data.frame(
gam1 = predict(model, terms=c("s(Subject)")),
gam2 = predict(model_d, terms=c("s(Subject)")),
gam3 = predict(model_d, terms=c("s(Subject)"))
)
)
# gam1 gam2 gam3
# 1 252.9178 252.9178 252.9178
# 2 263.3851 272.7086 272.7086
# 3 273.8524 292.4994 292.4994
# 4 284.3197 312.2901 312.2901
# 5 294.7869 332.0809 332.0809
# 6 305.2542 351.8717 351.8717
head(
data.frame(
gam1 = predict(model, terms=c("s(Subject)"),se.fit=TRUE)$se.fit,
gam2 = predict(model_d, terms=c("s(Subject)"), discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, terms=c("s(Subject)"), discrete=FALSE, se.fit=TRUE)$se.fit
)
)
# gam1 gam2 gam3
# 1 12.41021 12.410215 12.41021
# 2 12.34846 10.660886 12.34846
# 3 12.48280 9.191220 12.48280
# 4 12.80704 8.153867 12.80704
# 5 13.30733 7.724996 13.30733
# 6 13.96474 8.003034 13.96474
head(
data.frame(
gam1 = predict(model, terms=c("s(Days, Subject)"),se.fit=TRUE)$se.fit,
gam2 = predict(model_d, terms=c("s(Days, Subject)"), discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, terms=c("s(Days, Subject)"), discrete=FALSE, se.fit=TRUE)$se.fit
)
)
# gam1 gam2 gam3
# 1 6.885381 12.410215 6.885381
# 2 6.773449 10.660886 6.773449
# 3 7.015357 9.191220 7.015357
# 4 7.577292 8.153867 7.577292
# 5 8.395234 7.724996 8.395234
# 6 9.402609 8.003034 9.402609
已更新
我做了更多的挖掘,可能已经回答了我自己的问题,但仍然感谢任何能提供更多专业知识的人。
我尝试使用 predict(..., method="lpmatrix")
手动计算事物,遵循 Gavin Simpson 的这个有用的 blog post。
看起来 discrete=TRUE
输出是错误的,这是某种错误。
这段代码接续前面的代码:
### ---- manual computation with simulation via lpmatrix
mvrnorm <- MASS::mvrnorm
lp <- predict(model_d, type = "lpmatrix")
coefs <- coef(model_d)
vc <- vcov(model_d)
set.seed(123)
sim <- mvrnorm(5e4, mu = coefs, Sigma = vc)
fits <- lp %*% t(sim)
se.fit <- apply(fits, 1, sd)
## with all effects
head(
data.frame(
gam1 = predict(model,se.fit=TRUE)$se.fit,
gam2 = predict(model_d, discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, discrete=FALSE, se.fit=TRUE)$se.fit,
man = se.fit
)
)
# gam1 gam2 gam3 man
# 1 12.410220 12.410215 12.410215 12.453005
# 2 10.660891 10.660886 10.660886 10.704449
# 3 9.191224 9.191220 9.191220 9.235621
# 4 8.153871 8.153867 8.153867 8.198276
# 5 7.724998 7.724996 7.724996 7.767261
# 6 8.003034 8.003034 8.003034 8.040678
## ---- with only s(Subject) random effects
want <- c(c(1,2), grep("s\(Subject\)", colnames(lp))) # regex is obnoxious here
fits <- lp[, want] %*% t(sim[, want])
se.fit <- apply(fits, 1, sd)
head(
data.frame(
gam1 = predict(model, terms=c("s(Subject)"),se.fit=TRUE)$se.fit,
gam2 = predict(model_d, terms=c("s(Subject)"), discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, terms=c("s(Subject)"), discrete=FALSE, se.fit=TRUE)$se.fit,
man = se.fit
)
)
# gam1 gam2 gam3 man
# 1 12.41022 12.410215 12.41021 12.45300
# 2 12.34847 10.660886 12.34846 12.39594
# 3 12.48280 9.191220 12.48280 12.53395
# 4 12.80704 8.153867 12.80704 12.86074
# 5 13.30733 7.724996 13.30733 13.36248
# 6 13.96474 8.003034 13.96474 14.02039
这是离散预测代码中的错误。已修复 mgcv_1.8-32。谢谢!西蒙
我正在使用 bam()
和 discrete=TRUE
拟合大型多级模型以加快计算速度。然后我想根据该模型进行预测,同时包括或忽略某些随机效应项,我知道这可以通过 predict.bam()
的 terms
参数来完成。但是当我更改 predict.bam()
中的 discrete
选项时,我发现结果不一致,我不确定哪个是正确的。
使用所有平滑术语时,一切看起来都很好。但是当只选择一些带有 discrete = TRUE
的平滑项时,预测值与装有 gam()
的模型相同,但标准误差不同。这有时会使它们膨胀,有时会降低标准误差。使用 discrete=FALSE
产生的结果与装有 gam()
的模型一致。那么 predict.bam()
某处有错误吗?哪种方法计算正确?
这是一个可重现的输出示例:
library(lme4)
library(mgcv)
data(sleepstudy)
model <- gam(Reaction ~ Days + s(Subject, bs = "re") + s(Days, Subject, bs = "re"),
data = sleepstudy,
method = "fREML"
)
model_d <- bam(Reaction ~ Days + s(Subject, bs = "re") + s(Days, Subject, bs = "re"),
data = sleepstudy,
method = "fREML"
,discrete=TRUE
)
## including all smooth terms is fine
head(
data.frame(
gam1 = predict(model),
gam2 = predict(model_d, discrete=TRUE),
gam3 = predict(model_d, discrete=FALSE)
)
)
# gam1 gam2 gam3
# 1 252.9178 252.9178 252.9178
# 2 272.7086 272.7086 272.7086
# 3 292.4994 292.4994 292.4994
# 4 312.2901 312.2901 312.2901
# 5 332.0809 332.0809 332.0809
# 6 351.8717 351.8717 351.8717
head(
data.frame(
gam1 = predict(model,se.fit=TRUE)$se.fit,
gam2 = predict(model_d, discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, discrete=FALSE, se.fit=TRUE)$se.fit
)
)
# gam1 gam2 gam3
# 1 12.410215 12.410215 12.410215
# 2 10.660886 10.660886 10.660886
# 3 9.191220 9.191220 9.191220
# 4 8.153867 8.153867 8.153867
# 5 7.724996 7.724996 7.724996
# 6 8.003034 8.003034 8.003034
## ---- selecting only some smooth terms
## with discrete = TRUE, predicted values are the same but
## standard errors returned are the same as those with all smooths included.
## This sometimes inflates them and sometimes reduces them.
head(
data.frame(
gam1 = predict(model, terms=c("s(Subject)")),
gam2 = predict(model_d, terms=c("s(Subject)")),
gam3 = predict(model_d, terms=c("s(Subject)"))
)
)
# gam1 gam2 gam3
# 1 252.9178 252.9178 252.9178
# 2 263.3851 272.7086 272.7086
# 3 273.8524 292.4994 292.4994
# 4 284.3197 312.2901 312.2901
# 5 294.7869 332.0809 332.0809
# 6 305.2542 351.8717 351.8717
head(
data.frame(
gam1 = predict(model, terms=c("s(Subject)"),se.fit=TRUE)$se.fit,
gam2 = predict(model_d, terms=c("s(Subject)"), discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, terms=c("s(Subject)"), discrete=FALSE, se.fit=TRUE)$se.fit
)
)
# gam1 gam2 gam3
# 1 12.41021 12.410215 12.41021
# 2 12.34846 10.660886 12.34846
# 3 12.48280 9.191220 12.48280
# 4 12.80704 8.153867 12.80704
# 5 13.30733 7.724996 13.30733
# 6 13.96474 8.003034 13.96474
head(
data.frame(
gam1 = predict(model, terms=c("s(Days, Subject)"),se.fit=TRUE)$se.fit,
gam2 = predict(model_d, terms=c("s(Days, Subject)"), discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, terms=c("s(Days, Subject)"), discrete=FALSE, se.fit=TRUE)$se.fit
)
)
# gam1 gam2 gam3
# 1 6.885381 12.410215 6.885381
# 2 6.773449 10.660886 6.773449
# 3 7.015357 9.191220 7.015357
# 4 7.577292 8.153867 7.577292
# 5 8.395234 7.724996 8.395234
# 6 9.402609 8.003034 9.402609
已更新
我做了更多的挖掘,可能已经回答了我自己的问题,但仍然感谢任何能提供更多专业知识的人。
我尝试使用 predict(..., method="lpmatrix")
手动计算事物,遵循 Gavin Simpson 的这个有用的 blog post。
看起来 discrete=TRUE
输出是错误的,这是某种错误。
这段代码接续前面的代码:
### ---- manual computation with simulation via lpmatrix
mvrnorm <- MASS::mvrnorm
lp <- predict(model_d, type = "lpmatrix")
coefs <- coef(model_d)
vc <- vcov(model_d)
set.seed(123)
sim <- mvrnorm(5e4, mu = coefs, Sigma = vc)
fits <- lp %*% t(sim)
se.fit <- apply(fits, 1, sd)
## with all effects
head(
data.frame(
gam1 = predict(model,se.fit=TRUE)$se.fit,
gam2 = predict(model_d, discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, discrete=FALSE, se.fit=TRUE)$se.fit,
man = se.fit
)
)
# gam1 gam2 gam3 man
# 1 12.410220 12.410215 12.410215 12.453005
# 2 10.660891 10.660886 10.660886 10.704449
# 3 9.191224 9.191220 9.191220 9.235621
# 4 8.153871 8.153867 8.153867 8.198276
# 5 7.724998 7.724996 7.724996 7.767261
# 6 8.003034 8.003034 8.003034 8.040678
## ---- with only s(Subject) random effects
want <- c(c(1,2), grep("s\(Subject\)", colnames(lp))) # regex is obnoxious here
fits <- lp[, want] %*% t(sim[, want])
se.fit <- apply(fits, 1, sd)
head(
data.frame(
gam1 = predict(model, terms=c("s(Subject)"),se.fit=TRUE)$se.fit,
gam2 = predict(model_d, terms=c("s(Subject)"), discrete=TRUE, se.fit=TRUE)$se.fit,
gam3 = predict(model_d, terms=c("s(Subject)"), discrete=FALSE, se.fit=TRUE)$se.fit,
man = se.fit
)
)
# gam1 gam2 gam3 man
# 1 12.41022 12.410215 12.41021 12.45300
# 2 12.34847 10.660886 12.34846 12.39594
# 3 12.48280 9.191220 12.48280 12.53395
# 4 12.80704 8.153867 12.80704 12.86074
# 5 13.30733 7.724996 13.30733 13.36248
# 6 13.96474 8.003034 13.96474 14.02039
这是离散预测代码中的错误。已修复 mgcv_1.8-32。谢谢!西蒙