Bootstrap R中非线性优化的参数估计:为什么它与常规参数估计不同?
Bootstrap parameter estimate of non-linear optimization in R: Why is it different than the regular parameter estimate?
这是我的问题的简短版本。代码如下。
我使用 optim() 计算了 R 中非线性 von Bertalanffy 增长方程的参数,现在我试图通过 bootstrapping 将 95% 的置信区间添加到 von B 增长系数 K .对于至少一年的数据,当我总结增长系数 K 的 bootstrapped 输出时,bootstrapping 的均值和中值参数估计值与估计参数有很大不同:
>summary(temp.store) # summary of bootstrap values
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.002449 0.005777 0.010290 0.011700 0.016970 0.056720
> est.K [1] 0.01655956 # point-estimate from the optimization
我怀疑差异是因为随机抽取的 bootstrap 中存在错误,导致结果出现偏差,尽管我使用 try() 来阻止优化在输入组合时崩溃导致错误的值。所以我想知道如何解决这个问题。我认为我做的事情是正确的,因为拟合曲线看起来是正确的。
此外,我有 运行 其他年份数据的代码,并且至少在另外一年中,bootrap 估计值和常规估计值非常接近。
冗长的版本:
长度的 von Bertalanffy 增长曲线 (VBGC) 由下式给出:
L(t) = L.inf * [1 - exp(-K*(t-t0))] (等式 3.1.0.1,来自 FAO)
其中 L(t) 是鱼的长度,L.inf 是渐近最大长度,K 是生长系数,t 是时间步长,t0 是生长开始的时间。 L(t) 和 t 是观测数据。通常时间或年龄以年为单位,但在这里我查看的是幼鱼数据,我将 t 定为一年中的一天 ("doy") 从 1 月 1 日开始 = 1.
为了估计优化的起始参数,我使用了 VBGC 方程的线性化。
doy <- c(156,205,228,276,319,380)
len <- c(36,56,60,68,68,71)
data06 <- data.frame(doy,len)
获取优化起始参数的函数:
get.init <-function(dframe){ # linearization of the von B
l.inf <- 80 # by eyeballing max juvenile fish
# make a response variable and store it in the data frame:
# Eqn. 3.3.3.1 in FAO document
dframe$vonb.y <- - log(1 - (dframe$len)/l.inf )
lin.vonb <- lm(vonb.y ~ doy, data=dframe)
icept <- lin.vonb$coef[1] # 0.01534013 # intercept is a
slope <- k.lin <- lin.vonb$coef[2] # slope is the K param
t0 <- - icept/slope # get t0 from this relship: intercept= -K * t0
pars <- c(l.inf,as.numeric(slope),as.numeric(t0))
}
von Bertalanffy 增长方程的平方和
vbl.ssq <- function(theta, data){
linf=theta[1]; k=theta[2]; t0=theta[3]
# name variables for ease of use
obs.length=data$len
age=data$doy
#von B equation
pred.length=linf*(1-exp(-k*(age-t0)))
#sums of squares
ssq=sum((obs.length-pred.length)^2)
}
估计参数
#Get starting parameter values
theta_init <- get.init(dframe=data06)
# optimize VBGC by minimizing sums of square differences
len.fit <- optim(par=theta_init, fn=vbl.ssq, method="BFGS", data=data06)
est.linf <- len.fit$par[1] # vonB len-infinite
est.K <- len.fit$par[2] # vonB K
est.t0 <- len.fit$par[3] # vonB t0
自举
# set up for bootstrap loop
tmp.frame <- data.frame()
temp.store <- vector()
# bootstrap to get 95% conf ints on growth coef K
for (j in 1:1000){
# choose indices at random, with replacement
indices <- sample(1:length(data06[,1]),replace=T)
# values from original data corresponding to those indices
new.len <- data06$len[indices]
new.doy <- data06$doy[indices]
tmp.frame <- data.frame(new.doy,new.len)
colnames(tmp.frame) <- c("doy","len")
init.par <- get.init(tmp.frame)
# now get the vonB params for the randomly selected samples
# using try() to keep optimizing errors from crashing the program
try( len.fit.bs <- optim(par=init.par, fn=vbl.ssq, method="BFGS", data=tmp.frame))
tmp.k <- len.fit.bs$par[2]
temp.store[j] <- tmp.k
}
K 参数的 95% 置信区间
k.ci <- quantile(temp.store,c(0.025,0.975))
# 2.5% 97.5%
#0.004437702 0.019784178
这是问题所在:
#>summary(temp.store)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.002449 0.005777 0.010290 0.011700 0.016970 0.056720
#
# est.K [1] 0.01655956
错误示例:
Error in optim(par = init.par, fn = vbl.ssq, method = "BFGS", data = tmp.frame) :
non-finite finite-difference value [2]
我认为我在优化方面没有犯任何错误,因为 VBGC 拟合看起来很合理。以下是情节:
plot(x=data06$doy,y=data06$len,xlim=c(0,550),ylim=c(0,100))
legend(x="topleft",legend=paste("Length curve 2006"), bty="n")
curve(est.linf*(1-exp(-est.K*(x-est.t0))), add=T,type="l")
plot(x=2006,y=est.K, main="von B growth coefficient for length; 95% CIs",
ylim=c(0,0.025))
arrows(x0=2006,y0=k.ci[1],x1=2006,y1=k.ci[2], code=3,
angle=90,length=0.1)
首先,您的值非常少,可能太少以至于无法相信 bootstrap 方法。然后经典 bootstrap 的大部分拟合失败,因为由于重采样,您通常没有足够的不同 x 值。
这是一个使用 nls
和自启动模型和启动包的实现。
doy <- c(156,205,228,276,319,380)
len <- c(36,56,60,68,68,71)
data06 <- data.frame(doy,len)
plot(len ~ doy, data = data06)
fit <- nls(len ~ SSasympOff(doy, Asym, lrc, c0), data = data06)
summary(fit)
#profiling CI
proCI <- confint(fit)
# 2.5% 97.5%
#Asym 68.290477 75.922174
#lrc -4.453895 -3.779994
#c0 94.777335 126.112523
curve(predict(fit, newdata = data.frame(doy = x)), add = TRUE)
#classic bootstrap
library(boot)
set.seed(42)
boot1 <- boot(data06, function(DF, i) {
tryCatch(coef(nls(len ~ SSasympOff(doy, Asym, lrc, c0), data = DF[i,])),
error = function(e) c(Asym = NA, lrc = NA, c0 = NA))
}, R = 1e3)
#proportion of unsuccessful fits
mean(is.na(boot1$t[, 1]))
#[1] 0.256
#bootstrap CI
boot1CI <- apply(boot1$t, 2, quantile, probs = c(0.025, 0.5, 0.975), na.rm = TRUE)
# [,1] [,2] [,3]
#2.5% 69.70360 -4.562608 67.60152
#50% 71.56527 -4.100148 113.9287
#97.5% 74.79921 -3.697461 151.03541
#bootstrap of the residuals
data06$res <- residuals(fit)
data06$fit <- fitted(fit)
set.seed(42)
boot2 <- boot(data06, function(DF, i) {
DF$lenboot <- DF$fit + DF[i, "res"]
tryCatch(coef(nls(lenboot ~ SSasympOff(doy, Asym, lrc, c0), data = DF)),
error = function(e) c(Asym = NA, lrc = NA, c0 = NA))
}, R = 1e3)
#proportion of unsuccessful fits
mean(is.na(boot2$t[, 1]))
#[1] 0
#(residuals) bootstrap CI
boot2CI <- apply(boot2$t, 2, quantile, probs = c(0.025, 0.5, 0.975), na.rm = TRUE)
# [,1] [,2] [,3]
#2.5% 70.19380 -4.255165 106.3136
#50% 71.56527 -4.100148 113.9287
#97.5% 73.37461 -3.969012 119.2380
proCI[2,1]
CIs_k <- data.frame(lwr = c(exp(proCI[2, 1]),
exp(boot1CI[1, 2]),
exp(boot2CI[1, 2])),
upr = c(exp(proCI[2, 2]),
exp(boot1CI[3, 2]),
exp(boot2CI[3, 2])),
med = c(NA,
exp(boot1CI[2, 2]),
exp(boot2CI[2, 2])),
estimate = exp(coef(fit)[2]),
method = c("profile", "boot", "boot res"))
library(ggplot2)
ggplot(CIs_k, aes(y = estimate, ymin = lwr, ymax = upr, x = method)) +
geom_errorbar() +
geom_point(aes(color = "estimate"), size = 5) +
geom_point(aes(y = med, color = "boot median"), size = 5) +
ylab("k") + xlab("") +
scale_color_brewer(name = "", type = "qual", palette = 2) +
theme_bw(base_size = 22)
如您所见,bootstrap CI 比配置文件 CI 宽并且 bootstrapping 残差导致更窄的估计 CI .它们几乎都是对称的。此外,中位数接近点估计。
作为调查代码错误的第一步,您应该查看程序中失败的比例。
这是我的问题的简短版本。代码如下。
我使用 optim() 计算了 R 中非线性 von Bertalanffy 增长方程的参数,现在我试图通过 bootstrapping 将 95% 的置信区间添加到 von B 增长系数 K .对于至少一年的数据,当我总结增长系数 K 的 bootstrapped 输出时,bootstrapping 的均值和中值参数估计值与估计参数有很大不同:
>summary(temp.store) # summary of bootstrap values
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.002449 0.005777 0.010290 0.011700 0.016970 0.056720
> est.K [1] 0.01655956 # point-estimate from the optimization
我怀疑差异是因为随机抽取的 bootstrap 中存在错误,导致结果出现偏差,尽管我使用 try() 来阻止优化在输入组合时崩溃导致错误的值。所以我想知道如何解决这个问题。我认为我做的事情是正确的,因为拟合曲线看起来是正确的。
此外,我有 运行 其他年份数据的代码,并且至少在另外一年中,bootrap 估计值和常规估计值非常接近。
冗长的版本:
长度的 von Bertalanffy 增长曲线 (VBGC) 由下式给出: L(t) = L.inf * [1 - exp(-K*(t-t0))] (等式 3.1.0.1,来自 FAO)
其中 L(t) 是鱼的长度,L.inf 是渐近最大长度,K 是生长系数,t 是时间步长,t0 是生长开始的时间。 L(t) 和 t 是观测数据。通常时间或年龄以年为单位,但在这里我查看的是幼鱼数据,我将 t 定为一年中的一天 ("doy") 从 1 月 1 日开始 = 1.
为了估计优化的起始参数,我使用了 VBGC 方程的线性化。
doy <- c(156,205,228,276,319,380)
len <- c(36,56,60,68,68,71)
data06 <- data.frame(doy,len)
获取优化起始参数的函数:
get.init <-function(dframe){ # linearization of the von B
l.inf <- 80 # by eyeballing max juvenile fish
# make a response variable and store it in the data frame:
# Eqn. 3.3.3.1 in FAO document
dframe$vonb.y <- - log(1 - (dframe$len)/l.inf )
lin.vonb <- lm(vonb.y ~ doy, data=dframe)
icept <- lin.vonb$coef[1] # 0.01534013 # intercept is a
slope <- k.lin <- lin.vonb$coef[2] # slope is the K param
t0 <- - icept/slope # get t0 from this relship: intercept= -K * t0
pars <- c(l.inf,as.numeric(slope),as.numeric(t0))
}
von Bertalanffy 增长方程的平方和
vbl.ssq <- function(theta, data){
linf=theta[1]; k=theta[2]; t0=theta[3]
# name variables for ease of use
obs.length=data$len
age=data$doy
#von B equation
pred.length=linf*(1-exp(-k*(age-t0)))
#sums of squares
ssq=sum((obs.length-pred.length)^2)
}
估计参数
#Get starting parameter values
theta_init <- get.init(dframe=data06)
# optimize VBGC by minimizing sums of square differences
len.fit <- optim(par=theta_init, fn=vbl.ssq, method="BFGS", data=data06)
est.linf <- len.fit$par[1] # vonB len-infinite
est.K <- len.fit$par[2] # vonB K
est.t0 <- len.fit$par[3] # vonB t0
自举
# set up for bootstrap loop
tmp.frame <- data.frame()
temp.store <- vector()
# bootstrap to get 95% conf ints on growth coef K
for (j in 1:1000){
# choose indices at random, with replacement
indices <- sample(1:length(data06[,1]),replace=T)
# values from original data corresponding to those indices
new.len <- data06$len[indices]
new.doy <- data06$doy[indices]
tmp.frame <- data.frame(new.doy,new.len)
colnames(tmp.frame) <- c("doy","len")
init.par <- get.init(tmp.frame)
# now get the vonB params for the randomly selected samples
# using try() to keep optimizing errors from crashing the program
try( len.fit.bs <- optim(par=init.par, fn=vbl.ssq, method="BFGS", data=tmp.frame))
tmp.k <- len.fit.bs$par[2]
temp.store[j] <- tmp.k
}
K 参数的 95% 置信区间
k.ci <- quantile(temp.store,c(0.025,0.975))
# 2.5% 97.5%
#0.004437702 0.019784178
这是问题所在:
#>summary(temp.store)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.002449 0.005777 0.010290 0.011700 0.016970 0.056720
#
# est.K [1] 0.01655956
错误示例:
Error in optim(par = init.par, fn = vbl.ssq, method = "BFGS", data = tmp.frame) :
non-finite finite-difference value [2]
我认为我在优化方面没有犯任何错误,因为 VBGC 拟合看起来很合理。以下是情节:
plot(x=data06$doy,y=data06$len,xlim=c(0,550),ylim=c(0,100))
legend(x="topleft",legend=paste("Length curve 2006"), bty="n")
curve(est.linf*(1-exp(-est.K*(x-est.t0))), add=T,type="l")
plot(x=2006,y=est.K, main="von B growth coefficient for length; 95% CIs",
ylim=c(0,0.025))
arrows(x0=2006,y0=k.ci[1],x1=2006,y1=k.ci[2], code=3,
angle=90,length=0.1)
首先,您的值非常少,可能太少以至于无法相信 bootstrap 方法。然后经典 bootstrap 的大部分拟合失败,因为由于重采样,您通常没有足够的不同 x 值。
这是一个使用 nls
和自启动模型和启动包的实现。
doy <- c(156,205,228,276,319,380)
len <- c(36,56,60,68,68,71)
data06 <- data.frame(doy,len)
plot(len ~ doy, data = data06)
fit <- nls(len ~ SSasympOff(doy, Asym, lrc, c0), data = data06)
summary(fit)
#profiling CI
proCI <- confint(fit)
# 2.5% 97.5%
#Asym 68.290477 75.922174
#lrc -4.453895 -3.779994
#c0 94.777335 126.112523
curve(predict(fit, newdata = data.frame(doy = x)), add = TRUE)
#classic bootstrap
library(boot)
set.seed(42)
boot1 <- boot(data06, function(DF, i) {
tryCatch(coef(nls(len ~ SSasympOff(doy, Asym, lrc, c0), data = DF[i,])),
error = function(e) c(Asym = NA, lrc = NA, c0 = NA))
}, R = 1e3)
#proportion of unsuccessful fits
mean(is.na(boot1$t[, 1]))
#[1] 0.256
#bootstrap CI
boot1CI <- apply(boot1$t, 2, quantile, probs = c(0.025, 0.5, 0.975), na.rm = TRUE)
# [,1] [,2] [,3]
#2.5% 69.70360 -4.562608 67.60152
#50% 71.56527 -4.100148 113.9287
#97.5% 74.79921 -3.697461 151.03541
#bootstrap of the residuals
data06$res <- residuals(fit)
data06$fit <- fitted(fit)
set.seed(42)
boot2 <- boot(data06, function(DF, i) {
DF$lenboot <- DF$fit + DF[i, "res"]
tryCatch(coef(nls(lenboot ~ SSasympOff(doy, Asym, lrc, c0), data = DF)),
error = function(e) c(Asym = NA, lrc = NA, c0 = NA))
}, R = 1e3)
#proportion of unsuccessful fits
mean(is.na(boot2$t[, 1]))
#[1] 0
#(residuals) bootstrap CI
boot2CI <- apply(boot2$t, 2, quantile, probs = c(0.025, 0.5, 0.975), na.rm = TRUE)
# [,1] [,2] [,3]
#2.5% 70.19380 -4.255165 106.3136
#50% 71.56527 -4.100148 113.9287
#97.5% 73.37461 -3.969012 119.2380
proCI[2,1]
CIs_k <- data.frame(lwr = c(exp(proCI[2, 1]),
exp(boot1CI[1, 2]),
exp(boot2CI[1, 2])),
upr = c(exp(proCI[2, 2]),
exp(boot1CI[3, 2]),
exp(boot2CI[3, 2])),
med = c(NA,
exp(boot1CI[2, 2]),
exp(boot2CI[2, 2])),
estimate = exp(coef(fit)[2]),
method = c("profile", "boot", "boot res"))
library(ggplot2)
ggplot(CIs_k, aes(y = estimate, ymin = lwr, ymax = upr, x = method)) +
geom_errorbar() +
geom_point(aes(color = "estimate"), size = 5) +
geom_point(aes(y = med, color = "boot median"), size = 5) +
ylab("k") + xlab("") +
scale_color_brewer(name = "", type = "qual", palette = 2) +
theme_bw(base_size = 22)
如您所见,bootstrap CI 比配置文件 CI 宽并且 bootstrapping 残差导致更窄的估计 CI .它们几乎都是对称的。此外,中位数接近点估计。
作为调查代码错误的第一步,您应该查看程序中失败的比例。