在 quantreg 框架之外的 R 中引导 CI 进行分位数回归
Bootstrapping CI for a quantile regression in R outside the quantreg framework
我有一个对象,model1
,来自分位数回归。在 model1
中,我有 3 列和 99 行,步长为 1 个百分点,如下所示:
> model1
tau intercept julian_day
1 0.01 17.25584 -0.02565709
2 0.02 19.11576 -0.04716498
3 0.03 20.92364 -0.07115870
4 0.04 22.74398 -0.09706853
5 0.05 24.54071 -0.12248334
6 0.06 26.19349 -0.14133451
7 0.07 27.83602 -0.15839657
8 0.08 29.64554 -0.18364227
我想使用引导方法计算并绘制截距和变量值的置信区间。
我没有使用 quantreg
包进行分位数回归,而是使用了这个方法:
#Functions
minimize.logcosh <- function(par, X, y, tau) {
diff <- y-(X %*% par)
check <- (tau-0.5)*diff+(0.5/0.7)*logcosh(0.7*diff)+0.4
return(sum(check))
}
smrq <- function(X, y, tau){
p <- ncol(X)
op.result <- optim(
rep(0, p),
fn = minimize.logcosh,
method = 'BFGS',
X = X,
y = y,
tau = tau
)
beta <- op.result$par
return(beta)
}
run_smrq <- function(data, fml, response) {
x <- model.matrix(fml, data) #modify
y <- data[[response]]
#X <- cbind(x, rep(1,nrow(x)))
X <- x
n <- 99
betas <- sapply(1:n, function(i) smrq(X, y, tau=i/(n+1)))
return(betas)
}
#Callers
smrq_models <- df %>%
group_by(lat_grouped) %>%
group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day"))
这是 df
的样子:
> dput(head(df, 20))
structure(list(lat = c("59", "59", "55", "59", "59", "63", "59",
"59", "59", "59", "63", "59", "59", "59", "57", "56", "56", "59",
"63", "63"), long = c(18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 18), date = c("1951-03-22",
"1951-04-08", "1952-02-03", "1952-03-08", "1953-02-22", "1953-03-12",
"1954-01-16", "1954-02-06", "1954-03-14", "1954-03-28", "1954-04-02",
"1955-01-23", "1955-03-06", "1955-03-13", "1955-04-08", "1955-04-11",
"1955-04-12", "1956-03-25", "1956-04-01", "1956-04-02"), julian_day = c(81,
98, 34, 68, 53, 71, 16, 37, 73, 87, 92, 23, 65, 72, 98, 101,
102, 85, 92, 93), year = c(1951L, 1951L, 1952L, 1952L, 1953L,
1953L, 1954L, 1954L, 1954L, 1954L, 1954L, 1955L, 1955L, 1955L,
1955L, 1955L, 1955L, 1956L, 1956L, 1956L), decade = c("1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959"), time = c(10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L), lat_grouped = c("1", "1", "1",
"1", "1", "2", "1", "1", "1", "1", "2", "1", "1", "1", "1", "1",
"1", "1", "2", "2"), year_centered = structure(c(-36, -36, -35,
-35, -34, -34, -33, -33, -33, -33, -33, -32, -32, -32, -32, -32,
-32, -31, -31, -31), class = "AsIs")), row.names = 24:43, class = "data.frame")
关于如何实现我的目标,您有什么建议吗?我已经尝试了很多在互联网上找到的引导功能,但到目前为止还没有成功。
非常感谢您的帮助,如果我遗漏了什么,请不要犹豫,我可以更新我的post!
首先,问题的代码中缺少一个函数。
logcosh <- function(x) log(cosh(x))
至于这个问题,写一个函数来对data.frame的行进行替换采样,运行分位数回归就像问题中一样。无需将它们保存在对象中 smrq_models
因为结果会立即返回给调用者。
请注意,我已从函数 run_smrq
中删除常量 n = 99
,它现在是一个参数。
然后循环调用boot函数
最后,要进行自举估计,必须首先将分位数回归代码返回的列表制成一个数组,以便 apply
计算平均值。分配行名称,我们就完成了。
library(tidyverse)
boot_fun <- function(data, n) {
i <- sample(nrow(data), nrow(data), replace = TRUE)
df <- data[i, ]
df %>%
group_by(lat_grouped) %>%
group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day", n=n))
}
set.seed(2022)
n <- 99L
R <- 10L
boot_smrq_models <- vector("list", length = R)
for(i in seq.int(R)) {
boot_smrq_models[[i]] <- boot_fun(df, n)
}
l <- length(boot_smrq_models[[1]])
smrq_models_all <- vector("list", length = l)
smrq_models_int <- vector("list", length = l)
for(i in seq.int(l)) {
tmp <- array(dim = c(R, dim(boot_smrq_models[[1]][[i]])))
for(j in seq.int(R)) {
tmp[j, , ] <- boot_smrq_models[[j]][[i]]
}
smrq_models_all[[i]] <- t(apply(tmp, 2:3, mean))
smrq_models_int[[i]] <- apply(tmp, 2:3, quantile, probs = c(0.025, 0.975))
rownames(smrq_models_all[[i]]) <- sprintf("tau_%03.02f", (1:99)/(99+1))
}
lapply(smrq_models_all, head)
#> [[1]]
#> [,1] [,2]
#> tau_0.01 -23.173185 -1.4624225
#> tau_0.02 3.509690 -0.6640458
#> tau_0.03 -13.748280 -1.2172158
#> tau_0.04 -38.373045 -1.9780045
#> tau_0.05 -8.166044 -1.0890235
#> tau_0.06 51.701066 0.6258957
#>
#> [[2]]
#> [,1] [,2]
#> tau_0.01 1.387311 -2.315114
#> tau_0.02 36.060786 -1.311070
#> tau_0.03 9.177731 -2.111396
#> tau_0.04 22.320567 -1.731685
#> tau_0.05 40.143578 -1.212108
#> tau_0.06 19.900275 -1.812859
由 reprex package (v2.0.1)
创建于 2022-05-22
我有一个对象,model1
,来自分位数回归。在 model1
中,我有 3 列和 99 行,步长为 1 个百分点,如下所示:
> model1
tau intercept julian_day
1 0.01 17.25584 -0.02565709
2 0.02 19.11576 -0.04716498
3 0.03 20.92364 -0.07115870
4 0.04 22.74398 -0.09706853
5 0.05 24.54071 -0.12248334
6 0.06 26.19349 -0.14133451
7 0.07 27.83602 -0.15839657
8 0.08 29.64554 -0.18364227
我想使用引导方法计算并绘制截距和变量值的置信区间。
我没有使用 quantreg
包进行分位数回归,而是使用了这个方法:
#Functions
minimize.logcosh <- function(par, X, y, tau) {
diff <- y-(X %*% par)
check <- (tau-0.5)*diff+(0.5/0.7)*logcosh(0.7*diff)+0.4
return(sum(check))
}
smrq <- function(X, y, tau){
p <- ncol(X)
op.result <- optim(
rep(0, p),
fn = minimize.logcosh,
method = 'BFGS',
X = X,
y = y,
tau = tau
)
beta <- op.result$par
return(beta)
}
run_smrq <- function(data, fml, response) {
x <- model.matrix(fml, data) #modify
y <- data[[response]]
#X <- cbind(x, rep(1,nrow(x)))
X <- x
n <- 99
betas <- sapply(1:n, function(i) smrq(X, y, tau=i/(n+1)))
return(betas)
}
#Callers
smrq_models <- df %>%
group_by(lat_grouped) %>%
group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day"))
这是 df
的样子:
> dput(head(df, 20))
structure(list(lat = c("59", "59", "55", "59", "59", "63", "59",
"59", "59", "59", "63", "59", "59", "59", "57", "56", "56", "59",
"63", "63"), long = c(18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 18), date = c("1951-03-22",
"1951-04-08", "1952-02-03", "1952-03-08", "1953-02-22", "1953-03-12",
"1954-01-16", "1954-02-06", "1954-03-14", "1954-03-28", "1954-04-02",
"1955-01-23", "1955-03-06", "1955-03-13", "1955-04-08", "1955-04-11",
"1955-04-12", "1956-03-25", "1956-04-01", "1956-04-02"), julian_day = c(81,
98, 34, 68, 53, 71, 16, 37, 73, 87, 92, 23, 65, 72, 98, 101,
102, 85, 92, 93), year = c(1951L, 1951L, 1952L, 1952L, 1953L,
1953L, 1954L, 1954L, 1954L, 1954L, 1954L, 1955L, 1955L, 1955L,
1955L, 1955L, 1955L, 1956L, 1956L, 1956L), decade = c("1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959"), time = c(10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L), lat_grouped = c("1", "1", "1",
"1", "1", "2", "1", "1", "1", "1", "2", "1", "1", "1", "1", "1",
"1", "1", "2", "2"), year_centered = structure(c(-36, -36, -35,
-35, -34, -34, -33, -33, -33, -33, -33, -32, -32, -32, -32, -32,
-32, -31, -31, -31), class = "AsIs")), row.names = 24:43, class = "data.frame")
关于如何实现我的目标,您有什么建议吗?我已经尝试了很多在互联网上找到的引导功能,但到目前为止还没有成功。
非常感谢您的帮助,如果我遗漏了什么,请不要犹豫,我可以更新我的post!
首先,问题的代码中缺少一个函数。
logcosh <- function(x) log(cosh(x))
至于这个问题,写一个函数来对data.frame的行进行替换采样,运行分位数回归就像问题中一样。无需将它们保存在对象中 smrq_models
因为结果会立即返回给调用者。
请注意,我已从函数 run_smrq
中删除常量 n = 99
,它现在是一个参数。
然后循环调用boot函数
最后,要进行自举估计,必须首先将分位数回归代码返回的列表制成一个数组,以便 apply
计算平均值。分配行名称,我们就完成了。
library(tidyverse)
boot_fun <- function(data, n) {
i <- sample(nrow(data), nrow(data), replace = TRUE)
df <- data[i, ]
df %>%
group_by(lat_grouped) %>%
group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day", n=n))
}
set.seed(2022)
n <- 99L
R <- 10L
boot_smrq_models <- vector("list", length = R)
for(i in seq.int(R)) {
boot_smrq_models[[i]] <- boot_fun(df, n)
}
l <- length(boot_smrq_models[[1]])
smrq_models_all <- vector("list", length = l)
smrq_models_int <- vector("list", length = l)
for(i in seq.int(l)) {
tmp <- array(dim = c(R, dim(boot_smrq_models[[1]][[i]])))
for(j in seq.int(R)) {
tmp[j, , ] <- boot_smrq_models[[j]][[i]]
}
smrq_models_all[[i]] <- t(apply(tmp, 2:3, mean))
smrq_models_int[[i]] <- apply(tmp, 2:3, quantile, probs = c(0.025, 0.975))
rownames(smrq_models_all[[i]]) <- sprintf("tau_%03.02f", (1:99)/(99+1))
}
lapply(smrq_models_all, head)
#> [[1]]
#> [,1] [,2]
#> tau_0.01 -23.173185 -1.4624225
#> tau_0.02 3.509690 -0.6640458
#> tau_0.03 -13.748280 -1.2172158
#> tau_0.04 -38.373045 -1.9780045
#> tau_0.05 -8.166044 -1.0890235
#> tau_0.06 51.701066 0.6258957
#>
#> [[2]]
#> [,1] [,2]
#> tau_0.01 1.387311 -2.315114
#> tau_0.02 36.060786 -1.311070
#> tau_0.03 9.177731 -2.111396
#> tau_0.04 22.320567 -1.731685
#> tau_0.05 40.143578 -1.212108
#> tau_0.06 19.900275 -1.812859
由 reprex package (v2.0.1)
创建于 2022-05-22