在 R 中引导多个变量
Bootstrapping multiple variables in R
我想执行 bootstrap 线性回归,因为担心正态分布的误差项。我有一个大到足以忽略这一事实的数据集,但我只想仔细检查依赖于分析计算的标准误差的线性回归模型是否产生与从 bootstrapped 线性回归获得的结果相同的结果。
到目前为止,我使用了以下代码:
Rel01 <- subset(
Relevante_Variablen2,
select = c(intr, sale_py_at_py, R_at_py,
inflr, dt, re, txt)
)
x = as.data.frame(sapply(Rel01, as.numeric))
boxplot(x)
library(boot)
i<-nrow(x) #count number of rows for resampling
g<-ncol(x) #count number of columns to step through with bootstrapping
boot.mean<-function(x,i){boot.mean<-mean(x[i])} #bootstrapping function to get the mean
boot_tabl <- apply(x,2,function(y){
b<-boot(y,boot.mean,R=50000);
c(mean(b$t),boot.ci(b,type="perc", conf=0.95)$percent[4:5])
})
View(boot_tabl)
一切似乎都有效(输出在下面显示的 table 中),但我不知道如何解释我的方法的输出。
^
sale_py_at_py
intr
inflr
dt
re
txt
1
0.0984438
0.01223
0.04243
200
400
1200
2
0.0974530
0.01191
0.04122
190
300
1000
3
0.0993230
0.01291
0.04256
210
405
1210
我的数据是这样的:
Name
Segment
Sale
Year
Asset
Another header
A
3401
10000
2000
200000
x
A
3401
20000
2001
250000
x
B
2201
15000
2004
280000
x
B
2201
23000
2009
320000
x
B
2201
28000
2010
390000
x
C
2201
30000
2000
210000
x
C
2201
18000
2004
200000
x
D
1
28000
2000
400000
x
D
1
38000
2001
521000
x
任何人都可以就如何 bootstrap 我的线性回归分析提供一些指导吗?并告诉我回归的输出到底告诉我什么?
编辑:
original_data = Relevante_V03
original_model = lm01
set.seed(123) # fix random number generator for reproducibility
boot_lm <- function(original_data, original_model,
type = c('ordinary', 'param'),
B = 1000L, seed = 123) {
set.seed(seed)
betas_original_model <- coef(original_model)
len_coef <- length(betas_original_model)
mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
if (type %in% 'ordinary') {
n_rows <- length(residuals(original_model))
for (i in seq_len(B)) {
boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
}
}
if (type %in% 'param') {
X <- model.matrix(delete.response(terms(original_model)),
data = original_data)[, -1L]
for (i in seq_len(B)) {
mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
data = original_data))
}
}
confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
pvals <- numeric(len_coef)
for (i in seq_len(len_coef)) {
pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
}
names(pvals) <- names(betas_original_model)
out <- data.frame(estimate = betas_original_model,
'lwr' = confints[, 1], 'upr' = confints[, 2],
p_value = pvals)
return(out)
}
# linear model to be bootstrapped
my_split <- split(Relevante_V03, Relevante_V03$sic & Relevante_V03$fdyear) # split Relevante_03 by sic(Segment) & (fd)year
out <- lapply(my_split, function(x) {
lm(marketingspending ~ intr + sale_py_at_py + R_at_py, data = x) # perform linear regression on each company separately
})
ordinary <- lapply(out, function(x) coef(summary(x))) # obtain summary from linear models
# run bootstrap function on each of the levels of Name (company)
## this may take a while, as we have 50 Names (companies)...
for (i in seq_along(out)) {
ordinary[[i]] <- boot_lm(original_data = my_split[[i]], original_model = out[[i]],
type = 'ordinary', B = 10000) # B is number of bootstrap samples
}
# output
ordinary
输出如下所示:
$TRUE
row.
estimate
lwr
upr
p_value
(Intercept)
15647649
14131807
17268286
0
intr
-64880946
-92974339
-36369417
0
sale_py_at_py
-4520320
-5741252
-3359742
0
R_at_py
-1904298824
-23372044347
-1564292455
0
我对此的问题是:
-你觉得一切都好吗?
-为什么所有 p 值都是 0,我怎样才能看到更详细的 p 值?因为它们不应该是 0
多亏了评论 (@Dion Groothof),我尝试了这种方法。有人可以告诉我我在这种方法中所做的一切是否正确吗?
根据 OP 的要求,这应该是在 他的 特定情况下继续进行的适当方式。
没有必要(我什至反对)更改函数本身的参数。相反,指定适当的字符串和整数作为函数调用中的参数。这应该按如下方式完成。
首先我们指定我们的线性模型。
# linear model
fm0 <- lm(marketingspending ~ intr + inflr + sale_py_at_py+ R_at_py +
+ dt + re + txt , data = Relevante_V03)
然后,我们 运行 函数原样。有关函数参数的更多信息,请参阅 my recent answer.
boot_lm <- function(original_data, original_model,
type = c('ordinary', 'param'),
B = 1000L, seed = 1) {
set.seed(seed)
betas_original_model <- coef(original_model)
len_coef <- length(betas_original_model)
mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
if (type %in% 'ordinary') {
n_rows <- length(residuals(original_model))
for (i in seq_len(B)) {
boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
}
}
if (type %in% 'param') {
X <- model.matrix(delete.response(terms(original_model)),
data = original_data)[, -1L]
for (i in seq_len(B)) {
mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
data = original_data))
}
}
confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
pvals <- numeric(len_coef)
for (i in seq_len(len_coef)) {
pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
}
names(pvals) <- names(betas_original_model)
out <- data.frame(estimate = betas_original_model,
'lwr' = confints[, 1], 'upr' = confints[, 2],
p_value = pvals)
return(out)
}
最后,我们在调用 boot_lm()
时将字符串和整数指定为参数,以使其 tailor-made 用于 您的 特定情况。
# non-parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
type = 'ordinary', B = 1e4, seed = 59385)
# parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
type = 'param', B = 1e4, seed = 59385)
我想执行 bootstrap 线性回归,因为担心正态分布的误差项。我有一个大到足以忽略这一事实的数据集,但我只想仔细检查依赖于分析计算的标准误差的线性回归模型是否产生与从 bootstrapped 线性回归获得的结果相同的结果。
到目前为止,我使用了以下代码:
Rel01 <- subset(
Relevante_Variablen2,
select = c(intr, sale_py_at_py, R_at_py,
inflr, dt, re, txt)
)
x = as.data.frame(sapply(Rel01, as.numeric))
boxplot(x)
library(boot)
i<-nrow(x) #count number of rows for resampling
g<-ncol(x) #count number of columns to step through with bootstrapping
boot.mean<-function(x,i){boot.mean<-mean(x[i])} #bootstrapping function to get the mean
boot_tabl <- apply(x,2,function(y){
b<-boot(y,boot.mean,R=50000);
c(mean(b$t),boot.ci(b,type="perc", conf=0.95)$percent[4:5])
})
View(boot_tabl)
一切似乎都有效(输出在下面显示的 table 中),但我不知道如何解释我的方法的输出。
^ | sale_py_at_py | intr | inflr | dt | re | txt |
---|---|---|---|---|---|---|
1 | 0.0984438 | 0.01223 | 0.04243 | 200 | 400 | 1200 |
2 | 0.0974530 | 0.01191 | 0.04122 | 190 | 300 | 1000 |
3 | 0.0993230 | 0.01291 | 0.04256 | 210 | 405 | 1210 |
我的数据是这样的:
Name | Segment | Sale | Year | Asset | Another header |
---|---|---|---|---|---|
A | 3401 | 10000 | 2000 | 200000 | x |
A | 3401 | 20000 | 2001 | 250000 | x |
B | 2201 | 15000 | 2004 | 280000 | x |
B | 2201 | 23000 | 2009 | 320000 | x |
B | 2201 | 28000 | 2010 | 390000 | x |
C | 2201 | 30000 | 2000 | 210000 | x |
C | 2201 | 18000 | 2004 | 200000 | x |
D | 1 | 28000 | 2000 | 400000 | x |
D | 1 | 38000 | 2001 | 521000 | x |
任何人都可以就如何 bootstrap 我的线性回归分析提供一些指导吗?并告诉我回归的输出到底告诉我什么?
编辑:
original_data = Relevante_V03
original_model = lm01
set.seed(123) # fix random number generator for reproducibility
boot_lm <- function(original_data, original_model,
type = c('ordinary', 'param'),
B = 1000L, seed = 123) {
set.seed(seed)
betas_original_model <- coef(original_model)
len_coef <- length(betas_original_model)
mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
if (type %in% 'ordinary') {
n_rows <- length(residuals(original_model))
for (i in seq_len(B)) {
boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
}
}
if (type %in% 'param') {
X <- model.matrix(delete.response(terms(original_model)),
data = original_data)[, -1L]
for (i in seq_len(B)) {
mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
data = original_data))
}
}
confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
pvals <- numeric(len_coef)
for (i in seq_len(len_coef)) {
pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
}
names(pvals) <- names(betas_original_model)
out <- data.frame(estimate = betas_original_model,
'lwr' = confints[, 1], 'upr' = confints[, 2],
p_value = pvals)
return(out)
}
# linear model to be bootstrapped
my_split <- split(Relevante_V03, Relevante_V03$sic & Relevante_V03$fdyear) # split Relevante_03 by sic(Segment) & (fd)year
out <- lapply(my_split, function(x) {
lm(marketingspending ~ intr + sale_py_at_py + R_at_py, data = x) # perform linear regression on each company separately
})
ordinary <- lapply(out, function(x) coef(summary(x))) # obtain summary from linear models
# run bootstrap function on each of the levels of Name (company)
## this may take a while, as we have 50 Names (companies)...
for (i in seq_along(out)) {
ordinary[[i]] <- boot_lm(original_data = my_split[[i]], original_model = out[[i]],
type = 'ordinary', B = 10000) # B is number of bootstrap samples
}
# output
ordinary
输出如下所示:
$TRUE
row. | estimate | lwr | upr | p_value |
---|---|---|---|---|
(Intercept) | 15647649 | 14131807 | 17268286 | 0 |
intr | -64880946 | -92974339 | -36369417 | 0 |
sale_py_at_py | -4520320 | -5741252 | -3359742 | 0 |
R_at_py | -1904298824 | -23372044347 | -1564292455 | 0 |
我对此的问题是:
-你觉得一切都好吗?
-为什么所有 p 值都是 0,我怎样才能看到更详细的 p 值?因为它们不应该是 0
多亏了评论 (@Dion Groothof),我尝试了这种方法。有人可以告诉我我在这种方法中所做的一切是否正确吗?
根据 OP 的要求,这应该是在 他的 特定情况下继续进行的适当方式。
没有必要(我什至反对)更改函数本身的参数。相反,指定适当的字符串和整数作为函数调用中的参数。这应该按如下方式完成。
首先我们指定我们的线性模型。
# linear model
fm0 <- lm(marketingspending ~ intr + inflr + sale_py_at_py+ R_at_py +
+ dt + re + txt , data = Relevante_V03)
然后,我们 运行 函数原样。有关函数参数的更多信息,请参阅 my recent answer.
boot_lm <- function(original_data, original_model,
type = c('ordinary', 'param'),
B = 1000L, seed = 1) {
set.seed(seed)
betas_original_model <- coef(original_model)
len_coef <- length(betas_original_model)
mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
if (type %in% 'ordinary') {
n_rows <- length(residuals(original_model))
for (i in seq_len(B)) {
boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
}
}
if (type %in% 'param') {
X <- model.matrix(delete.response(terms(original_model)),
data = original_data)[, -1L]
for (i in seq_len(B)) {
mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
data = original_data))
}
}
confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
pvals <- numeric(len_coef)
for (i in seq_len(len_coef)) {
pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
}
names(pvals) <- names(betas_original_model)
out <- data.frame(estimate = betas_original_model,
'lwr' = confints[, 1], 'upr' = confints[, 2],
p_value = pvals)
return(out)
}
最后,我们在调用 boot_lm()
时将字符串和整数指定为参数,以使其 tailor-made 用于 您的 特定情况。
# non-parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
type = 'ordinary', B = 1e4, seed = 59385)
# parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
type = 'param', B = 1e4, seed = 59385)