在 R 中引导多个变量

Question

我想执行 bootstrap 线性回归，因为担心正态分布的误差项。我有一个大到足以忽略这一事实的数据集，但我只想仔细检查依赖于分析计算的标准误差的线性回归模型是否产生与从 bootstrapped 线性回归获得的结果相同的结果。

到目前为止，我使用了以下代码：

Rel01 <- subset(
  Relevante_Variablen2,
  select = c(intr, sale_py_at_py, R_at_py,
             inflr, dt, re, txt)
)

x = as.data.frame(sapply(Rel01, as.numeric))
boxplot(x)

library(boot)

i<-nrow(x) #count number of rows for resampling 
g<-ncol(x) #count number of columns to step through with bootstrapping
boot.mean<-function(x,i){boot.mean<-mean(x[i])} #bootstrapping function to get the mean

boot_tabl <- apply(x,2,function(y){ 
  b<-boot(y,boot.mean,R=50000); 
  c(mean(b$t),boot.ci(b,type="perc", conf=0.95)$percent[4:5])
})

View(boot_tabl)

一切似乎都有效（输出在下面显示的 table 中），但我不知道如何解释我的方法的输出。

^	sale_py_at_py	intr	inflr	dt	re	txt
1	0.0984438	0.01223	0.04243	200	400	1200
2	0.0974530	0.01191	0.04122	190	300	1000
3	0.0993230	0.01291	0.04256	210	405	1210

我的数据是这样的：

Name	Segment	Sale	Year	Asset	Another header
A	3401	10000	2000	200000	x
A	3401	20000	2001	250000	x
B	2201	15000	2004	280000	x
B	2201	23000	2009	320000	x
B	2201	28000	2010	390000	x
C	2201	30000	2000	210000	x
C	2201	18000	2004	200000	x
D	1	28000	2000	400000	x
D	1	38000	2001	521000	x

任何人都可以就如何 bootstrap 我的线性回归分析提供一些指导吗？并告诉我回归的输出到底告诉我什么？

编辑：

original_data = Relevante_V03
original_model = lm01
set.seed(123) # fix random number generator for reproducibility

boot_lm <- function(original_data, original_model,
                    type = c('ordinary', 'param'),
                    B = 1000L, seed = 123) {
  set.seed(seed)
  betas_original_model <- coef(original_model)
  len_coef <- length(betas_original_model)
  mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
  if (type %in% 'ordinary') {
    n_rows <- length(residuals(original_model))
    for (i in seq_len(B)) {
      boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
      mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
    }
  }
  if (type %in% 'param') {
    X <- model.matrix(delete.response(terms(original_model)),
                      data = original_data)[, -1L]
    for (i in seq_len(B)) {
      mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
                          data = original_data))
    }
  }
  confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
  pvals <- numeric(len_coef)
  for (i in seq_len(len_coef)) {
    pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
    confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
  }
  names(pvals) <- names(betas_original_model)
  out <- data.frame(estimate = betas_original_model,
                    'lwr' = confints[, 1], 'upr' = confints[, 2],
                    p_value = pvals)
  return(out)
}


# linear model to be bootstrapped
my_split <- split(Relevante_V03, Relevante_V03$sic & Relevante_V03$fdyear) # split Relevante_03 by sic(Segment) & (fd)year 
out <- lapply(my_split, function(x) {
  lm(marketingspending ~ intr + sale_py_at_py + R_at_py, data = x) # perform linear regression on each company separately
})
ordinary <- lapply(out, function(x) coef(summary(x))) # obtain summary from linear models

# run bootstrap function on each of the levels of Name (company)
## this may take a while, as we have 50 Names (companies)...
for (i in seq_along(out)) {
  ordinary[[i]] <- boot_lm(original_data = my_split[[i]], original_model = out[[i]],
                           type = 'ordinary', B = 10000) # B is number of bootstrap samples
} 

# output
ordinary

输出如下所示：

$TRUE

row.	estimate	lwr	upr
(Intercept)	15647649	14131807	17268286
intr	-64880946	-92974339	-36369417
sale_py_at_py	-4520320	-5741252	-3359742
R_at_py	-1904298824	-23372044347	-1564292455

我对此的问题是：

-你觉得一切都好吗？

-为什么所有 p 值都是 0，我怎样才能看到更详细的 p 值？因为它们不应该是 0

多亏了评论 (@Dion Groothof)，我尝试了这种方法。有人可以告诉我我在这种方法中所做的一切是否正确吗？

Answer 1

根据 OP 的要求，这应该是在他的特定情况下继续进行的适当方式。

没有必要（我什至反对）更改函数本身的参数。相反，指定适当的字符串和整数作为函数调用中的参数。这应该按如下方式完成。

首先我们指定我们的线性模型。

# linear model
fm0 <- lm(marketingspending ~ intr + inflr + sale_py_at_py+ R_at_py +
            + dt + re + txt , data = Relevante_V03)

然后，我们运行函数原样。有关函数参数的更多信息，请参阅 my recent answer.

boot_lm <- function(original_data, original_model,
                    type = c('ordinary', 'param'),
                    B = 1000L, seed = 1) {
  set.seed(seed)
  betas_original_model <- coef(original_model)
  len_coef <- length(betas_original_model)
  mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
  if (type %in% 'ordinary') {
    n_rows <- length(residuals(original_model))
    for (i in seq_len(B)) {
      boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
      mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
    }
  }
  if (type %in% 'param') {
    X <- model.matrix(delete.response(terms(original_model)),
                      data = original_data)[, -1L]
    for (i in seq_len(B)) {
      mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
                          data = original_data))
    }
  }
  confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
  pvals <- numeric(len_coef)
  for (i in seq_len(len_coef)) {
    pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
    confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
  }
  names(pvals) <- names(betas_original_model)
  out <- data.frame(estimate = betas_original_model,
                    'lwr' = confints[, 1], 'upr' = confints[, 2],
                    p_value = pvals)
  return(out)
}

最后，我们在调用 boot_lm() 时将字符串和整数指定为参数，以使其 tailor-made 用于您的特定情况。

# non-parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
        type = 'ordinary', B = 1e4, seed = 59385)

# parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
        type = 'param', B = 1e4, seed = 59385)

在 R 中引导多个变量

Bootstrapping multiple variables in R

r

linear-regression

resampling