在 R 中引导多个变量

Bootstrapping multiple variables in R

我想执行 bootstrap 线性回归,因为担心正态分布的误差项。我有一个大到足以忽略这一事实的数据集,但我只想仔细检查依赖于分析计算的标准误差的线性回归模型是否产生与从 bootstrapped 线性回归获得的结果相同的结果。

到目前为止,我使用了以下代码:

Rel01 <- subset(
  Relevante_Variablen2,
  select = c(intr, sale_py_at_py, R_at_py,
             inflr, dt, re, txt)
)

x = as.data.frame(sapply(Rel01, as.numeric))
boxplot(x)

library(boot)

i<-nrow(x) #count number of rows for resampling 
g<-ncol(x) #count number of columns to step through with bootstrapping
boot.mean<-function(x,i){boot.mean<-mean(x[i])} #bootstrapping function to get the mean

boot_tabl <- apply(x,2,function(y){ 
  b<-boot(y,boot.mean,R=50000); 
  c(mean(b$t),boot.ci(b,type="perc", conf=0.95)$percent[4:5])
})

View(boot_tabl) 

一切似乎都有效(输出在下面显示的 table 中),但我不知道如何解释我的方法的输出。

^ sale_py_at_py intr inflr dt re txt
1 0.0984438 0.01223 0.04243 200 400 1200
2 0.0974530 0.01191 0.04122 190 300 1000
3 0.0993230 0.01291 0.04256 210 405 1210

我的数据是这样的:

Name Segment Sale Year Asset Another header
A 3401 10000 2000 200000 x
A 3401 20000 2001 250000 x
B 2201 15000 2004 280000 x
B 2201 23000 2009 320000 x
B 2201 28000 2010 390000 x
C 2201 30000 2000 210000 x
C 2201 18000 2004 200000 x
D 1 28000 2000 400000 x
D 1 38000 2001 521000 x

任何人都可以就如何 bootstrap 我的线性回归分析提供一些指导吗?并告诉我回归的输出到底告诉我什么?

编辑:

original_data = Relevante_V03
original_model = lm01
set.seed(123) # fix random number generator for reproducibility

boot_lm <- function(original_data, original_model,
                    type = c('ordinary', 'param'),
                    B = 1000L, seed = 123) {
  set.seed(seed)
  betas_original_model <- coef(original_model)
  len_coef <- length(betas_original_model)
  mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
  if (type %in% 'ordinary') {
    n_rows <- length(residuals(original_model))
    for (i in seq_len(B)) {
      boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
      mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
    }
  }
  if (type %in% 'param') {
    X <- model.matrix(delete.response(terms(original_model)),
                      data = original_data)[, -1L]
    for (i in seq_len(B)) {
      mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
                          data = original_data))
    }
  }
  confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
  pvals <- numeric(len_coef)
  for (i in seq_len(len_coef)) {
    pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
    confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
  }
  names(pvals) <- names(betas_original_model)
  out <- data.frame(estimate = betas_original_model,
                    'lwr' = confints[, 1], 'upr' = confints[, 2],
                    p_value = pvals)
  return(out)
}


# linear model to be bootstrapped
my_split <- split(Relevante_V03, Relevante_V03$sic & Relevante_V03$fdyear) # split Relevante_03 by sic(Segment) & (fd)year 
out <- lapply(my_split, function(x) {
  lm(marketingspending ~ intr + sale_py_at_py + R_at_py, data = x) # perform linear regression on each company separately
})
ordinary <- lapply(out, function(x) coef(summary(x))) # obtain summary from linear models

# run bootstrap function on each of the levels of Name (company)
## this may take a while, as we have 50 Names (companies)...
for (i in seq_along(out)) {
  ordinary[[i]] <- boot_lm(original_data = my_split[[i]], original_model = out[[i]],
                           type = 'ordinary', B = 10000) # B is number of bootstrap samples
} 

# output
ordinary

输出如下所示:

$TRUE

row. estimate lwr upr p_value
(Intercept) 15647649 14131807 17268286 0
intr -64880946 -92974339 -36369417 0
sale_py_at_py -4520320 -5741252 -3359742 0
R_at_py -1904298824 -23372044347 -1564292455 0

我对此的问题是:

-你觉得一切都好吗?

-为什么所有 p 值都是 0,我怎样才能看到更详细的 p 值?因为它们不应该是 0

多亏了评论 (@Dion Groothof),我尝试了这种方法。有人可以告诉我我在这种方法中所做的一切是否正确吗?

根据 OP 的要求,这应该是在 他的 特定情况下继续进行的适当方式。

没有必要(我什至反对)更改函数本身的参数。相反,指定适当的字符串和整数作为函数调用中的参数。这应该按如下方式完成。

首先我们指定我们的线性模型。

# linear model
fm0 <- lm(marketingspending ~ intr + inflr + sale_py_at_py+ R_at_py +
            + dt + re + txt , data = Relevante_V03)

然后,我们 运行 函数原样。有关函数参数的更多信息,请参阅 my recent answer.

boot_lm <- function(original_data, original_model,
                    type = c('ordinary', 'param'),
                    B = 1000L, seed = 1) {
  set.seed(seed)
  betas_original_model <- coef(original_model)
  len_coef <- length(betas_original_model)
  mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
  if (type %in% 'ordinary') {
    n_rows <- length(residuals(original_model))
    for (i in seq_len(B)) {
      boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
      mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
    }
  }
  if (type %in% 'param') {
    X <- model.matrix(delete.response(terms(original_model)),
                      data = original_data)[, -1L]
    for (i in seq_len(B)) {
      mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
                          data = original_data))
    }
  }
  confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
  pvals <- numeric(len_coef)
  for (i in seq_len(len_coef)) {
    pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
    confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
  }
  names(pvals) <- names(betas_original_model)
  out <- data.frame(estimate = betas_original_model,
                    'lwr' = confints[, 1], 'upr' = confints[, 2],
                    p_value = pvals)
  return(out)
}

最后,我们在调用 boot_lm() 时将字符串和整数指定为参数,以使其 tailor-made 用于 您的 特定情况。

# non-parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
        type = 'ordinary', B = 1e4, seed = 59385)

# parametric bootstrap
boot_lm(original_data = Relevante_V03, original_model = fm0,
        type = 'param', B = 1e4, seed = 59385)