从 bootstrap 数据的函数生成图

Producing plots from functions that bootstrap data

考虑这个数据框:

set.seed(123)
dat1 <- data.frame(Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200),
                   var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)

位置 Loc 是每个 ID 上测量值 var1:6 的分组变量。有几对 Loc 彼此非常接近(地理上),它们可能应该被视为一个组而不是两个独立的组。因此,我编写了一个函数,它将 bootstrap 每个变量来查看这些组是否似乎来自相同的分布:

library(tidyverse)
BootT <- function(dat, var, gv1, gv2){
  set.seed(123)
  a<- dplyr::filter(dat, Loc == gv1)
  a2 <- dplyr::select(a, var)
  b <- dplyr::filter(dat, Loc == gv2)
  b2 <- dplyr::select(b, var)
  pooled <- rbind(a2, b2)
  boot.t <- c(1:999)
  for(i in 1:999){
    sample.index <- sample(c(1:length(pooled[,1])), replace = TRUE)
    sample.x <- pooled[sample.index,][1:length(a2[,1])]
    sample.y <- pooled[sample.index,][-c(1:length(b2[,1]))]
    boot.t[i] <- t.test(sample.x, sample.y)$statistic
  }
  p.pooled <-  data.frame(p.pooled = 1 + sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic))) / (999+1) 
 return(p.pooled)
  ids <- data.frame(Group1 = paste0(gv1), Group2 = paste0(gv2), Variable = paste0(var))
  p.pooled <- p.pooled%>%
    dplyr::mutate(Group1 = ids[,1], Group2 = ids[,2], Variable = ids[,3])
  p.pooled <- p.pooled[,c(2,3,4,1)]
 return(p.pooled)
}
#compare 2 locs of interest with a single variable
BootT(dat = dat1, var = "var2", gv1 = "a", gv2 = "g") 
#compare all 6 variables 
vars <- names(dat1[,3:8])
results <- list()
for(i in vars){
  res <- BootT(dat = dat1, var = i, gv1 = "a", gv2 = "b")
  results <- rbind(results, res)
} 

我想修改此函数,使其输出一个经典的直方图,显示每个变量与观察值的 bootstrapped 分布,并在图上包含摘要统计信息。我怎样才能修改这个功能来完成这个? 编辑: 最初,我打算使用启动包来执行此操作,这样会更容易,但我不确定我是否理解不同的参数将如何改变采样过程。在两个 Loc 具有相等方差(通过 F 检验评估)的情况下,我想像上面演示的那样对合并样本进行抽样。但是,当样本是异质的时,我想在创建要比较的合并样本之前减去每个组的均值(这迫使原假设为真,并且不假设方差均匀)。有关详细信息,请参阅此 post:https://stats.stackexchange.com/questions/136661/using-bootstrap-under-h0-to-perform-a-test-for-the-difference-of-two-means-repl

实际上我做了一个与上面的函数非常相似的函数(使用另一个非常原始的名称)来处理存在异质方差问题的情况:

BootT2 <- function(dat, var, gv1, gv2){
  set.seed(123)
  a<- dplyr::filter(dat, Loc == gv1)
  a2 <- dplyr::select(a, var)
  b <- dplyr::filter(dat, Loc == gv2)
  b2 <- dplyr::select(b, var)
  pooled <- rbind(a2,b2)
  xt <- a2[,1] - mean(a2[,1]) + mean(pooled[,1])
  yt <- b2[,1] - mean(b2[,1]) + mean(pooled[,1])
  boot.t <- c(1:999)
  for(i in 1:999){
    sample.x <- sample(xt, replace = T)
    sample.y <- sample(yt, replace = T)
    boot.t[i] <- t.test(sample.x, sample.y)$statistic
  }
  p.h0 <- data.frame(p.ho = (1+sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic)) / 999+1)-2)
  #p.h0 <- data.frame(p.ho = sum(abs(boot.t) > abs(t.test(a[,var],b[,var])$statistic)) / 999)
  ids <- data.frame(Group1 = paste0(gv1), Group2 = paste0(gv2), Variable = paste0(var))
  p.h0 <- p.h0%>%
    mutate(Group1 = ids[,1], Group2 = ids[,2], Variable = ids[,3])
  p.h0 <- p.h0[,c(2,3,4,1)]
 return(p.h0)
}
#compare 2 locs of interest with a single variable
BootT2(dat = dat1, var = "var2", gv1 = "a", gv2 = "g") 
#compare all 6 variables 
vars <- names(dat1[,3:8])
results.bootT2 <- list()
for(i in vars){
  res <- BootT2(dat = dat1, var = i, gv1 = "a", gv2 = "b")
  results.bootT2 <- rbind(results.bootT2, res)
} 

如果有人想解释我如何执行这些程序并使用 boot() 包生成绘图,那就太好了。

如果我理解正确,以下将 运行 bootstrapped t 检验数据集 dat1 中变量 var 的 2 Loc .它在函数 bootTstat 中使用 accepted answer to this CrossValidated post bootstrap,但这是从函数 funBoot 中调用的。函数 funBoot 负责对组 gv1gv2 行和列 var 进行子集化。这样形成的数据集被传递给bootTstat.

bootTstat <- function(x, y, R){
  pool <- c(x, y)
  xt <- x - mean(x) + mean(pool)
  yt <- y - mean(y) + mean(pool)
  boot.t <- numeric(R)
  for (i in seq_len(R)){
    sample.x <- sample(xt, replace = TRUE)
    sample.y <- sample(yt, replace = TRUE)
    boot.t[i] <- t.test(sample.x, sample.y)$statistic
  }
  p.h0 <- (1 + sum(abs(boot.t) > abs(t.test(x, y)$statistic))) / (R + 1)  
  list(
    statistic = boot.t,
    p.value = p.h0
  )
}

funBoot <- function(data, R, var, gv1, gv2){
  i <- data[["Loc"]] == gv1
  j <- data[["Loc"]] == gv2
  x <- data[i, var]
  y <- data[j, var]
  bootTstat(x, y, R)
}

对于 "var2" 和组 "a""g" 运行 使用整个组数据和 R = 1000 测试的 t 检验。

首先是 t 检验。

a <- subset(dat1, Loc == 'a', select = 'var2')
g <- subset(dat1, Loc == 'g', select = 'var2')
t.test(a, g)
#
#        Welch Two Sample t-test
#
#data:  a and g
#t = 1.1002, df = 47, p-value = 0.2769
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -0.2585899  0.8828038
#sample estimates:
# mean of x  mean of y 
# 0.1755209 -0.1365860 

以及启动的 t 检验。 R <- 1000 set.seed(123)

b_ag <- funBoot(dat1, R, var = "var2", gv1 = "a", gv2 = "g")
b_ag$p.value
#[1] 0.2737263

此 p 值与之前获得的 p.value = 0.2769 相似。
并且直方图可以很容易地绘制出来。

hist(b_ag$statistic, main = "Bootstrapped t-test")

现在 运行 测试所有变量和组 "a""b"。使用包 ggplot2.

绘图
ttest_list <- lapply(names(dat1)[3:8], function(v) {
  b <- funBoot(data = dat1, R = R, var = v, gv1 = "a", gv2 = "b")
  list(
    p.value = b$p.value,
    test = data.frame(var = v, stat = b$statistic)
  )
})

ttest_df <- lapply(ttest_list, '[[', 'test')
ttest_df <- do.call(rbind, ttest_df)

library(ggplot2)

ggplot(ttest_df, aes(stat)) +
  geom_histogram(bins = 25) +
  facet_wrap(~ var)