对向量进行多次随机抽样以分组并进行方差分析

Randomly sample a vector multiple times to make groups and conduct ANOVA

我有 50 个随机生成的数字和我设置的参数。我想将这50个随机数随机抽成10组5个(不放回)

我想将这 10 个组存储为矩阵或数据框(matrix/data frame),并对这些组运行方差分析检验,然后将整个过程重复 1000 次,存储每次迭代的 F 值、F 临界值和 P 值。

我目前有以下代码:

# Asker's attempt: draw 50 values, then try to split them into 10 groups of 5.
samp <- rnorm(50,3.47,0.0189) # 50 samples, mean of 3.47 and SD of 0.0189

# NOTE(review): `x` is reassigned on every pass, so only the last draw of 5
# values is kept. Each sample() call also draws from the full `samp`, so the
# same value can appear in more than one of the 10 draws.
for (i in 1:10){
  x <- sample(samp, 5, replace = F)
}

# Placeholder with no right-hand side: meant to hold all the random samples.
x <- #all my random samples

当数据以列表形式存储、且第二列标识分组时,我通常使用如下方差分析代码:

# Fit a linear model of the response on the grouping factor and compute its
# ANOVA table.
# NOTE(review): the response is written as `Samp_lm`, the same name the fitted
# model is assigned to; presumably a column of `x` was intended. Confirm.
Samp_lm <- lm(Samp_lm ~ factor(group), data = x) 
AnovaResults <- anova(Samp_lm)

# Append the 5% critical F value as an extra column of the ANOVA table.
# NOTE(review): `test.Aov` is not defined anywhere in this snippet; presumably
# `AnovaResults` was meant (its first column holds the degrees of freedom).
criticalValues <- cbind(AnovaResults, 'F Critical Value' = qf(1 - 0.05, test.Aov[1, 1], test.Aov[2, 1]))
# Collect columns 4, 5 and 6 of the first row (for an `anova` table with the
# appended column these would be the F value, the p-value and the critical F).
# NOTE(review): the cbind( call below is missing its closing parenthesis.
AnovaStats <- cbind(criticalValues[1,4],criticalValues[1,5],criticalValues[1,6]

不知道接下来该怎么做。

以下是我使用 dplyr 和 purrr 包重构您的代码的方式。

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)
# Fix the RNG seed so the example is reproducible, then draw the 50
# observations that every iteration will regroup.
set.seed(seed = 345)
samp <- rnorm(n = 50, mean = 3.47, sd = 0.0189)

# Repeatedly split `vec` at random into equal-sized groups and run a one-way
# ANOVA on each split.
#
# Arguments:
#   vec         numeric vector of observations; its length must be a multiple
#               of `n_groups`.
#   n_groups    number of groups to form in each iteration.
#   iterations  number of random splits to analyse (default 1000).
#   alpha       significance level used for the critical F value
#               (default 0.05).
#
# Returns a tibble with one row per iteration and the columns `F value`,
# `Pr(>F)`, `F Critical Value` and `iteration`.
#
# Stops with an error if the observations cannot be divided evenly, or if
# there are fewer than two groups or fewer than two observations per group.
do_sampling <- function(vec, n_groups, iterations = 1000, alpha = 0.05) {
  group_size <- length(vec) / n_groups
  if (group_size %% 1 != 0) stop("group sizes are uneven")
  if (n_groups < 2 || group_size < 2) {
    stop("need at least two groups with at least two observations each")
  }

  # `n_groups` distinct labels, each repeated `group_size` times. Shuffling
  # the labels is equivalent to drawing the groups from `vec` without
  # replacement.
  labels <- rep(seq_len(n_groups), each = group_size)

  # map_dfr() (purrr) row-binds the one-row tibble built in each iteration.
  map_dfr(seq_len(iterations), function(i) {
    data <- tibble(
      samp = vec,
      groups = factor(sample(labels))
    )

    AnovaResults <- anova(lm(samp ~ groups, data = data))
    bind_cols(
      as_tibble(AnovaResults[1, c("F value", "Pr(>F)")]),
      tibble(
        # First column of the ANOVA table is Df: row 1 holds the
        # between-group df, row 2 the residual df.
        `F Critical Value` = qf(
          1 - alpha, AnovaResults[1, 1], AnovaResults[2, 1]
        ),
        iteration = i
      )
    )
  })
}

# Call with n_groups = 10 and the default 1000 iterations.
# NOTE(review): the printed output below shows a critical value of 2.58, which
# equals qf(0.95, 4, 45), i.e. it was produced with 5 groups of 10
# observations. For 10 groups of 5 the critical value is qf(0.95, 9, 40),
# about 2.12.
do_sampling(samp, 10)
#> # A tibble: 1,000 x 4
#>    `F value` `Pr(>F)` `F Critical Value` iteration
#>        <dbl>    <dbl>              <dbl>     <int>
#>  1     0.117  0.976                 2.58         1
#>  2     0.445  0.775                 2.58         2
#>  3     1.12   0.359                 2.58         3
#>  4     0.914  0.464                 2.58         4
#>  5     5.04   0.00192               2.58         5
#>  6     0.964  0.437                 2.58         6
#>  7     1.19   0.327                 2.58         7
#>  8     1.77   0.151                 2.58         8
#>  9     0.399  0.808                 2.58         9
#> 10     0.955  0.441                 2.58        10
#> # … with 990 more rows

由 reprex 包 (v1.0.0) 于 2021-05-11 创建

最后,可以看看 infer 包中关于方差分析(ANOVA)的 vignette,可能对你有帮助。

由于您要重复随机抽样,因此您应该首先制作一个函数来执行您想要的操作:

# Simulate one data set and run a one-way ANOVA on it.
#
# Draws `n_groups * group_size` values from a normal distribution with mean
# `mu` and standard deviation `sigma`, assigns them to `n_groups` equal-sized
# groups and fits `Values ~ Groups`.
#
# Arguments (the defaults reproduce the original hard-coded setup):
#   n_groups    number of groups (default 10).
#   group_size  observations per group (default 5).
#   mu, sigma   mean and standard deviation of the simulated values.
#   alpha       significance level for the critical F value (default 0.05).
#
# Returns a named numeric vector with the elements "F-Value",
# "Critical F-Value" and "P-Value".
SimAnova <- function(n_groups = 10, group_size = 5, mu = 3.47,
                     sigma = 0.0189, alpha = 0.05) {
  # An ANOVA needs at least two groups and a positive residual df.
  stopifnot(n_groups >= 2, group_size >= 2)

  # Integer labels wrapped in factor() so the scheme is not limited to the
  # 26 entries of LETTERS.
  Groups <- factor(rep(seq_len(n_groups), each = group_size))
  Values <- rnorm(n_groups * group_size, mu, sigma)
  AnovaResults <- anova(lm(Values ~ Groups))

  # Local names chosen so they do not mask `F` (shorthand for FALSE) or
  # stats::df().
  f_value <- AnovaResults[1, "F value"]
  dof <- AnovaResults[, "Df"]
  f_crit <- qf(1 - alpha, dof[1], dof[2])
  p_value <- AnovaResults[1, "Pr(>F)"]
  c("F-Value" = f_value, "Critical F-Value" = f_crit, "P-Value" = p_value)
}
# Each call draws a new random sample, so the F and P values differ between
# calls; the critical F value depends only on the degrees of freedom and is
# therefore the same every time.
SimAnova()
#          F-Value Critical F-Value          P-Value 
#        1.7350592        2.1240293        0.1126789 
SimAnova()
#          F-Value Critical F-Value          P-Value 
#       2.04024282       2.12402926       0.05965209 
SimAnova()
#          F-Value Critical F-Value          P-Value 
#        1.635386         2.124029         0.138158 

现在只需重复 1000 次即可:

# Run the simulation 1000 times. vapply() returns a 3 x 1000 matrix (one
# column per run, row names taken from the first result), which is transposed
# so that each row is one run.
result <- t(vapply(seq_len(1000), function(i) SimAnova(), numeric(3)))
head(result)
#        F-Value Critical F-Value   P-Value
# [1,] 0.5659946         2.124029 0.8164247
# [2,] 0.7717596         2.124029 0.6427732
# [3,] 0.8377358         2.124029 0.5862101
# [4,] 1.6284143         2.124029 0.1401280
# [5,] 0.2191311         2.124029 0.9899751
# [6,] 0.2744286         2.124029 0.9780476

请注意,您实际上不需要保存临界 F 值,因为它对于每个样本都是相同的。