项目数量过多时检查项目各种组合的最佳算法

Optimum algorithm to check various combinations of items when number of items is too large

我有一个数据框,其中包含 20 columns/items 和 593 行(虽然行数无关紧要),如下所示:

借助 psych 包中的 psych::alpha,使用此方法获得的测试可靠性为 0.94。如果我删除其中一项,输出还会为我提供 Cronbach's alpha 的新值。但是,我想知道在保持 alpha 至少为 0.8 的前提下,我最多可以删除多少个项目。我使用蛮力方法生成数据框中所有项目的全部组合,并检查它们的 alpha 是否在 (0.7, 0.9) 范围内。有没有更好的方法来做到这一点?因为项目数量太大,检查所有项目组合会永远运行下去。下面是我当前的一段代码:

numberOfItems <- 20
# Enumerate every non-trivial item subset; the bits of i encode which items
# are included. NOTE: the original range `2:(2^numberOfItems)-1` is parsed by
# R as `(2:(2^numberOfItems)) - 1`, i.e. the loop actually started at i = 1
# even though the comment said i = 1 is skipped. The intended range is
# 2:(2^numberOfItems - 1).
for (i in 2:(2^numberOfItems - 1)) {
  # Convert i to a binary vector of length numberOfItems (binaryLogic
  # package), e.g. i = 5 gives ...0 1 0 1.
  combination <- as.binary(i, n = numberOfItems)
  # Columns to use are the positions of the set bits; which() replaces the
  # original element-growing loop.
  model <- which(combination == 1)
  itemsToUse <- itemResponses[, model]
  # Only evaluate reasonably large subsets (more than 13 items).
  if (length(model) > 13) {
    alphaVal <- psych::alpha(itemsToUse)$total$raw_alpha
    if (alphaVal > 0.7 && alphaVal < 0.9) {
      cat(alphaVal)
      print(model)
    }
  }
}

此代码的示例输出如下:

0.8989831 1 4 5 7 8 9 10 11 13 14 15 16 17 19 20

0.899768 1 4 5 7 8 9 10 11 12 13 15 17 18 19 20

0.899937 1 4 5 7 8 9 10 11 12 13 15 16 17 19 20

0.8980605 1 4 5 7 8 9 10 11 12 13 14 15 17 19 20

这是前 10 行数据:

dput(itemResponses) structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2, 3, 1, 0, 0, 1, 1, 1, 0, 1), CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0, 0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0), CESD5 = c(0, 1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0, 0, 0), CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1, 3, 1, 1, 0, 1, 0, 0, 1, 0), CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD11 = c(0, 2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0, 0, 0), CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0, 3, 1, 2, 1, 1, 1, 0, 1, 1), CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1, 1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0), CESD17 = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0, 0, 1), CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0, 3, 0, 1, 0, 0, 0, 0, 0, 0)), .Names = c("CESD1", "CESD2", "CESD3", "CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9", "CESD10", "CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17", "CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))

我按如下方式更改了代码,现在我删除了固定数量的项目并手动将 numberOfItemsToDrop 的值从 1 更改为 20。虽然它稍微好一点,但是 运行 仍然花费太长时间 :(

我希望有更好的方法来做到这一点。

numberOfItemsToDrop <- 13
# combn() returns a matrix with one combination per column: each column lists
# `numberOfItemsToDrop` item indices to remove from the 20 items.
combinations <- combinat::combn(20, numberOfItemsToDrop)
# Iterate over columns directly; ncol() is clearer and safer than dividing
# the total element count by the number of rows.
timesToIterate <- ncol(combinations)
for (i in seq_len(timesToIterate)) {
  model <- combinations[, i]
  # Drop the selected columns and recompute alpha on the remainder.
  itemsToUse <- itemResponses[, -model]
  alphaVal <- psych::alpha(itemsToUse)$total$raw_alpha
  if (alphaVal < 0.82) {
    cat("Cronbach's alpha =", alphaVal, ", number of items dropped = ", length(model), " :: ")
    print(model)
  }
}

这个想法是用所谓的 discrimination 代替经典测试理论 (CTT) 中每个项目的 alpha 计算。区分是项目分数与 "true score"(我们假设为行总和)的相关性。

设数据为

# Toy data: 10 respondents x 20 CESD items (0-3 Likert scores), stored as a
# tibble. This is the dput() output of the first 10 rows of the question's
# itemResponses data frame.
dat <-  structure(list(CESD1 = c(1, 2, 2, 0, 1, 0, 0, 0, 0, 1), CESD2 = c(2, 3, 1, 0, 0, 1, 1, 1, 0, 1), 
                       CESD3 = c(0, 3, 0, 1, 1, 0, 0, 0, 0, 0), CESD4 = c(1, 2, 0, 1, 0, 1, 1, 1, 0, 0), 
                       CESD5 = c(0, 1, 0, 2, 1, 2, 2, 0, 0, 0), CESD6 = c(0, 3, 0, 1, 0, 0, 2, 0, 0, 0), 
                       CESD7 = c(1, 2, 1, 1, 2, 0, 1, 0, 1, 0), CESD8 = c(1, 3, 1, 1, 0, 1, 0, 0, 1, 0), 
                       CESD9 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), CESD10 = c(0, 1, 0, 2, 0, 0, 1, 1, 0, 1), 
                       CESD11 = c(0, 2, 1, 1, 1, 1, 2, 3, 0, 0), CESD12 = c(0, 3, 1, 1, 1, 0, 2, 0, 0, 0), 
                       CESD13 = c(0, 3, 0, 2, 1, 2, 1, 0, 1, 0), CESD14 = c(0, 3, 1, 2, 1, 1, 1, 0, 1, 1), 
                       CESD15 = c(0, 2, 0, 1, 0, 1, 0, 1, 1, 0), CESD16 = c(0, 2, 2, 0, 0, 1, 1, 0, 0, 0), 
                       CESD17 = c(0, 0, 0, 0, 0, 1, 1, 0, 0, 0), CESD18 = c(0, 2, 0, 0, 0, 0, 0, 0, 0, 1), 
                       CESD19 = c(0, 3, 0, 0, 0, 0, 0, 1, 1, 0), CESD20 = c(0, 3, 0, 1, 0, 0, 0, 0, 0, 0)), 
                  .Names = c("CESD1", "CESD2", "CESD3", "CESD4", "CESD5", "CESD6", "CESD7", "CESD8", "CESD9", 
                             "CESD10", "CESD11", "CESD12", "CESD13", "CESD14", "CESD15", "CESD16", "CESD17", 
                             "CESD18", "CESD19", "CESD20"), row.names = c(NA, -10L), 
                  class = c("tbl_df", "tbl", "data.frame"))

我们计算 (1) 辨别力和 (2) α 系数。

# For each item compute (1) its discrimination -- the correlation of the item
# with the sum of the remaining items (rest-score) -- and (2) the Cronbach's
# alpha obtained when the item is dropped. vapply() guarantees a numeric(2)
# result per item, so t() always yields an ncol(dat) x 2 matrix.
stat <- t(vapply(seq_len(ncol(dat)), function(ii) {
  dd <- dat[[ii]]
  # Guard against zero-variance items: the original `if` had no `else`, so a
  # constant column would have returned NULL, producing a ragged result that
  # breaks t() and the dimnames assignment below. Return NA_real_ instead.
  disc <- if (var(dd, na.rm = TRUE) > 0) {
    cor(dd, rowSums(dat[, -ii]), use = "pairwise")
  } else {
    NA_real_
  }
  # Alpha that would be obtained when we skip this item.
  alpha <- psych::alpha(dat[, -ii])$total$raw_alpha
  c(disc, alpha)
}, numeric(2)))
dimnames(stat) <- list(colnames(dat), c("disc", "alpha^I"))
stat <- data.frame(stat)

观察到区分度(计算效率更高)与删除此项时获得的 alpha 成反比。换句话说,当有许多高 "discriminating" 项(相互关联)时,alpha 最高。

# Scatterplot of discrimination vs. leave-one-out alpha: the two are
# (inversely) related, which is what the selection heuristics below exploit.
plot(stat, pch = 19)

利用这些信息,按一定顺序选择要删除的项目,使 alpha 保持在某个基准之上(这里以 .9 为例,因为这份玩具数据得不出更低的分数):

1) 删除尽可能多的项目以保持在基准之上;也就是说,从区分度最低的项目开始。

# Strategy 1: delete as MANY items as possible while staying above the
# benchmark -- start deleting from the LEAST discriminating item upward.
stat <- stat[order(stat$disc), ]
# this[ii] = alpha of the scale after dropping the first ii items in this
# ordering (at least two items must remain for alpha to be defined, hence
# nrow(stat) - 2). vapply() pins the result to numeric(1) per step; the
# original's dangling `alpha <-` assignment was redundant.
this <- vapply(seq_len(nrow(stat) - 2), function(ii) {
  ind <- match(rownames(stat)[1:ii], colnames(dat))
  psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
}, numeric(1))

# Items whose cumulative removal still leaves alpha above the benchmark.
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)

2) 删除尽可能少的项目以保持在基准之上;也就是说,从区分度最高的项目开始。

# Strategy 2: delete as FEW items as possible while staying above the
# benchmark -- start deleting from the MOST discriminating item downward.
stat <- stat[order(stat$disc, decreasing = TRUE), ]
# this[ii] = alpha after dropping the first ii items in this (descending)
# ordering; vapply() replaces sapply() for a type-stable numeric result and
# the redundant `alpha <-` assignment is removed.
this <- vapply(seq_len(nrow(stat) - 2), function(ii) {
  ind <- match(rownames(stat)[1:ii], colnames(dat))
  psych::alpha(dat[, -ind, drop = FALSE])$total$raw_alpha
}, numeric(1))

# Items whose cumulative removal still leaves alpha above the benchmark.
delete_these <- rownames(stat)[which(this > .9)]
psych::alpha(dat[, -match(delete_these, colnames(dat)), drop = FALSE])$total$raw_alpha
length(delete_these)

请注意,方法 1) 与心理/教育测评 (psychological/educational diagnostics/assessments) 中经典的项目筛选 (item selection) 过程一致:从评估中删除区分度低于基准的项目。