基于另一列中的值对 R 数据框中的行进行矢量化重新编码

Vectorized recoding of rows in R data frame based on value in another column

我有一个由二进制回答 (0, 1) 组成的数据集。每行代表一个人,每个 q_ 列代表对一个问题的回答。completed 列表示该人完成了多少个问题。例如,如果 completed = 2,则此人只回答了 q_1 和 q_2。我想对每个 q_ 列重新评分:如果列号大于 completed 的值,则记为 0;否则使用相应 q_ 列中的值。

# Example survey data: one row per respondent, q_1..q_5 hold the binary
# answers and `completed` holds how many questions that respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)

> have
  q_1 q_2 q_3 q_4 q_5 completed
1   1   1   0   1   1         2
2   0   1   0   0   0         3
3   1   1   1   0   0         2
4   1   1   1   1   0         4
5   0   0   0   1   1         1

我怎样才能得到这个输出?转换数据集会更容易吗?

> want
  q_1 q_2 q_3 q_4 q_5 completed scored_1 scored_2 scored_3 scored_4 scored_5
1   1   1   0   1   1         2        1        1        0        0        0
2   0   1   0   0   0         3        0        1        0        0        0
3   1   1   1   0   0         2        1        1        0        0        0
4   1   1   1   1   0         4        1        1        1        1        0
5   0   0   0   1   1         1        0        0        0        0        0

此代码将得到正确的输出。但是,我的真实数据集非常大,所以我需要能够遍历列。

# Hard-coded version: question i keeps its answer only when the respondent
# completed at least i questions; otherwise the score is 0.
want <- mutate(
  have,
  scored_1 = ifelse(completed >= 1, q_1, 0),
  scored_2 = ifelse(completed >= 2, q_2, 0),
  scored_3 = ifelse(completed >= 3, q_3, 0),
  scored_4 = ifelse(completed >= 4, q_4, 0),
  scored_5 = ifelse(completed >= 5, q_5, 0)
)

如果您不需要任何其他逻辑,您可以轻松地将 completed 分数转换为填充列的条件。

library(dplyr)

# Score each question only for respondents who reached it: question i keeps
# its answer when `completed` is at least i, otherwise the score is 0.
# Every threshold uses `>=` so all columns follow the same rule, including
# for unexpected `completed` values (negative, or larger than 5).
have %>%
  mutate(
    scored_1 = ifelse(completed >= 1, q_1, 0),
    scored_2 = ifelse(completed >= 2, q_2, 0),
    scored_3 = ifelse(completed >= 3, q_3, 0),
    scored_4 = ifelse(completed >= 4, q_4, 0),
    scored_5 = ifelse(completed >= 5, q_5, 0)
  )

我假设如果 completed = N,那么这个人实际上写下了最多前 N 个问题的答案,对吧?如有不妥请指正。

如果是这样的话,我有一个解决方案(注意:它对每一行循环调用一个函数,并非严格意义上的向量化;并且它直接覆盖 q_ 列,而不是新增 scored_ 列):

# Example survey data: one row per respondent, q_1..q_5 hold the binary
# answers and `completed` holds how many questions that respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)

# Zero out, within a single row, every answer that lies beyond the number of
# completed questions.
#
# x: a one-row data frame containing the answer columns and a column named
#    "completed".
# Returns a numeric vector with one entry per column of x. Entries whose
# position is greater than the "completed" value and smaller than the position
# of the "completed" column are set to 0; all other entries (including
# "completed" itself) are left as they were.
#
# NOTE(review): the comparison is purely positional, so this assumes the
# i-th column holds question i and that the answer columns come before
# "completed" -- confirm against the real data layout.
check <- function(x) {
    # Position of the "completed" column; only positions before it are answers.
    completed_pos <- which(names(x) == "completed")
    n_completed <- x[["completed"]]
    values <- as.numeric(x)
    positions <- seq_along(values)
    values[positions > n_completed & positions < completed_pos] <- 0
    values
}

# Re-score a copy of the data row by row: each row is replaced by the result
# of check(), so the q_ columns are overwritten in place.
want <- have
# seq_len() makes the loop a no-op for a zero-row data frame, whereas
# 1:nrow(want) would iterate over c(1, 0).
for (line in seq_len(nrow(want))) {
    want[line, ] <- check(want[line, ])
}

您也可以尝试 dplyr 中的 mutate(across(...))。通过这种方式,您可以根据需要修改任意数量的列(例如,名称以 q_ 开头的列)。

# Example survey data: one row per respondent, q_1..q_5 hold the binary
# answers and `completed` holds how many questions that respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)
library(tidyverse)

# For every q_ column, read the question number from the column name and keep
# the answer only when `completed` reaches that number (0 otherwise). The new
# columns are first named scored_q_<n>, then the "q_" part is stripped so they
# end up as scored_<n>.
have %>%
  mutate(
    across(
      starts_with('q_'),
      function(answer) {
        question_no <- as.numeric(str_remove(cur_column(), 'q_'))
        ifelse(completed >= question_no, answer, 0)
      },
      .names = 'scored_{.col}'
    )
  ) %>%
  rename_with(function(nm) str_remove(nm, 'q_'), starts_with('scored'))
#>   q_1 q_2 q_3 q_4 q_5 completed scored_1 scored_2 scored_3 scored_4 scored_5
#> 1   1   1   0   1   1         2        1        1        0        0        0
#> 2   0   1   0   0   0         3        0        1        0        0        0
#> 3   1   1   1   0   0         2        1        1        0        0        0
#> 4   1   1   1   1   0         4        1        1        1        1        0
#> 5   0   0   0   1   1         1        0        0        0        0        0

由 reprex 包 (v2.0.0) 创建于 2021-05-14

使用 col():

# Keep only the answer columns (names starting with "q_").
h <- have[startsWith(names(have), "q_")]
# col(h) holds each cell's column index; have$completed is recycled down the
# rows, so a cell is zeroed when its column index exceeds that row's
# `completed` value.
# NOTE(review): this is positional, so it assumes the q_ columns appear in
# question order -- confirm against the real data layout.
h[col(h) > have$completed] <- 0
# Name the new columns scored_1..scored_n, as in the requested output.
cbind(have, setNames(h, paste0("scored_", seq_len(ncol(h)))))