基于另一列中的值对 R 数据框中的行进行矢量化重新编码
Vectorized recoding of rows in R data frame based on value in another column
我有一个由二元回答 (0, 1) 组成的数据集。每行代表一个人,每列代表对一个问题的回答。“completed” 列表示该人完成了多少个问题。例如,如果 completed = 2,则该人只回答了 q_1 和 q_2。我想对每个 “q” 列重新评分:列号大于 “completed” 值的列记为 0,否则使用相应 “q” 列中的值。
# Example data: one 0/1 answer column per question (q_1 .. q_5) plus
# `completed`, the number of questions each respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)
> have
q_1 q_2 q_3 q_4 q_5 completed
1 1 1 0 1 1 2
2 0 1 0 0 0 3
3 1 1 1 0 0 2
4 1 1 1 1 0 4
5 0 0 0 1 1 1
我怎样才能得到这个输出?转换数据集会更容易吗?
> want
q_1 q_2 q_3 q_4 q_5 completed scored_1 scored_2 scored_3 scored_4 scored_5
1 1 1 0 1 1 2 1 1 0 0 0
2 0 1 0 0 0 3 0 1 0 0 0
3 1 1 1 0 0 2 1 1 0 0 0
4 1 1 1 1 0 4 1 1 1 1 0
5 0 0 0 1 1 1 0 0 0 0 0
此代码将得到正确的输出。但是,我的真实数据集非常大,所以我需要能够遍历列。
# Hand-written rescoring: scored_k keeps q_k only when at least k questions
# were completed, and is 0 otherwise.
want <- have %>%
  mutate(
    scored_1 = ifelse(completed >= 1, q_1, 0),
    scored_2 = ifelse(completed >= 2, q_2, 0),
    scored_3 = ifelse(completed >= 3, q_3, 0),
    scored_4 = ifelse(completed >= 4, q_4, 0),
    scored_5 = ifelse(completed >= 5, q_5, 0)
  )
如果您不需要任何其他逻辑,可以直接把 completed 的值写成填充各列的条件。
library(dplyr)
# Rescore each question: scored_k keeps q_k when at least k questions were
# completed and is 0 otherwise. Every column uses the same `completed >= k`
# rule so the pattern extends unchanged to any number of questions.
have %>%
  mutate(
    scored_1 = ifelse(completed >= 1, q_1, 0),
    scored_2 = ifelse(completed >= 2, q_2, 0),
    scored_3 = ifelse(completed >= 3, q_3, 0),
    scored_4 = ifelse(completed >= 4, q_4, 0),
    scored_5 = ifelse(completed >= 5, q_5, 0)
  )
我假设如果“完成”= N,那么这个人实际上写下了最多 N 个问题的答案。对吧?如有不妥请指正
如果是这样的话,我有一个向量化的解决方案:
# Example data: one 0/1 answer column per question (q_1 .. q_5) plus
# `completed`, the number of questions each respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)
#' Zero out the answers to questions a respondent did not complete.
#'
#' @param x A one-row data frame (or list of length-one numerics) holding
#'   the answers and a `completed` element. Positions are compared directly
#'   with `completed`, so the answer columns are expected to come first, in
#'   question order, with `completed` after them.
#' @return A numeric vector the same length as `x`. Every position greater
#'   than `completed` and before the `completed` column is set to 0; all
#'   other values, including `completed` itself, are unchanged.
check <- function(x) {
  # Position of the "completed" column. Not called `stop`, so that
  # base::stop() is not masked inside the function.
  completed_pos <- which(names(x) == "completed")
  n_completed <- x[["completed"]]
  values <- as.numeric(x)
  pos <- seq_along(values)
  values[pos > n_completed & pos < completed_pos] <- 0
  values
}
# Apply check() to each row in turn, overwriting that row's values in place.
# seq_len() makes the loop a no-op for a zero-row data frame, whereas
# 1:nrow(want) would iterate over c(1, 0).
want <- have
for (line in seq_len(nrow(want))) {
  want[line, ] <- check(want[line, ])
}
您也可以在 dplyr 中尝试 mutate(across(...))。通过这种方式,您可以根据需要修改任意数量的列(例如,名称以 q_ 开头的列)。
# Example data: one 0/1 answer column per question (q_1 .. q_5) plus
# `completed`, the number of questions each respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)

library(tidyverse)

# For each q_<k> column, take the question number k from the column name and
# keep the answer only when `completed` is at least k; otherwise use 0.
have %>%
  mutate(
    across(
      starts_with("q_"),
      ~ ifelse(
        completed >= as.numeric(str_remove(cur_column(), "q_")),
        .x,
        0
      ),
      .names = "scored_{.col}"
    )
  ) %>%
  # across() names the new columns scored_q_<k>; strip "q_" to get scored_<k>.
  rename_with(~ str_remove(.x, "q_"), starts_with("scored"))
#> q_1 q_2 q_3 q_4 q_5 completed scored_1 scored_2 scored_3 scored_4 scored_5
#> 1 1 1 0 1 1 2 1 1 0 0 0
#> 2 0 1 0 0 0 3 0 1 0 0 0
#> 3 1 1 1 0 0 2 1 1 0 0 0
#> 4 1 1 1 1 0 4 1 1 1 1 0
#> 5 0 0 0 1 1 1 0 0 0 0 0
由 reprex package (v2.0.0) 于 2021-05-14 创建
使用 col():
# Copy of the answer columns only (names starting with "q_").
h <- have[startsWith(names(have), "q_")]
# col(h) holds each cell's column number. Comparing it with have$completed
# recycles down the rows, so cell [i, j] is zeroed when j > completed[i].
h[col(h) > have$completed] <- 0
# Append the rescored columns as scored_1, scored_2, ... to match the
# requested output; seq_along() is safe even if no column matched.
cbind(have, setNames(h, paste0("scored_", seq_along(h))))
我有一个由二元回答 (0, 1) 组成的数据集。每行代表一个人,每列代表对一个问题的回答。“completed” 列表示该人完成了多少个问题。例如,如果 completed = 2,则该人只回答了 q_1 和 q_2。我想对每个 “q” 列重新评分:列号大于 “completed” 值的列记为 0,否则使用相应 “q” 列中的值。
# Example data: one 0/1 answer column per question (q_1 .. q_5) plus
# `completed`, the number of questions each respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)
> have
q_1 q_2 q_3 q_4 q_5 completed
1 1 1 0 1 1 2
2 0 1 0 0 0 3
3 1 1 1 0 0 2
4 1 1 1 1 0 4
5 0 0 0 1 1 1
我怎样才能得到这个输出?转换数据集会更容易吗?
> want
q_1 q_2 q_3 q_4 q_5 completed scored_1 scored_2 scored_3 scored_4 scored_5
1 1 1 0 1 1 2 1 1 0 0 0
2 0 1 0 0 0 3 0 1 0 0 0
3 1 1 1 0 0 2 1 1 0 0 0
4 1 1 1 1 0 4 1 1 1 1 0
5 0 0 0 1 1 1 0 0 0 0 0
此代码将得到正确的输出。但是,我的真实数据集非常大,所以我需要能够遍历列。
# Hand-written rescoring: scored_k keeps q_k only when at least k questions
# were completed, and is 0 otherwise.
want <- have %>%
  mutate(
    scored_1 = ifelse(completed >= 1, q_1, 0),
    scored_2 = ifelse(completed >= 2, q_2, 0),
    scored_3 = ifelse(completed >= 3, q_3, 0),
    scored_4 = ifelse(completed >= 4, q_4, 0),
    scored_5 = ifelse(completed >= 5, q_5, 0)
  )
如果您不需要任何其他逻辑,可以直接把 completed 的值写成填充各列的条件。
library(dplyr)
# Rescore each question: scored_k keeps q_k when at least k questions were
# completed and is 0 otherwise. Every column uses the same `completed >= k`
# rule so the pattern extends unchanged to any number of questions.
have %>%
  mutate(
    scored_1 = ifelse(completed >= 1, q_1, 0),
    scored_2 = ifelse(completed >= 2, q_2, 0),
    scored_3 = ifelse(completed >= 3, q_3, 0),
    scored_4 = ifelse(completed >= 4, q_4, 0),
    scored_5 = ifelse(completed >= 5, q_5, 0)
  )
我假设如果“完成”= N,那么这个人实际上写下了最多 N 个问题的答案。对吧?如有不妥请指正
如果是这样的话,我有一个向量化的解决方案:
# Example data: one 0/1 answer column per question (q_1 .. q_5) plus
# `completed`, the number of questions each respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)
#' Zero out the answers to questions a respondent did not complete.
#'
#' @param x A one-row data frame (or list of length-one numerics) holding
#'   the answers and a `completed` element. Positions are compared directly
#'   with `completed`, so the answer columns are expected to come first, in
#'   question order, with `completed` after them.
#' @return A numeric vector the same length as `x`. Every position greater
#'   than `completed` and before the `completed` column is set to 0; all
#'   other values, including `completed` itself, are unchanged.
check <- function(x) {
  # Position of the "completed" column. Not called `stop`, so that
  # base::stop() is not masked inside the function.
  completed_pos <- which(names(x) == "completed")
  n_completed <- x[["completed"]]
  values <- as.numeric(x)
  pos <- seq_along(values)
  values[pos > n_completed & pos < completed_pos] <- 0
  values
}
# Apply check() to each row in turn, overwriting that row's values in place.
# seq_len() makes the loop a no-op for a zero-row data frame, whereas
# 1:nrow(want) would iterate over c(1, 0).
want <- have
for (line in seq_len(nrow(want))) {
  want[line, ] <- check(want[line, ])
}
您也可以在 dplyr 中尝试 mutate(across(...))。通过这种方式,您可以根据需要修改任意数量的列(例如,名称以 q_ 开头的列)。
# Example data: one 0/1 answer column per question (q_1 .. q_5) plus
# `completed`, the number of questions each respondent finished.
have <- data.frame(
  q_1 = c(1, 0, 1, 1, 0),
  q_2 = c(1, 1, 1, 1, 0),
  q_3 = c(0, 0, 1, 1, 0),
  q_4 = c(1, 0, 0, 1, 1),
  q_5 = c(1, 0, 0, 0, 1),
  completed = c(2, 3, 2, 4, 1)
)

library(tidyverse)

# For each q_<k> column, take the question number k from the column name and
# keep the answer only when `completed` is at least k; otherwise use 0.
have %>%
  mutate(
    across(
      starts_with("q_"),
      ~ ifelse(
        completed >= as.numeric(str_remove(cur_column(), "q_")),
        .x,
        0
      ),
      .names = "scored_{.col}"
    )
  ) %>%
  # across() names the new columns scored_q_<k>; strip "q_" to get scored_<k>.
  rename_with(~ str_remove(.x, "q_"), starts_with("scored"))
#> q_1 q_2 q_3 q_4 q_5 completed scored_1 scored_2 scored_3 scored_4 scored_5
#> 1 1 1 0 1 1 2 1 1 0 0 0
#> 2 0 1 0 0 0 3 0 1 0 0 0
#> 3 1 1 1 0 0 2 1 1 0 0 0
#> 4 1 1 1 1 0 4 1 1 1 1 0
#> 5 0 0 0 1 1 1 0 0 0 0 0
由 reprex package (v2.0.0) 于 2021-05-14 创建
使用 col():
# Copy of the answer columns only (names starting with "q_").
h <- have[startsWith(names(have), "q_")]
# col(h) holds each cell's column number. Comparing it with have$completed
# recycles down the rows, so cell [i, j] is zeroed when j > completed[i].
h[col(h) > have$completed] <- 0
# Append the rescored columns as scored_1, scored_2, ... to match the
# requested output; seq_along() is safe even if no column matched.
cbind(have, setNames(h, paste0("scored_", seq_along(h))))