R:将处理三个二进制变量的 mutate 调用调整为 n 个二进制变量
R: adapt mutate call from handling three binary variables to n binary variables
我有一个数据框,其中包含 3 个与时间段 1 相关的二进制变量和三个与时间 2 相关的相应变量。
# Example data: one row per user; each item_<k>_time_<t> column is a binary
# (0/1, possibly NA) response for item k at time period t.
# NOTE(review): the printed table that follows shows item_3_time_1 = 1 for
# user "d", while this literal has 0 -- confirm which value is intended.
df <- data.frame(
  user = c("a", "b", "c", "d", "e"),
  item_1_time_1 = c(1, 0, 0, 0, NA),
  item_2_time_1 = c(1, 1, 1, 0, NA),
  item_3_time_1 = c(0, 0, 1, 0, 0),
  item_1_time_2 = c(1, 0, 0, 0, NA),
  item_2_time_2 = c(1, 0, 0, 0, NA),
  item_3_time_2 = c(0, 0, 1, 0, 1)
)
df
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2
1 a 1 1 0 1 1 0
2 b 0 1 0 0 0 0
3 c 0 1 1 0 0 1
4 d 0 0 1 0 0 0
5 e NA NA 0 NA NA 1
我想知道:对于给定的 item,某条观测是否在时间段 1 的取值为 1,而在时间段 2 的取值不为 1。此外,我还想知道某条观测是否存在任何一个 item 在时间段 1 为 1、而在时间段 2 不为 1 的情况。
所以理想的输出看起来像
# Desired result: the input columns plus one item_<k>_check flag per item
# (0 when the item is 1 at time 1 but not at time 2, otherwise 1) and an
# overall item_check flag.
df2 <- data.frame(
  user = c("a", "b", "c", "d", "e"),
  item_1_time_1 = c(1, 0, 0, 0, NA),
  item_2_time_1 = c(1, 1, 1, 0, NA),
  item_3_time_1 = c(0, 0, 1, 1, 0),
  item_1_time_2 = c(1, 0, 0, 0, NA),
  item_2_time_2 = c(1, 0, 0, 0, NA),
  item_3_time_2 = c(0, 0, 1, 0, 1),
  item_1_check = c(1, 1, 1, 1, 1),
  item_2_check = c(1, 0, 0, 1, 1),
  item_3_check = c(1, 1, 1, 0, 1),
  item_check = c(1, 0, 0, 0, 1)
)
df2
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 item_1_check item_2_check item_3_check item_check
1 a 1 1 0 1 1 0 1 1 1 1
2 b 0 1 0 0 0 0 1 0 1 0
3 c 0 1 1 0 0 1 1 0 1 0
4 d 0 0 1 0 0 0 1 1 0 0
5 e NA NA 0 NA NA 1 1 1 1 1
到目前为止我已经试过了
library(tidyverse)
# Hard-coded attempt for exactly three items.
# item_<k>_check is 0 when item k is 1 at time 1 and 0 at time 2, else 1;
# item_check is the row-wise minimum of the three per-item flags.
df2 <- df %>%
  # Treat missing responses as 0. A lambda is used instead of passing the
  # replacement value through across()'s `...`, which dplyr >= 1.1.0
  # deprecates.
  mutate(across(ends_with(c("time_1", "time_2")), ~ replace_na(.x, 0))) %>%
  mutate(
    item_1_check = if_else(item_1_time_1 == 1 & item_1_time_2 == 0, 0, 1),
    item_2_check = if_else(item_2_time_1 == 1 & item_2_time_2 == 0, 0, 1),
    item_3_check = if_else(item_3_time_1 == 1 & item_3_time_2 == 0, 0, 1)
  ) %>%
  mutate(item_check = pmin(item_1_check, item_2_check, item_3_check))
我想把上述 mutate 调用推广到可以处理 n 个 item,而不仅仅是 3 个。有没有办法在最后一个 mutate 中使用 ends_with('check')?各变量名的格式保持不变,只有 item 编号和时间段不同。
一个选项是重塑为 'long' 格式并执行一次
library(dplyr)
library(tidyr)
# Works for any number of items: reshape to one row per (user, item) with
# columns time_1 and time_2, then collapse back to one flag per user.
df %>%
  pivot_longer(
    cols = -user,
    names_to = c("group", ".value"),
    # Split each name just before "time": "item_1_time_1" becomes
    # group "item_1" and value column "time_1".
    names_sep = "_(?=time)"
  ) %>%
  # Treat missing responses as 0 (lambda form; passing arguments through
  # across()'s `...` is deprecated in dplyr >= 1.1.0).
  mutate(across(starts_with("time"), ~ replace_na(.x, 0))) %>%
  group_by(user) %>%
  # Per item the flag is FALSE when the item is 1 at time 1 but 0 at time 2;
  # min() over a user's items gives 1 only if every item passes.
  summarise(check = min(!(time_1 & !time_2)), .groups = "drop") %>%
  # Explicit key avoids the "Joining with `by = ...`" message.
  right_join(df, ., by = "user") %>%
  select(all_of(names(df)), check)
# user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 check
#1 a 1 1 0 1 1 0 1
#2 b 0 1 0 0 0 0 0
#3 c 0 1 1 0 0 1 0
#4 d 0 0 0 0 0 0 1
#5 e NA NA 0 NA NA 1 1
或使用base R
# Base R equivalent: check is 1 when no item is 1 at time 1 and 0 at time 2.
# Kept as a single expression so no helper variables are left behind.
df$check <- +Reduce(
  `&`,
  lapply(
    split.default(
      # All columns except the first (`user`), with NA treated as 0.
      replace(df[-1], is.na(df[-1]), 0),
      # Group columns by item: stripping "time_<digits>" leaves "item_<k>_".
      # The backslash must be doubled inside an R string literal; "\d" on
      # its own is an unrecognized escape and fails to parse.
      sub("time_\\d+", "", names(df)[-1])
    ),
    # Within each item group the first column is compared as time 1 and the
    # second as time 2 (relies on the column order of df).
    function(x) !(x[[1]] & !x[[2]])
  )
)
我有一个数据框,其中包含 3 个与时间段 1 相关的二进制变量和三个与时间 2 相关的相应变量。
# Example data: one row per user; each item_<k>_time_<t> column is a binary
# (0/1, possibly NA) response for item k at time period t.
# NOTE(review): the printed table that follows shows item_3_time_1 = 1 for
# user "d", while this literal has 0 -- confirm which value is intended.
df <- data.frame(
  user = c("a", "b", "c", "d", "e"),
  item_1_time_1 = c(1, 0, 0, 0, NA),
  item_2_time_1 = c(1, 1, 1, 0, NA),
  item_3_time_1 = c(0, 0, 1, 0, 0),
  item_1_time_2 = c(1, 0, 0, 0, NA),
  item_2_time_2 = c(1, 0, 0, 0, NA),
  item_3_time_2 = c(0, 0, 1, 0, 1)
)
df
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2
1 a 1 1 0 1 1 0
2 b 0 1 0 0 0 0
3 c 0 1 1 0 0 1
4 d 0 0 1 0 0 0
5 e NA NA 0 NA NA 1
我想知道:对于给定的 item,某条观测是否在时间段 1 的取值为 1,而在时间段 2 的取值不为 1。此外,我还想知道某条观测是否存在任何一个 item 在时间段 1 为 1、而在时间段 2 不为 1 的情况。
所以理想的输出看起来像
# Desired result: the input columns plus one item_<k>_check flag per item
# (0 when the item is 1 at time 1 but not at time 2, otherwise 1) and an
# overall item_check flag.
df2 <- data.frame(
  user = c("a", "b", "c", "d", "e"),
  item_1_time_1 = c(1, 0, 0, 0, NA),
  item_2_time_1 = c(1, 1, 1, 0, NA),
  item_3_time_1 = c(0, 0, 1, 1, 0),
  item_1_time_2 = c(1, 0, 0, 0, NA),
  item_2_time_2 = c(1, 0, 0, 0, NA),
  item_3_time_2 = c(0, 0, 1, 0, 1),
  item_1_check = c(1, 1, 1, 1, 1),
  item_2_check = c(1, 0, 0, 1, 1),
  item_3_check = c(1, 1, 1, 0, 1),
  item_check = c(1, 0, 0, 0, 1)
)
df2
user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 item_1_check item_2_check item_3_check item_check
1 a 1 1 0 1 1 0 1 1 1 1
2 b 0 1 0 0 0 0 1 0 1 0
3 c 0 1 1 0 0 1 1 0 1 0
4 d 0 0 1 0 0 0 1 1 0 0
5 e NA NA 0 NA NA 1 1 1 1 1
到目前为止我已经试过了
library(tidyverse)
# Hard-coded attempt for exactly three items.
# item_<k>_check is 0 when item k is 1 at time 1 and 0 at time 2, else 1;
# item_check is the row-wise minimum of the three per-item flags.
df2 <- df %>%
  # Treat missing responses as 0. A lambda is used instead of passing the
  # replacement value through across()'s `...`, which dplyr >= 1.1.0
  # deprecates.
  mutate(across(ends_with(c("time_1", "time_2")), ~ replace_na(.x, 0))) %>%
  mutate(
    item_1_check = if_else(item_1_time_1 == 1 & item_1_time_2 == 0, 0, 1),
    item_2_check = if_else(item_2_time_1 == 1 & item_2_time_2 == 0, 0, 1),
    item_3_check = if_else(item_3_time_1 == 1 & item_3_time_2 == 0, 0, 1)
  ) %>%
  mutate(item_check = pmin(item_1_check, item_2_check, item_3_check))
我想把上述 mutate 调用推广到可以处理 n 个 item,而不仅仅是 3 个。有没有办法在最后一个 mutate 中使用 ends_with('check')?各变量名的格式保持不变,只有 item 编号和时间段不同。
一个选项是重塑为 'long' 格式并执行一次
library(dplyr)
library(tidyr)
# Works for any number of items: reshape to one row per (user, item) with
# columns time_1 and time_2, then collapse back to one flag per user.
df %>%
  pivot_longer(
    cols = -user,
    names_to = c("group", ".value"),
    # Split each name just before "time": "item_1_time_1" becomes
    # group "item_1" and value column "time_1".
    names_sep = "_(?=time)"
  ) %>%
  # Treat missing responses as 0 (lambda form; passing arguments through
  # across()'s `...` is deprecated in dplyr >= 1.1.0).
  mutate(across(starts_with("time"), ~ replace_na(.x, 0))) %>%
  group_by(user) %>%
  # Per item the flag is FALSE when the item is 1 at time 1 but 0 at time 2;
  # min() over a user's items gives 1 only if every item passes.
  summarise(check = min(!(time_1 & !time_2)), .groups = "drop") %>%
  # Explicit key avoids the "Joining with `by = ...`" message.
  right_join(df, ., by = "user") %>%
  select(all_of(names(df)), check)
# user item_1_time_1 item_2_time_1 item_3_time_1 item_1_time_2 item_2_time_2 item_3_time_2 check
#1 a 1 1 0 1 1 0 1
#2 b 0 1 0 0 0 0 0
#3 c 0 1 1 0 0 1 0
#4 d 0 0 0 0 0 0 1
#5 e NA NA 0 NA NA 1 1
或使用base R
# Base R equivalent: check is 1 when no item is 1 at time 1 and 0 at time 2.
# Kept as a single expression so no helper variables are left behind.
df$check <- +Reduce(
  `&`,
  lapply(
    split.default(
      # All columns except the first (`user`), with NA treated as 0.
      replace(df[-1], is.na(df[-1]), 0),
      # Group columns by item: stripping "time_<digits>" leaves "item_<k>_".
      # The backslash must be doubled inside an R string literal; "\d" on
      # its own is an unrecognized escape and fails to parse.
      sub("time_\\d+", "", names(df)[-1])
    ),
    # Within each item group the first column is compared as time 1 and the
    # second as time 2 (relies on the column order of df).
    function(x) !(x[[1]] & !x[[2]])
  )
)