R - 根据数据框中的条件按组设置值

R - Set values by group based on a condition in a dataframe

我有以下数据集。

group value row_name overlaps
group_a 4 1 2
group_a 5 2 3, 5
group_a 48 3 4, 5
group_a 54 4 5
group_a 12 5
group_b 12 6 7
group_b 1 7

重叠列表示哪些行具有特定的 'overlap'。

我想,仅针对 value 大于 10 的行,将其 overlaps 列中指示的所有行的 value 列数据替换为零。

预期输出:

group value row_name overlaps
group_a 4 1 2
group_a 5 2 3, 5
group_a 48 3 4, 5
group_a 0 4 5
group_a 0 5
group_b 12 6 7
group_b 0 7

可重现的例子:

# Reproducible example: seven rows in two groups. `overlaps` holds the
# comma-separated row_name values a row overlaps with ("" when there are none).
data <- data.frame(
  group = rep(c("group_a", "group_b"), times = c(5, 2)),
  value = c(4, 5, 48, 54, 12, 12, 1),
  row_name = as.character(1:7),
  overlaps = c("2", "3, 5", "4, 5", "5", "", "7", "")
)

不知道是不是很复杂的问题,卡了好几个小时也没弄明白怎么解决。

有人对我如何使用 dplyr 或 data.table 解决这个问题有任何建议吗?

对 overlaps 列执行 strsplit,只保留 data$value > 10 的那些行,然后用得到的不重复 row_name 集合将原始数据中对应行的值覆盖为 0:

# Zero out `value` for every row listed in the `overlaps` of a row whose value
# exceeds 10. Rows are matched on a "group|row_name" key so that identical
# row_names in different groups are not confused.
gr10 <- data$value > 10
# The separator regex must be written ",\\s+": a single backslash ("\s") is
# not a valid escape in an R string literal and the code would fail to parse.
sel <- Map(
  paste,
  data$group[gr10],
  strsplit(data$overlaps, ",\\s+")[gr10],
  sep = "|"
)
sel <- Reduce(union, sel)
sel
# "group_a|" below comes from a row with value > 10 but empty `overlaps`
# (paste() treats the zero-length split result as ""); it matches no row.
#[1] "group_a|4" "group_a|5" "group_a|"  "group_b|7"
data$value[do.call(paste, c(data[c("group", "row_name")], sep = "|")) %in% sel] <- 0
data
#    group value row_name overlaps
#1 group_a     4        1        2
#2 group_a     5        2     3, 5
#3 group_a    48        3     4, 5
#4 group_a     0        4        5
#5 group_a     0        5         
#6 group_b    12        6        7
#7 group_b     0        7         

如果 row_name 在整个数据集中是唯一的,您可以使用更简单的逻辑:

# Simpler variant, valid only when row_name is unique across the whole data
# set: gather the row_names listed in `overlaps` of rows with value > 10 and
# zero those rows directly.
# The separator regex must be written ",\\s+": a single backslash ("\s") is
# not a valid escape in an R string literal.
sel <- Reduce(union, strsplit(data$overlaps, ",\\s+")[data$value > 10])
sel
#[1] "4" "5" "7"
data$value[data$row_name %in% sel] <- 0

附赠一个 data.table 解决方案:

library(data.table)
setDT(data)

# Update join: the inner query lists, per group, every row_name mentioned in
# the `overlaps` of rows with value > 10; rows of `data` matching on
# (group, row_name) have `value` set to 0 in place via `:=`.
# The separator regex must be written ",\\s+": a single backslash ("\s") is
# not a valid escape in an R string literal.
data[
  data[
    value > 10,
    .(row_name = unlist(strsplit(overlaps, ",\\s+"))),
    by = group
  ],
  on = .(group, row_name),
  value := 0
]

另一种解决方案,基于tidyverse

library(tidyverse)

data <- data.frame(group = c("group_a", "group_a", "group_a", "group_a",
                             "group_a", "group_b", "group_b"),
                   value = c(4, 5 , 48, 54, 12, 12, 1),
                   row_name = c("1", "2", "3", "4", "5", "6", "7"),
                   overlaps = c("2", "3, 5", "4, 5", "5", "", "7", ""))

# Rows to zero out: every (group, row_name) pair listed in the `overlaps` of a
# row whose value exceeds 10. Splitting into rows (instead of a fixed pair of
# columns) supports any number of overlaps per row.
targets <- data %>%
  filter(value > 10) %>%
  separate_rows(overlaps, sep = ",\\s*") %>%
  filter(overlaps != "") %>%          # rows without overlaps contribute nothing
  mutate(row_name = overlaps) %>%
  distinct(group, row_name) %>%       # distinct keys keep the join 1:1
  mutate(hit = TRUE)

# Match on group and row_name (not on row position), then set flagged rows
# to 0. `data` itself is left unmodified, as in a plain pipeline.
result <- data %>%
  left_join(targets, by = c("group", "row_name")) %>%
  mutate(value = if_else(coalesce(hit, FALSE), 0, value)) %>%
  select(-hit)

result

#>     group value row_name overlaps
#> 1 group_a     4        1        2
#> 2 group_a     5        2     3, 5
#> 3 group_a    48        3     4, 5
#> 4 group_a     0        4        5
#> 5 group_a     0        5         
#> 6 group_b    12        6        7
#> 7 group_b     0        7