R - 根据数据框中的条件按组设置值
R - Set values by group based on a condition in a dataframe
我有以下数据集。
group | value | row_name | overlaps |
---|---|---|---|
group_a | 4 | 1 | 2 |
group_a | 5 | 2 | 3, 5 |
group_a | 48 | 3 | 4, 5 |
group_a | 54 | 4 | 5 |
group_a | 12 | 5 | |
group_b | 12 | 6 | 7 |
group_b | 1 | 7 | |
overlaps 列表示每一行与哪些行(以 row_name 标识)存在重叠('overlap')。
我想,仅针对 value 大于 10 的行,将其 overlaps 列中指示的所有行的 value 列数据替换为零。
预期输出:
group | value | row_name | overlaps |
---|---|---|---|
group_a | 4 | 1 | 2 |
group_a | 5 | 2 | 3, 5 |
group_a | 48 | 3 | 4, 5 |
group_a | 0 | 4 | 5 |
group_a | 0 | 5 | |
group_b | 12 | 6 | 7 |
group_b | 0 | 7 | |
可重现的例子:
# Reproducible example: seven rows in two groups. `overlaps` holds a
# comma-separated list of row names (empty string when there is none).
data <- data.frame(
  group = rep(c("group_a", "group_b"), times = c(5, 2)),
  value = c(4, 5, 48, 54, 12, 12, 1),
  row_name = as.character(1:7),
  overlaps = c("2", "3, 5", "4, 5", "5", "", "7", "")
)
不知道是不是很复杂的问题,卡了好几个小时也没弄明白怎么解决。
有人对我如何使用 dplyr 或 data.table 解决这个问题有任何建议吗?
对 overlaps 列执行 strsplit,仅保留 data$value > 10 的那些行,然后用得到的这组不重复的 row_name 将原始数据中对应的 value 覆盖为 0:
# Rows whose own value exceeds 10 decide which other rows get zeroed.
gr10 <- data$value > 10
# Split every overlaps string into individual row names, keep only the
# entries of the rows flagged above, and prefix each with its group so the
# match below is done per group on "group|row_name" keys.
# The backslash must be doubled inside an R string literal: ",\s+" is a
# parse error ('\s' is an unrecognized escape).
sel <- Map(
  paste,
  data$group[gr10],
  strsplit(data$overlaps, ",\\s+")[gr10],
  sep = "|"
)
# Collapse the per-row key vectors into one set of distinct keys.
sel <- Reduce(union, sel)
sel
#[1] "group_a|4" "group_a|5" "group_a|" "group_b|7"
# Build the same "group|row_name" key for every row and zero the matches.
data$value[do.call(paste, c(data[c("group", "row_name")], sep = "|")) %in% sel] <- 0
data
# group value row_name overlaps
#1 group_a 4 1 2
#2 group_a 5 2 3, 5
#3 group_a 48 3 4, 5
#4 group_a 0 4 5
#5 group_a 0 5
#6 group_b 12 6 7
#7 group_b 0 7
如果 row_name
在整个数据集中是唯一的,您可以使用更简单的逻辑:
# Distinct row names listed in `overlaps` by the rows with value > 10.
# The regex backslash is doubled because "\s" is not a valid escape in an
# R string literal and would fail to parse.
sel <- Reduce(union, strsplit(data$overlaps, ",\\s+")[data$value > 10])
sel
#[1] "4" "5" "7"
# Groups are ignored here, so this is only valid when row_name is unique
# across the whole data set.
data$value[data$row_name %in% sel] <- 0
附赠 data.table 解决方案:
library(data.table)
# Convert the data.frame to a data.table in place.
setDT(data)
# Inner query: for rows with value > 10, expand `overlaps` into one row_name
# per row, keeping the group. The regex backslash is doubled because "\s" is
# not a valid escape in an R string literal.
# Outer call: join those (group, row_name) keys back onto `data` and set
# `value` to 0 for the matching rows.
data[
  data[value > 10, .(row_name = unlist(strsplit(overlaps, ",\\s+"))), by = group],
  on = .(group, row_name),
  value := 0
]
另一种解决方案,基于 tidyverse:
library(tidyverse)

data <- data.frame(
  group = c("group_a", "group_a", "group_a", "group_a",
            "group_a", "group_b", "group_b"),
  value = c(4, 5, 48, 54, 12, 12, 1),
  row_name = c("1", "2", "3", "4", "5", "6", "7"),
  overlaps = c("2", "3, 5", "4, 5", "5", "", "7", "")
)

# One (group, row_name) key per row that has to be zeroed: take the rows with
# value > 10 and expand their `overlaps` into one row name per row. This
# handles any number of overlaps per row and matches on `row_name` within the
# group rather than on row position.
to_zero <- data %>%
  filter(value > 10) %>%
  select(group, row_name = overlaps) %>%
  separate_rows(row_name, sep = ",\\s*") %>%
  filter(row_name != "") %>% # rows without overlaps contribute nothing
  distinct() %>%             # avoid duplicating rows in the join below
  mutate(zero = TRUE)

# Flag the matching rows through a join, zero their value, drop the flag.
result <- data %>%
  left_join(to_zero, by = c("group", "row_name")) %>%
  mutate(value = if_else(is.na(zero), value, 0)) %>%
  select(-zero)
result
#> group value row_name overlaps
#> 1 group_a 4 1 2
#> 2 group_a 5 2 3, 5
#> 3 group_a 48 3 4, 5
#> 4 group_a 0 4 5
#> 5 group_a 0 5
#> 6 group_b 12 6 7
#> 7 group_b 0 7
我有以下数据集。
group | value | row_name | overlaps |
---|---|---|---|
group_a | 4 | 1 | 2 |
group_a | 5 | 2 | 3, 5 |
group_a | 48 | 3 | 4, 5 |
group_a | 54 | 4 | 5 |
group_a | 12 | 5 | |
group_b | 12 | 6 | 7 |
group_b | 1 | 7 | |
overlaps 列表示每一行与哪些行(以 row_name 标识)存在重叠('overlap')。
我想,仅针对 value 大于 10 的行,将其 overlaps 列中指示的所有行的 value 列数据替换为零。
预期输出:
group | value | row_name | overlaps |
---|---|---|---|
group_a | 4 | 1 | 2 |
group_a | 5 | 2 | 3, 5 |
group_a | 48 | 3 | 4, 5 |
group_a | 0 | 4 | 5 |
group_a | 0 | 5 | |
group_b | 12 | 6 | 7 |
group_b | 0 | 7 | |
可重现的例子:
# Reproducible example: seven rows in two groups. `overlaps` holds a
# comma-separated list of row names (empty string when there is none).
data <- data.frame(
  group = rep(c("group_a", "group_b"), times = c(5, 2)),
  value = c(4, 5, 48, 54, 12, 12, 1),
  row_name = as.character(1:7),
  overlaps = c("2", "3, 5", "4, 5", "5", "", "7", "")
)
不知道是不是很复杂的问题,卡了好几个小时也没弄明白怎么解决。
有人对我如何使用 dplyr 或 data.table 解决这个问题有任何建议吗?
对 overlaps 列执行 strsplit,仅保留 data$value > 10 的那些行,然后用得到的这组不重复的 row_name 将原始数据中对应的 value 覆盖为 0:
# Rows whose own value exceeds 10 decide which other rows get zeroed.
gr10 <- data$value > 10
# Split every overlaps string into individual row names, keep only the
# entries of the rows flagged above, and prefix each with its group so the
# match below is done per group on "group|row_name" keys.
# The backslash must be doubled inside an R string literal: ",\s+" is a
# parse error ('\s' is an unrecognized escape).
sel <- Map(
  paste,
  data$group[gr10],
  strsplit(data$overlaps, ",\\s+")[gr10],
  sep = "|"
)
# Collapse the per-row key vectors into one set of distinct keys.
sel <- Reduce(union, sel)
sel
#[1] "group_a|4" "group_a|5" "group_a|" "group_b|7"
# Build the same "group|row_name" key for every row and zero the matches.
data$value[do.call(paste, c(data[c("group", "row_name")], sep = "|")) %in% sel] <- 0
data
# group value row_name overlaps
#1 group_a 4 1 2
#2 group_a 5 2 3, 5
#3 group_a 48 3 4, 5
#4 group_a 0 4 5
#5 group_a 0 5
#6 group_b 12 6 7
#7 group_b 0 7
如果 row_name
在整个数据集中是唯一的,您可以使用更简单的逻辑:
# Distinct row names listed in `overlaps` by the rows with value > 10.
# The regex backslash is doubled because "\s" is not a valid escape in an
# R string literal and would fail to parse.
sel <- Reduce(union, strsplit(data$overlaps, ",\\s+")[data$value > 10])
sel
#[1] "4" "5" "7"
# Groups are ignored here, so this is only valid when row_name is unique
# across the whole data set.
data$value[data$row_name %in% sel] <- 0
附赠 data.table 解决方案:
library(data.table)
# Convert the data.frame to a data.table in place.
setDT(data)
# Inner query: for rows with value > 10, expand `overlaps` into one row_name
# per row, keeping the group. The regex backslash is doubled because "\s" is
# not a valid escape in an R string literal.
# Outer call: join those (group, row_name) keys back onto `data` and set
# `value` to 0 for the matching rows.
data[
  data[value > 10, .(row_name = unlist(strsplit(overlaps, ",\\s+"))), by = group],
  on = .(group, row_name),
  value := 0
]
另一种解决方案,基于 tidyverse:
library(tidyverse)

data <- data.frame(
  group = c("group_a", "group_a", "group_a", "group_a",
            "group_a", "group_b", "group_b"),
  value = c(4, 5, 48, 54, 12, 12, 1),
  row_name = c("1", "2", "3", "4", "5", "6", "7"),
  overlaps = c("2", "3, 5", "4, 5", "5", "", "7", "")
)

# One (group, row_name) key per row that has to be zeroed: take the rows with
# value > 10 and expand their `overlaps` into one row name per row. This
# handles any number of overlaps per row and matches on `row_name` within the
# group rather than on row position.
to_zero <- data %>%
  filter(value > 10) %>%
  select(group, row_name = overlaps) %>%
  separate_rows(row_name, sep = ",\\s*") %>%
  filter(row_name != "") %>% # rows without overlaps contribute nothing
  distinct() %>%             # avoid duplicating rows in the join below
  mutate(zero = TRUE)

# Flag the matching rows through a join, zero their value, drop the flag.
result <- data %>%
  left_join(to_zero, by = c("group", "row_name")) %>%
  mutate(value = if_else(is.na(zero), value, 0)) %>%
  select(-zero)
result
#> group value row_name overlaps
#> 1 group_a 4 1 2
#> 2 group_a 5 2 3, 5
#> 3 group_a 48 3 4, 5
#> 4 group_a 0 4 5
#> 5 group_a 0 5
#> 6 group_b 12 6 7
#> 7 group_b 0 7