根据列 r 上的两个条件删除重复行
removing duplicate rows based on two conditionals on columns r
我正在尝试删除重复项并保留具有最大值的行。我可以根据策略单独执行此操作。
但是,当我尝试根据策略分别应用两个不同的条件时,得到的数据框往往会相互覆盖。
之所以需要这样做,是因为两种策略各自包含对方所没有的值;不过请注意,它们确实共享一个公共列。
当前数据
ID strategy Common DNA_Col RNA_Col
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABB RNA 0.65 NA 0.12
4 ABC DNA 0.55 0.88 NA
5 ABC DNA 0.14 0.14 NA
6 ABC DNA 0.15 0.50 NA
7 ABD RNA 0.25 NA 0.12
所需的数据帧
ID strategy Common DNA_Col RNA_Col
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABC DNA 0.55 0.88 NA
4 ABD RNA 0.25 NA 0.12
代码
生成数据框:
# Build the example data frame. The console prompts ("> " and "+ ") have been
# removed so the snippet can be pasted or sourced as-is.
# Every row has a value in exactly one of DNA_Col / RNA_Col; the other is NA.
df <- data.frame(
  stringsAsFactors = FALSE,
  ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
  strategy = c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
  Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
  DNA_Col = c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
  RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)
应用条件
# Asker's original attempt, kept verbatim: this is the code the question is about.
# NOTE(review): `if` needs a single TRUE/FALSE, but `df$strategy == "RNA"` is a
# logical vector with one element per row, so the branch cannot be chosen row by row.
if (df$strategy == "RNA") {
# NOTE(review): the column is named `ID`; `id` (lower case) does not exist in df.
# Assigning back to `df` also replaces the data the other branch would need.
df = df %>% group_by(id) %>% slice_max(RNA_Col, n=1) %>% ungroup
} else if (df$strategy == "DNA") {
# NOTE(review): `group_by(df)` passes the data frame itself instead of a grouping column.
df = df %>% group_by(df) %>% slice_max(DNA_Col, n=1) %>% ungroup
}
这可以通过使用 pivot_longer() 函数将 RNA_Col 和 DNA_Col 两个变量的值放入同一列中一并处理来完成,然后再使用 ifelse() 重新填充这两列。
library(tidyverse)

# Example data: every row has a value in exactly one of DNA_Col / RNA_Col.
df <- data.frame(
  stringsAsFactors = FALSE,
  ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
  strategy = c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
  Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
  DNA_Col = c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
  RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)

# Keep one row per ID: the row holding the largest DNA_Col / RNA_Col value.
result <- df %>%
  # Stack both measurement columns into a single `Value` column so that one
  # slice_max() call can rank them together; `Original_Col` remembers the source.
  pivot_longer(
    cols = c(DNA_Col, RNA_Col),
    names_to = "Original_Col",
    values_to = "Value"
  ) %>%
  group_by(ID) %>%
  # with_ties = FALSE guarantees exactly one row per ID even when the maximum
  # occurs more than once (the default would keep every tied row).
  slice_max(Value, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Rebuild the two original columns from the winning value.
  mutate(
    DNA_Col = ifelse(Original_Col == "DNA_Col", Value, NA),
    RNA_Col = ifelse(Original_Col == "RNA_Col", Value, NA)
  ) %>%
  select(ID, strategy, Common, DNA_Col, RNA_Col)

result
# A tibble: 4 × 5
ID strategy Common DNA_Col RNA_Col
<chr> <chr> <dbl> <dbl> <dbl>
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABC DNA 0.55 0.88 NA
4 ABD RNA 0.25 NA 0.12
可能的解决方案:
library(tidyverse)

# Example data: every row has a value in exactly one of DNA_Col / RNA_Col.
df <- data.frame(
  stringsAsFactors = FALSE,
  ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
  strategy = c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
  Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
  DNA_Col = c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
  RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)

# Handle each strategy on its own rows, then stack the two results.
# Filtering by `strategy` first matters: in dplyr >= 1.1.0, slice_max() keeps
# rows with missing values when a group has no non-missing value to rank, so
# slicing the unfiltered data on DNA_Col would also return the RNA-only IDs
# (and vice versa), producing duplicates in the combined result.
dna_best <- df %>%
  filter(strategy == "DNA") %>%
  group_by(ID) %>%
  # with_ties = FALSE keeps exactly one row per ID when the maximum is tied.
  slice_max(DNA_Col, n = 1, with_ties = FALSE) %>%
  ungroup()

rna_best <- df %>%
  filter(strategy == "RNA") %>%
  group_by(ID) %>%
  slice_max(RNA_Col, n = 1, with_ties = FALSE) %>%
  ungroup()

# DNA rows first, then RNA rows.
result <- bind_rows(dna_best, rna_best)

result
#> # A tibble: 4 × 5
#> ID strategy Common DNA_Col RNA_Col
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 ABA DNA 0.65 0.66 NA
#> 2 ABC DNA 0.55 0.88 NA
#> 3 ABB RNA 0.65 NA 0.15
#> 4 ABD RNA 0.25 NA 0.12
我正在尝试删除重复项并保留具有最大值的行。我可以根据策略单独执行此操作。
但是,当我尝试根据策略分别应用两个不同的条件时,得到的数据框往往会相互覆盖。
之所以需要这样做,是因为两种策略各自包含对方所没有的值;不过请注意,它们确实共享一个公共列。
当前数据
ID strategy Common DNA_Col RNA_Col
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABB RNA 0.65 NA 0.12
4 ABC DNA 0.55 0.88 NA
5 ABC DNA 0.14 0.14 NA
6 ABC DNA 0.15 0.50 NA
7 ABD RNA 0.25 NA 0.12
所需的数据帧
ID strategy Common DNA_Col RNA_Col
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABC DNA 0.55 0.88 NA
4 ABD RNA 0.25 NA 0.12
代码
生成数据框:
# Build the example data frame. The console prompts ("> " and "+ ") have been
# removed so the snippet can be pasted or sourced as-is.
# Every row has a value in exactly one of DNA_Col / RNA_Col; the other is NA.
df <- data.frame(
  stringsAsFactors = FALSE,
  ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
  strategy = c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
  Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
  DNA_Col = c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
  RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)
应用条件
# Asker's original attempt, kept verbatim: this is the code the question is about.
# NOTE(review): `if` needs a single TRUE/FALSE, but `df$strategy == "RNA"` is a
# logical vector with one element per row, so the branch cannot be chosen row by row.
if (df$strategy == "RNA") {
# NOTE(review): the column is named `ID`; `id` (lower case) does not exist in df.
# Assigning back to `df` also replaces the data the other branch would need.
df = df %>% group_by(id) %>% slice_max(RNA_Col, n=1) %>% ungroup
} else if (df$strategy == "DNA") {
# NOTE(review): `group_by(df)` passes the data frame itself instead of a grouping column.
df = df %>% group_by(df) %>% slice_max(DNA_Col, n=1) %>% ungroup
}
这可以通过使用 pivot_longer() 函数将 RNA_Col 和 DNA_Col 两个变量的值放入同一列中一并处理来完成,然后再使用 ifelse() 重新填充这两列。
library(tidyverse)

# Example data: every row has a value in exactly one of DNA_Col / RNA_Col.
df <- data.frame(
  stringsAsFactors = FALSE,
  ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
  strategy = c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
  Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
  DNA_Col = c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
  RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)

# Keep one row per ID: the row holding the largest DNA_Col / RNA_Col value.
result <- df %>%
  # Stack both measurement columns into a single `Value` column so that one
  # slice_max() call can rank them together; `Original_Col` remembers the source.
  pivot_longer(
    cols = c(DNA_Col, RNA_Col),
    names_to = "Original_Col",
    values_to = "Value"
  ) %>%
  group_by(ID) %>%
  # with_ties = FALSE guarantees exactly one row per ID even when the maximum
  # occurs more than once (the default would keep every tied row).
  slice_max(Value, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Rebuild the two original columns from the winning value.
  mutate(
    DNA_Col = ifelse(Original_Col == "DNA_Col", Value, NA),
    RNA_Col = ifelse(Original_Col == "RNA_Col", Value, NA)
  ) %>%
  select(ID, strategy, Common, DNA_Col, RNA_Col)

result
# A tibble: 4 × 5
ID strategy Common DNA_Col RNA_Col
<chr> <chr> <dbl> <dbl> <dbl>
1 ABA DNA 0.65 0.66 NA
2 ABB RNA 0.65 NA 0.15
3 ABC DNA 0.55 0.88 NA
4 ABD RNA 0.25 NA 0.12
可能的解决方案:
library(tidyverse)

# Example data: every row has a value in exactly one of DNA_Col / RNA_Col.
df <- data.frame(
  stringsAsFactors = FALSE,
  ID = c("ABA", "ABB", "ABB", "ABC", "ABC", "ABC", "ABD"),
  strategy = c("DNA", "RNA", "RNA", "DNA", "DNA", "DNA", "RNA"),
  Common = c(0.65, 0.65, 0.65, 0.55, 0.14, 0.15, 0.25),
  DNA_Col = c(0.66, NA, NA, 0.88, 0.14, 0.5, NA),
  RNA_Col = c(NA, 0.15, 0.12, NA, NA, NA, 0.12)
)

# Handle each strategy on its own rows, then stack the two results.
# Filtering by `strategy` first matters: in dplyr >= 1.1.0, slice_max() keeps
# rows with missing values when a group has no non-missing value to rank, so
# slicing the unfiltered data on DNA_Col would also return the RNA-only IDs
# (and vice versa), producing duplicates in the combined result.
dna_best <- df %>%
  filter(strategy == "DNA") %>%
  group_by(ID) %>%
  # with_ties = FALSE keeps exactly one row per ID when the maximum is tied.
  slice_max(DNA_Col, n = 1, with_ties = FALSE) %>%
  ungroup()

rna_best <- df %>%
  filter(strategy == "RNA") %>%
  group_by(ID) %>%
  slice_max(RNA_Col, n = 1, with_ties = FALSE) %>%
  ungroup()

# DNA rows first, then RNA rows.
result <- bind_rows(dna_best, rna_best)

result
#> # A tibble: 4 × 5
#> ID strategy Common DNA_Col RNA_Col
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 ABA DNA 0.65 0.66 NA
#> 2 ABC DNA 0.55 0.88 NA
#> 3 ABB RNA 0.65 NA 0.15
#> 4 ABD RNA 0.25 NA 0.12