使用 grep 或 dplyr 有条件地删除行并替换其他行?
Using grep or dplyr to conditionally remove rows and replace others?
我有一个如下所示的数据框:
# Sample data from the question: some IDD values occur twice.
# Missing Seconds are stored as the literal string "NA", not as a real NA.
IDD <- c(
  "999674642", "999269097",
  "998496846", "998496846",
  "998067840", "998067840"
)
Valve <- c("1", "1", "0", "1", "0", "1")
Seconds <- c("NA", "NA", "12", "NA", "5", "NA")
# Column names are spelled out; they match the vector names above.
df_rep <- data.frame(IDD = IDD, Valve = Valve, Seconds = Seconds)
有些 'IDD' 值是重复的,有些不是。
对于重复的 IDD 值,我希望 R 将 NA 替换为 0,然后按 IDD 分组,取每组 Seconds 列中的最大值,并仅保留该行。
下面的 df 是我试图实现的输出示例。
# Expected result: one row per IDD, holding that IDD's largest Seconds value
# (with "NA" counted as "0").
IDD2 <- c(
  "999674642", "999269097",
  "998496846", "998067840"
)
Valve2 <- c("1", "1", "0", "0")
Seconds2 <- c("0", "0", "12", "5")
df_rep2 <- data.frame(IDD2 = IDD2, Valve2 = Valve2, Seconds2 = Seconds2)
试试这个(使用您提供的数据)并更新:
library(tidyverse)
# Data: keep every column as character so the literal "NA" strings survive.
df_rep <- data.frame(IDD, Valve, Seconds, stringsAsFactors = FALSE)
# Replace every literal "NA" string with "0".
df_rep[df_rep == "NA"] <- "0"
# For each IDD keep only the row with the largest Seconds value.
# Seconds is stored as text, so it is compared numerically. which.max()
# returns the position of the first maximum, so single-row groups and ties
# both resolve to exactly one row. filter() keeps the original row order.
df_rep %>%
  group_by(IDD) %>%
  filter(row_number() == which.max(as.numeric(Seconds))) %>%
  ungroup()
产生:
# A tibble: 4 x 3
IDD Valve Seconds
<chr> <chr> <chr>
1 999674642 1 0
2 999269097 1 0
3 998496846 0 12
4 998067840 0 5
在 Base R 中你可以这样做:
# Parse Seconds as numbers; the literal string "NA" becomes a real NA.
df_rep$Seconds <- as.numeric(df_rep$Seconds)
# Rank rows on a copy in which a missing Seconds counts as 0.
secs <- replace(df_rep$Seconds, is.na(df_rep$Seconds), 0)
# Flag, within each IDD, the first row that holds the group's maximum.
# ave() returns the per-group logical result as 0/1, hence the `== 1`.
is_top <- ave(
  secs, df_rep$IDD,
  FUN = function(s) seq_along(s) == which.max(s)
) == 1
df1 <- df_rep[is_top, ]
# Show the kept rows with any remaining NA written as 0.
transform(df1, Seconds = replace(Seconds, is.na(Seconds), 0))
IDD Valve Seconds
1 999674642 1 0
2 999269097 1 0
3 998496846 0 12
5 998067840 0 5
使用 tidyverse 的这种方法,您只需要用到几个简单的动词:
# Same sample data as in the question, plus an `obs` row counter so the
# original row order can be restored later.
# Missing Seconds are stored as the literal string "NA".
IDD <- c(
  "999674642", "999269097",
  "998496846", "998496846",
  "998067840", "998067840"
)
Valve <- c("1", "1", "0", "1", "0", "1")
Seconds <- c("NA", "NA", "12", "NA", "5", "NA")
df_rep <- data.frame(
  obs = seq_along(IDD),
  IDD = IDD,
  Valve = Valve,
  Seconds = Seconds
)
# Build df2: one row per IDD, namely the row with the largest Seconds value.
df2 <- df_rep %>%
  # Missing values are stored as the literal string "NA"; count them as "0".
  mutate(Seconds = ifelse(Seconds == "NA", "0", Seconds)) %>%
  group_by(IDD) %>%
  # Seconds is text, so compare it numerically ("5" sorts after "12" as text).
  # which.max() gives the first maximum, so each group keeps exactly one row.
  slice(which.max(as.numeric(Seconds))) %>%
  ungroup() %>%
  # Restore the original row order.
  arrange(obs)
# obs IDD Valve Seconds
# <int> <chr> <chr> <chr>
# 1 1 999674642 1 0
# 2 2 999269097 1 0
# 3 3 998496846 0 12
# 4 5 998067840 0 5
我有一个如下所示的数据框:
# Sample data from the question: some IDD values occur twice.
# Missing Seconds are stored as the literal string "NA", not as a real NA.
IDD <- c(
  "999674642", "999269097",
  "998496846", "998496846",
  "998067840", "998067840"
)
Valve <- c("1", "1", "0", "1", "0", "1")
Seconds <- c("NA", "NA", "12", "NA", "5", "NA")
# Column names are spelled out; they match the vector names above.
df_rep <- data.frame(IDD = IDD, Valve = Valve, Seconds = Seconds)
有些 'IDD' 值是重复的,有些不是。对于重复的 IDD 值,我希望 R 将 NA 替换为 0,然后按 IDD 分组,取每组 Seconds 列中的最大值,并仅保留该行。
下面的 df 是我试图实现的输出示例。
# Expected result: one row per IDD, holding that IDD's largest Seconds value
# (with "NA" counted as "0").
IDD2 <- c(
  "999674642", "999269097",
  "998496846", "998067840"
)
Valve2 <- c("1", "1", "0", "0")
Seconds2 <- c("0", "0", "12", "5")
df_rep2 <- data.frame(IDD2 = IDD2, Valve2 = Valve2, Seconds2 = Seconds2)
试试这个(使用您提供的数据)并更新:
library(tidyverse)
# Data: keep every column as character so the literal "NA" strings survive.
df_rep <- data.frame(IDD, Valve, Seconds, stringsAsFactors = FALSE)
# Replace every literal "NA" string with "0".
df_rep[df_rep == "NA"] <- "0"
# For each IDD keep only the row with the largest Seconds value.
# Seconds is stored as text, so it is compared numerically. which.max()
# returns the position of the first maximum, so single-row groups and ties
# both resolve to exactly one row. filter() keeps the original row order.
df_rep %>%
  group_by(IDD) %>%
  filter(row_number() == which.max(as.numeric(Seconds))) %>%
  ungroup()
产生:
# A tibble: 4 x 3
IDD Valve Seconds
<chr> <chr> <chr>
1 999674642 1 0
2 999269097 1 0
3 998496846 0 12
4 998067840 0 5
在 Base R 中你可以这样做:
# Parse Seconds as numbers; the literal string "NA" becomes a real NA.
df_rep$Seconds <- as.numeric(df_rep$Seconds)
# Rank rows on a copy in which a missing Seconds counts as 0.
secs <- replace(df_rep$Seconds, is.na(df_rep$Seconds), 0)
# Flag, within each IDD, the first row that holds the group's maximum.
# ave() returns the per-group logical result as 0/1, hence the `== 1`.
is_top <- ave(
  secs, df_rep$IDD,
  FUN = function(s) seq_along(s) == which.max(s)
) == 1
df1 <- df_rep[is_top, ]
# Show the kept rows with any remaining NA written as 0.
transform(df1, Seconds = replace(Seconds, is.na(Seconds), 0))
IDD Valve Seconds
1 999674642 1 0
2 999269097 1 0
3 998496846 0 12
5 998067840 0 5
使用 tidyverse 的这种方法,您只需要用到几个简单的动词:
# Same sample data as in the question, plus an `obs` row counter so the
# original row order can be restored later.
# Missing Seconds are stored as the literal string "NA".
IDD <- c(
  "999674642", "999269097",
  "998496846", "998496846",
  "998067840", "998067840"
)
Valve <- c("1", "1", "0", "1", "0", "1")
Seconds <- c("NA", "NA", "12", "NA", "5", "NA")
df_rep <- data.frame(
  obs = seq_along(IDD),
  IDD = IDD,
  Valve = Valve,
  Seconds = Seconds
)
# Build df2: one row per IDD, namely the row with the largest Seconds value.
df2 <- df_rep %>%
  # Missing values are stored as the literal string "NA"; count them as "0".
  mutate(Seconds = ifelse(Seconds == "NA", "0", Seconds)) %>%
  group_by(IDD) %>%
  # Seconds is text, so compare it numerically ("5" sorts after "12" as text).
  # which.max() gives the first maximum, so each group keeps exactly one row.
  slice(which.max(as.numeric(Seconds))) %>%
  ungroup() %>%
  # Restore the original row order.
  arrange(obs)
# obs IDD Valve Seconds
# <int> <chr> <chr> <chr>
# 1 1 999674642 1 0
# 2 2 999269097 1 0
# 3 3 998496846 0 12
# 4 5 998067840 0 5