删除特定列中特定序列值之后的行
Delete rows after a certain sequence of values in a certain column
# Example data: `a` is the grouping column, `b` holds the values to filter.
a <- rep(c("A", "B", "C", "D"), times = c(3L, 3L, 4L, 5L))
b <- strsplit("xyzxxzyzzzyzzzx", split = "")[[1L]]
df <- data.frame(a = a, b = b)
a b
1 A x
2 A y
3 A z
4 B x
5 B x
6 B z
7 C y
8 C z
9 C z
10 C z
11 D y
12 D z
13 D z
14 D z
15 D x
对于每个组 A、B、C、D,每次组合 y、z 出现在组末尾时,我都想删除 b 列中的值 z。
如果我们遇到 a=="C" 的情况,其中 b 值为 y,z,z,z,我想删除所有 z。但是,在 a=="D" 中,无需更改,因为 x 是最后一个值。
结果如下所示:
a b
1 A x
2 A y
4 B x
5 B x
6 B z
7 C y
11 D y
12 D z
13 D z
14 D z
15 D x
通过在 dplyr 中分组,我可以识别 A 中每个值的最后一次出现,因此 a=="A" 中描述的基本情况不是问题。我很难为 a=="C" 的情况找到解决方案,在这种情况下,可能会先出现一次 y,然后出现 20 次 z。
这是一个可能的 data.table 解决方案。基本上,我创建了一个要求同时满足 3 个条件的逻辑索引:该值是 z;第一个 z 紧跟在 y 之后;组内最后一个值是 z。然后只需用该索引过滤行即可。
library(data.table)
# Flag, per group of `a`, the unbroken run of "z" values at the end of the
# group when the value directly before that run is "y"; then keep the
# unflagged rows. `indx` is added to `df` by reference and stays in the result.
setDT(df)[, indx := {
  # TRUE for each element of the trailing run of "z" (all FALSE if the group
  # does not end in "z"); cummin() avoids max(which())/min(which()), which
  # warn and return +/-Inf for groups that contain no "z" at all.
  trailing_z <- rev(cummin(rev(b == "z"))) == 1L
  n_before <- .N - sum(trailing_z)
  # n_before == 0 means the whole group is "z": there is no preceding "y",
  # so nothing is flagged (a leading "z" is no longer treated as a match).
  if (n_before > 0L && b[n_before] == "y") trailing_z else rep(FALSE, .N)
}, by = a][!(indx)]
# a b indx
# 1: A x FALSE
# 2: A y FALSE
# 3: B x FALSE
# 4: B x FALSE
# 5: B z FALSE
# 6: C y FALSE
# 7: D y FALSE
# 8: D z FALSE
# 9: D z FALSE
# 10: D z FALSE
# 11: D x FALSE
这是一个基本解决方案:
# For each group of `a`, drop the unbroken run of "z" values at the end of the
# group when the value directly before that run is "y"; every other group is
# returned unchanged. The per-group results are stacked with rbind().
do.call("rbind", by(df, df$a, FUN = function(x) {
  # TRUE for each element of the trailing run of "z" (all FALSE if the group
  # does not end in "z").
  trailing_z <- rev(cummin(rev(x$b == "z"))) == 1L
  n_before <- nrow(x) - sum(trailing_z)
  # Only the element right before the run is inspected. The previous test,
  # `max(y) - min(z) > 1`, was FALSE whenever the last "y" preceded the first
  # "z", however far apart they were, so a group such as y, x, z lost its "z".
  if (n_before > 0L && x$b[n_before] == "y") {
    x[!trailing_z, , drop = FALSE]
  } else {
    x
  }
}))
结果:
a b
A.1 A x
A.2 A y
B.4 B x
B.5 B x
B.6 B z
C C y
D.11 D y
D.12 D z
D.13 D z
D.14 D z
D.15 D x
您可以在 base R 中使用 by 和 cummin:
# Keep every row except the trailing run of "z" values in groups where that
# run is directly preceded by "y".
# ave() writes each group's result back to the original row positions, so the
# mask stays aligned with `df` even when the rows are not sorted by `a`
# (unlist(by(...)) returns the values in group order instead).
df[as.logical(ave(seq_len(nrow(df)), df$a, FUN = function(idx) {
  vals <- df$b[idx]
  # TRUE for each element of the unbroken run of "z" at the end of the group.
  trailing_z <- rev(cummin(rev(vals == "z"))) == 1L
  last_other <- tail(vals[!trailing_z], 1)
  # `last_other` is empty when the whole group is "z"; the length check keeps
  # `if` from failing on a zero-length condition in that case.
  if (length(last_other) == 1L && last_other == "y") {
    !trailing_z
  } else {
    rep(TRUE, length(idx))
  }
})), ]
结果:
a b
1 A x
2 A y
4 B x
5 B x
6 B z
7 C y
11 D y
12 D z
13 D z
14 D z
15 D x
效率不高,但工作正常:
library(stringr)
# Build `df2` group by group: collapse the group's `b` values into one string,
# strip the trailing run of "z" when it directly follows a "y", and split the
# string back into one row per character.
# NOTE: this approach only works while every value of `b` is a single
# character.
df2 <- do.call(rbind, lapply(levels(factor(df$a)), function(i) {
  temp <- paste(df$b[df$a == i], collapse = "")
  # The lookbehind anchors the match to the end of the string, so "z" values
  # earlier in the group are left alone (a plain gsub("z", "") removed them
  # all as soon as "yz" occurred anywhere in the group).
  temp <- str_replace(temp, "(?<=y)z+$", "")
  chars <- strsplit(temp, "", fixed = TRUE)[[1L]]
  data.frame(a = rep(i, length(chars)), b = chars)
}))
# a b
# 1 A x
# 2 A y
# 3 B x
# 4 B x
# 5 B z
# 6 C y
# 7 D y
# 8 D z
# 9 D z
# 10 D z
# 11 D x
# Example data: `a` is the grouping column, `b` holds the values to filter.
a <- rep(c("A", "B", "C", "D"), times = c(3L, 3L, 4L, 5L))
b <- strsplit("xyzxxzyzzzyzzzx", split = "")[[1L]]
df <- data.frame(a = a, b = b)
a b
1 A x
2 A y
3 A z
4 B x
5 B x
6 B z
7 C y
8 C z
9 C z
10 C z
11 D y
12 D z
13 D z
14 D z
15 D x
对于每个组 A、B、C、D,每次组合 y、z 出现在组末尾时,我都想删除 b 列中的值 z。
如果我们遇到 a=="C" 的情况,其中 b 值为 y,z,z,z,我想删除所有 z。但是,在 a=="D" 中,无需更改,因为 x 是最后一个值。
结果如下所示:
a b
1 A x
2 A y
4 B x
5 B x
6 B z
7 C y
11 D y
12 D z
13 D z
14 D z
15 D x
通过在 dplyr 中分组,我可以识别 A 中每个值的最后一次出现,因此 a=="A" 中描述的基本情况不是问题。我很难为 a=="C" 的情况找到解决方案,在这种情况下,可能会先出现一次 y,然后出现 20 次 z。
这是一个可能的 data.table 解决方案。基本上,我创建了一个要求同时满足 3 个条件的逻辑索引:该值是 z;第一个 z 紧跟在 y 之后;组内最后一个值是 z。然后只需用该索引过滤行即可。
library(data.table)
# Flag, per group of `a`, the unbroken run of "z" values at the end of the
# group when the value directly before that run is "y"; then keep the
# unflagged rows. `indx` is added to `df` by reference and stays in the result.
setDT(df)[, indx := {
  # TRUE for each element of the trailing run of "z" (all FALSE if the group
  # does not end in "z"); cummin() avoids max(which())/min(which()), which
  # warn and return +/-Inf for groups that contain no "z" at all.
  trailing_z <- rev(cummin(rev(b == "z"))) == 1L
  n_before <- .N - sum(trailing_z)
  # n_before == 0 means the whole group is "z": there is no preceding "y",
  # so nothing is flagged (a leading "z" is no longer treated as a match).
  if (n_before > 0L && b[n_before] == "y") trailing_z else rep(FALSE, .N)
}, by = a][!(indx)]
# a b indx
# 1: A x FALSE
# 2: A y FALSE
# 3: B x FALSE
# 4: B x FALSE
# 5: B z FALSE
# 6: C y FALSE
# 7: D y FALSE
# 8: D z FALSE
# 9: D z FALSE
# 10: D z FALSE
# 11: D x FALSE
这是一个基本解决方案:
# For each group of `a`, drop the unbroken run of "z" values at the end of the
# group when the value directly before that run is "y"; every other group is
# returned unchanged. The per-group results are stacked with rbind().
do.call("rbind", by(df, df$a, FUN = function(x) {
  # TRUE for each element of the trailing run of "z" (all FALSE if the group
  # does not end in "z").
  trailing_z <- rev(cummin(rev(x$b == "z"))) == 1L
  n_before <- nrow(x) - sum(trailing_z)
  # Only the element right before the run is inspected. The previous test,
  # `max(y) - min(z) > 1`, was FALSE whenever the last "y" preceded the first
  # "z", however far apart they were, so a group such as y, x, z lost its "z".
  if (n_before > 0L && x$b[n_before] == "y") {
    x[!trailing_z, , drop = FALSE]
  } else {
    x
  }
}))
结果:
a b
A.1 A x
A.2 A y
B.4 B x
B.5 B x
B.6 B z
C C y
D.11 D y
D.12 D z
D.13 D z
D.14 D z
D.15 D x
您可以在 base R 中使用 by 和 cummin:
# Keep every row except the trailing run of "z" values in groups where that
# run is directly preceded by "y".
# ave() writes each group's result back to the original row positions, so the
# mask stays aligned with `df` even when the rows are not sorted by `a`
# (unlist(by(...)) returns the values in group order instead).
df[as.logical(ave(seq_len(nrow(df)), df$a, FUN = function(idx) {
  vals <- df$b[idx]
  # TRUE for each element of the unbroken run of "z" at the end of the group.
  trailing_z <- rev(cummin(rev(vals == "z"))) == 1L
  last_other <- tail(vals[!trailing_z], 1)
  # `last_other` is empty when the whole group is "z"; the length check keeps
  # `if` from failing on a zero-length condition in that case.
  if (length(last_other) == 1L && last_other == "y") {
    !trailing_z
  } else {
    rep(TRUE, length(idx))
  }
})), ]
结果:
a b
1 A x
2 A y
4 B x
5 B x
6 B z
7 C y
11 D y
12 D z
13 D z
14 D z
15 D x
效率不高,但工作正常:
library(stringr)
# Build `df2` group by group: collapse the group's `b` values into one string,
# strip the trailing run of "z" when it directly follows a "y", and split the
# string back into one row per character.
# NOTE: this approach only works while every value of `b` is a single
# character.
df2 <- do.call(rbind, lapply(levels(factor(df$a)), function(i) {
  temp <- paste(df$b[df$a == i], collapse = "")
  # The lookbehind anchors the match to the end of the string, so "z" values
  # earlier in the group are left alone (a plain gsub("z", "") removed them
  # all as soon as "yz" occurred anywhere in the group).
  temp <- str_replace(temp, "(?<=y)z+$", "")
  chars <- strsplit(temp, "", fixed = TRUE)[[1L]]
  data.frame(a = rep(i, length(chars)), b = chars)
}))
# a b
# 1 A x
# 2 A y
# 3 B x
# 4 B x
# 5 B z
# 6 C y
# 7 D y
# 8 D z
# 9 D z
# 10 D z
# 11 D x