逐行检测序列
Detect sequences rowwise
请参考下面的示例数据,其中的 `check` 列是期望得到的结果:如果某一行(2018 年至 2021 年)中有连续 3 个值 >= 20,则 `check` 应为 TRUE,否则为 FALSE。
首选 dplyr 解决方案。原始数据集有数百列和数千行,NA 可能出现在任何位置。
# Toy data: one row per country, yearly values for 2018-2021 (NA allowed
# anywhere) and `check`, the expected answer for each row.
test <- data.frame(
  country = c("US", "UK", "RU", "GR", "BE"),
  y_2018 = c(NA, 30, 20, 40, 10),
  y_2019 = c(10, 10, 20, 20, 20),
  y_2020 = c(20, NA, 30, 20, 20),
  y_2021 = c(NA, 70, 10, 10, NA),
  # Spelled out as TRUE/FALSE: T and F are ordinary variables that can be
  # reassigned, TRUE and FALSE are reserved words.
  check = c(FALSE, FALSE, TRUE, TRUE, FALSE)
)
这是使用 `rle` 的方法:
library(dplyr)
# Row by row, flag whether the y* columns contain a run of at least three
# consecutive values >= 20. The resulting tibble is printed, not assigned.
test %>%
  rowwise() %>%
  mutate(
    check = {
      # Run-length encode the ">= 20" flags. rle() starts a new run at every
      # NA, so a missing value always interrupts a sequence.
      runs <- rle(c_across(starts_with('y')) >= 20)
      # which() keeps only the runs whose value is TRUE (FALSE and NA dropped)
      true_run_lengths <- runs$lengths[which(runs$values)]
      any(true_run_lengths >= 3)
    }
  ) %>%
  ungroup()
# country y_2018 y_2019 y_2020 y_2021 check
# <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
#1 US NA 10 20 NA FALSE
#2 UK 30 10 NA 70 FALSE
#3 RU 20 20 30 10 TRUE
#4 GR 40 20 20 10 TRUE
#5 BE 10 20 20 NA FALSE
另一种不使用 dplyr 的方法(我更改了测试数据,以包含 30, 10, 20, 40 这种情况):
# Toy data including a non-consecutive case: GR (30, 10, 20, 40) has three
# values >= 20, but they do not form a run of three.
# NOTE(review): `check` is carried over unchanged from the original toy data;
# for the modified GR row a run-based check would give FALSE -- confirm intent.
test <- data.frame(
  country = c("US", "UK", "RU", "GR", "BE"),
  y_2018 = c(NA, 30, 20, 30, 10),
  y_2019 = c(10, 10, 20, 10, 20),
  y_2020 = c(20, NA, 30, 20, 20),
  y_2021 = c(NA, 70, 10, 40, NA),
  check = c(FALSE, FALSE, TRUE, TRUE, FALSE)
)

# Logical matrix: TRUE where a yearly value is >= 20 (NA stays NA)
cond1 <- as.matrix(test[2:5]) >= 20

# TRUE when `flags` contains at least `min_run` consecutive TRUE values.
# rle() treats every NA as its own run, so an NA always breaks a sequence;
# which() then keeps only the runs whose value is TRUE.
has_run <- function(flags, min_run = 3L) {
  runs <- rle(flags)
  any(runs$lengths[which(runs$values)] >= min_run)
}

# One flag per row: is the x >= 20 condition sequential across columns?
# Indexing row by row works for any number of year columns and never
# collapses a single matching row into a vector.
keep_rows <- vapply(
  seq_len(nrow(cond1)),
  function(i) has_run(cond1[i, ]),
  logical(1)
)

# Select the appropriate rows with the logical mask (it contains no NA, so no
# all-NA rows can appear and no row-name lookup is needed)
test2 <- test[keep_rows, ]
使用 collapse 中的 `dapply`:
library(collapse)
library(data.table)
# For every row of the numeric columns, store in `check` whether some run of
# consecutive values >= 20 has length three or more, then print the column.
test$check <- dapply(
  get_vars(test, is.numeric),
  MARGIN = 1,
  FUN = function(row_vals) {
    # 1 where the value is >= 20 (NA^0), NA where it is < 20 or missing
    at_least_20 <- NA^(row_vals < 20)
    # Consecutive equal ">= 20" flags share one run id
    run_id <- rleid(row_vals >= 20)
    # Only run ids of ">= 20" stretches survive the product; table() ignores
    # the NA entries and counts how long each surviving run is
    any(table(at_least_20 * run_id) >= 3)
  }
)
test$check
[1] FALSE FALSE TRUE TRUE FALSE
请参考下面的示例数据,其中的 `check` 列是期望得到的结果:如果某一行(2018 年至 2021 年)中有连续 3 个值 >= 20,则 `check` 应为 TRUE,否则为 FALSE。
首选 dplyr 解决方案。原始数据集有数百列和数千行,NA 可能出现在任何位置。
# Toy data: one row per country, yearly values for 2018-2021 (NA allowed
# anywhere) and `check`, the expected answer for each row.
test <- data.frame(
  country = c("US", "UK", "RU", "GR", "BE"),
  y_2018 = c(NA, 30, 20, 40, 10),
  y_2019 = c(10, 10, 20, 20, 20),
  y_2020 = c(20, NA, 30, 20, 20),
  y_2021 = c(NA, 70, 10, 10, NA),
  # Spelled out as TRUE/FALSE: T and F are ordinary variables that can be
  # reassigned, TRUE and FALSE are reserved words.
  check = c(FALSE, FALSE, TRUE, TRUE, FALSE)
)
这是使用 `rle` 的方法:
library(dplyr)
# Row by row, flag whether the y* columns contain a run of at least three
# consecutive values >= 20. The resulting tibble is printed, not assigned.
test %>%
  rowwise() %>%
  mutate(
    check = {
      # Run-length encode the ">= 20" flags. rle() starts a new run at every
      # NA, so a missing value always interrupts a sequence.
      runs <- rle(c_across(starts_with('y')) >= 20)
      # which() keeps only the runs whose value is TRUE (FALSE and NA dropped)
      true_run_lengths <- runs$lengths[which(runs$values)]
      any(true_run_lengths >= 3)
    }
  ) %>%
  ungroup()
# country y_2018 y_2019 y_2020 y_2021 check
# <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
#1 US NA 10 20 NA FALSE
#2 UK 30 10 NA 70 FALSE
#3 RU 20 20 30 10 TRUE
#4 GR 40 20 20 10 TRUE
#5 BE 10 20 20 NA FALSE
另一种不使用 dplyr 的方法(我更改了测试数据,以包含 30, 10, 20, 40 这种情况):
# Toy data including a non-consecutive case: GR (30, 10, 20, 40) has three
# values >= 20, but they do not form a run of three.
# NOTE(review): `check` is carried over unchanged from the original toy data;
# for the modified GR row a run-based check would give FALSE -- confirm intent.
test <- data.frame(
  country = c("US", "UK", "RU", "GR", "BE"),
  y_2018 = c(NA, 30, 20, 30, 10),
  y_2019 = c(10, 10, 20, 10, 20),
  y_2020 = c(20, NA, 30, 20, 20),
  y_2021 = c(NA, 70, 10, 40, NA),
  check = c(FALSE, FALSE, TRUE, TRUE, FALSE)
)

# Logical matrix: TRUE where a yearly value is >= 20 (NA stays NA)
cond1 <- as.matrix(test[2:5]) >= 20

# TRUE when `flags` contains at least `min_run` consecutive TRUE values.
# rle() treats every NA as its own run, so an NA always breaks a sequence;
# which() then keeps only the runs whose value is TRUE.
has_run <- function(flags, min_run = 3L) {
  runs <- rle(flags)
  any(runs$lengths[which(runs$values)] >= min_run)
}

# One flag per row: is the x >= 20 condition sequential across columns?
# Indexing row by row works for any number of year columns and never
# collapses a single matching row into a vector.
keep_rows <- vapply(
  seq_len(nrow(cond1)),
  function(i) has_run(cond1[i, ]),
  logical(1)
)

# Select the appropriate rows with the logical mask (it contains no NA, so no
# all-NA rows can appear and no row-name lookup is needed)
test2 <- test[keep_rows, ]
使用 collapse 中的 `dapply`:
library(collapse)
library(data.table)
# For every row of the numeric columns, store in `check` whether some run of
# consecutive values >= 20 has length three or more, then print the column.
test$check <- dapply(
  get_vars(test, is.numeric),
  MARGIN = 1,
  FUN = function(row_vals) {
    # 1 where the value is >= 20 (NA^0), NA where it is < 20 or missing
    at_least_20 <- NA^(row_vals < 20)
    # Consecutive equal ">= 20" flags share one run id
    run_id <- rleid(row_vals >= 20)
    # Only run ids of ">= 20" stretches survive the product; table() ignores
    # the NA entries and counts how long each surviving run is
    any(table(at_least_20 * run_id) >= 3)
  }
)
test$check
[1] FALSE FALSE TRUE TRUE FALSE