r 根据灵活时间内出现的次数筛选行 window
r filter rows based on number of occurrences within flexible time window
(简化的)情况如下:我有一个包含 subjectID(分组变量)、sampleID(每个 subjectID 组中的唯一变量)和 sampleTime(收集 sampleID 的时间)的数据框。如果在任何给定的 4 小时内收集了至少 4 个样本,我只想过滤这些行。例如,可以包含一个样本,因为一个样本是在当前样本之前 2 小时收集的,而另外两个样本是在当前样本之后 1 小时和 2 小时收集的。另一个例子:如果前一个样本是在当前样本之前 5 小时收集的,而接下来的三个样本是在当前样本之前 1 小时、2 小时和 6 小时收集的,则不应包括该样本。时间window可以两边都开,偶尔可以同时采集两个样本。
在输入这个问题时,我找到了一个适用于这个简单案例并表明我的意图的答案:
library(tidyverse)
set.seed(6354363)
#create some data
df <- data.frame(subjectID = rep(paste0("ID", 1:4), each=20)) %>%
group_by(subjectID) %>%
mutate(time=sample(0:72, size=20, replace=T)) %>%
arrange(subjectID, time) %>%
mutate(sampleID = 1:20)
#create column incl to mark which rows to include
df <- df %>% group_by(subjectID) %>%
mutate(incl = case_when(time-lag(time, n=3) <= 4 ~ T,
lead(time, n=3) - time <= 4 ~ T,
lead(time, n=2) - lag(time, n=1) <= 4 ~ T,
lead(time, n=1) - lag(time, n=2) <= 4 ~ T))
#filter gives intended solution
df %>% filter(incl)
但是,我觉得这种方法不必要地复杂并且在其他情况下变得太麻烦了,例如当我在给定时间段内想要至少 20 个样本时,我正在寻找更通用的解决方案。我尝试研究 rollapply 和其他功能,但还没有成功。有没有更优雅(和有效)的方法来做到这一点?我更喜欢 tidyverse 解决方案,但 data.table 也可以。
library(tidyverse)
library(zoo)
df <- structure(list(subjectID = c("ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4"), time = c(2L, 2L, 5L, 5L, 6L, 7L, 9L, 14L, 16L, 22L, 28L, 31L, 38L, 40L, 42L, 48L, 53L, 58L, 63L, 69L, 0L, 4L, 8L, 9L, 12L, 14L, 16L, 23L, 29L, 30L, 32L, 38L, 42L, 47L, 51L, 57L, 60L, 63L, 67L, 69L, 1L, 1L, 3L, 6L, 8L, 10L, 23L, 26L, 29L, 30L, 34L, 40L, 41L, 41L, 46L, 51L, 53L, 56L, 57L, 69L, 6L, 10L, 10L, 12L, 16L, 21L, 28L, 29L, 31L, 35L, 39L, 40L, 41L, 42L, 44L, 48L, 50L, 52L, 54L, 60L), sampleID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), incl = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -80L), groups = structure(list( subjectID = c("ID1", "ID2", "ID3", "ID4"), .rows = structure(list( 1:20, 21:40, 41:60, 61:80), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -4L), .drop = TRUE))
tr <- 4 # Time range
gr <- 4 # Group range
df <- df %>%
mutate(
incl2 = c(diff(time), NA) %>%
rollapply(gr-1, sum, fill = NA, align = "left") %>%
`<=`(tr) %>%
rollapply(., gr, any, na.rm = T, fill = ., align = "right")
)
identical(df$incl, df$incl2)
#> [1] TRUE
df
#> # A tibble: 80 × 5
#> # Groups: subjectID [4]
#> subjectID time sampleID incl incl2
#> <chr> <int> <int> <lgl> <lgl>
#> 1 ID1 2 1 TRUE TRUE
#> 2 ID1 2 2 TRUE TRUE
#> 3 ID1 5 3 TRUE TRUE
#> 4 ID1 5 4 TRUE TRUE
#> 5 ID1 6 5 TRUE TRUE
#> 6 ID1 7 6 TRUE TRUE
#> 7 ID1 9 7 TRUE TRUE
#> 8 ID1 14 8 FALSE FALSE
#> 9 ID1 16 9 FALSE FALSE
#> 10 ID1 22 10 FALSE FALSE
#> # … with 70 more rows
由 reprex package (v2.0.1)
于 2021-09-09 创建
(简化的)情况如下:我有一个包含 subjectID(分组变量)、sampleID(每个 subjectID 组中的唯一变量)和 sampleTime(收集 sampleID 的时间)的数据框。如果在任何给定的 4 小时内收集了至少 4 个样本,我只想过滤这些行。例如,可以包含一个样本,因为一个样本是在当前样本之前 2 小时收集的,而另外两个样本是在当前样本之后 1 小时和 2 小时收集的。另一个例子:如果前一个样本是在当前样本之前 5 小时收集的,而接下来的三个样本是在当前样本之前 1 小时、2 小时和 6 小时收集的,则不应包括该样本。时间window可以两边都开,偶尔可以同时采集两个样本。
在输入这个问题时,我找到了一个适用于这个简单案例并表明我的意图的答案:
library(tidyverse)
set.seed(6354363)
#create some data
df <- data.frame(subjectID = rep(paste0("ID", 1:4), each=20)) %>%
group_by(subjectID) %>%
mutate(time=sample(0:72, size=20, replace=T)) %>%
arrange(subjectID, time) %>%
mutate(sampleID = 1:20)
#create column incl to mark which rows to include
df <- df %>% group_by(subjectID) %>%
mutate(incl = case_when(time-lag(time, n=3) <= 4 ~ T,
lead(time, n=3) - time <= 4 ~ T,
lead(time, n=2) - lag(time, n=1) <= 4 ~ T,
lead(time, n=1) - lag(time, n=2) <= 4 ~ T))
#filter gives intended solution
df %>% filter(incl)
但是,我觉得这种方法不必要地复杂并且在其他情况下变得太麻烦了,例如当我在给定时间段内想要至少 20 个样本时,我正在寻找更通用的解决方案。我尝试研究 rollapply 和其他功能,但还没有成功。有没有更优雅(和有效)的方法来做到这一点?我更喜欢 tidyverse 解决方案,但 data.table 也可以。
library(tidyverse)
library(zoo)
df <- structure(list(subjectID = c("ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4"), time = c(2L, 2L, 5L, 5L, 6L, 7L, 9L, 14L, 16L, 22L, 28L, 31L, 38L, 40L, 42L, 48L, 53L, 58L, 63L, 69L, 0L, 4L, 8L, 9L, 12L, 14L, 16L, 23L, 29L, 30L, 32L, 38L, 42L, 47L, 51L, 57L, 60L, 63L, 67L, 69L, 1L, 1L, 3L, 6L, 8L, 10L, 23L, 26L, 29L, 30L, 34L, 40L, 41L, 41L, 46L, 51L, 53L, 56L, 57L, 69L, 6L, 10L, 10L, 12L, 16L, 21L, 28L, 29L, 31L, 35L, 39L, 40L, 41L, 42L, 44L, 48L, 50L, 52L, 54L, 60L), sampleID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), incl = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -80L), groups = structure(list( subjectID = c("ID1", "ID2", "ID3", "ID4"), .rows = structure(list( 1:20, 21:40, 41:60, 61:80), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -4L), .drop = TRUE))
tr <- 4 # Time range
gr <- 4 # Group range
df <- df %>%
mutate(
incl2 = c(diff(time), NA) %>%
rollapply(gr-1, sum, fill = NA, align = "left") %>%
`<=`(tr) %>%
rollapply(., gr, any, na.rm = T, fill = ., align = "right")
)
identical(df$incl, df$incl2)
#> [1] TRUE
df
#> # A tibble: 80 × 5
#> # Groups: subjectID [4]
#> subjectID time sampleID incl incl2
#> <chr> <int> <int> <lgl> <lgl>
#> 1 ID1 2 1 TRUE TRUE
#> 2 ID1 2 2 TRUE TRUE
#> 3 ID1 5 3 TRUE TRUE
#> 4 ID1 5 4 TRUE TRUE
#> 5 ID1 6 5 TRUE TRUE
#> 6 ID1 7 6 TRUE TRUE
#> 7 ID1 9 7 TRUE TRUE
#> 8 ID1 14 8 FALSE FALSE
#> 9 ID1 16 9 FALSE FALSE
#> 10 ID1 22 10 FALSE FALSE
#> # … with 70 more rows
由 reprex package (v2.0.1)
于 2021-09-09 创建