r 根据灵活时间内出现的次数筛选行 window

r filter rows based on number of occurrences within flexible time window

(简化的)情况如下:我有一个包含 subjectID(分组变量)、sampleID(每个 subjectID 组中的唯一变量)和 sampleTime(收集 sampleID 的时间)的数据框。如果在任何给定的 4 小时内收集了至少 4 个样本,我只想过滤这些行。例如,可以包含一个样本,因为一个样本是在当前样本之前 2 小时收集的,而另外两个样本是在当前样本之后 1 小时和 2 小时收集的。另一个例子:如果前一个样本是在当前样本之前 5 小时收集的,而接下来的三个样本是在当前样本之前 1 小时、2 小时和 6 小时收集的,则不应包括该样本。时间window可以两边都开,偶尔可以同时采集两个样本。

在输入这个问题时,我找到了一个适用于这个简单案例并表明我的意图的答案:

library(tidyverse)
set.seed(6354363)

#create some data
df <- data.frame(subjectID = rep(paste0("ID", 1:4), each=20)) %>% 
  group_by(subjectID) %>% 
  mutate(time=sample(0:72, size=20, replace=T))  %>% 
  arrange(subjectID, time) %>% 
  mutate(sampleID = 1:20)

#create column incl to mark which rows to include
df <- df %>% group_by(subjectID) %>% 
  mutate(incl = case_when(time-lag(time, n=3) <= 4 ~ T,
                          lead(time, n=3) - time <= 4 ~ T,
                          lead(time, n=2) - lag(time, n=1) <= 4 ~ T,
                          lead(time, n=1) - lag(time, n=2) <= 4 ~ T)) 

#filter gives intended solution
df %>% filter(incl)

但是,我觉得这种方法不必要地复杂并且在其他情况下变得太麻烦了,例如当我在给定时间段内想要至少 20 个样本时,我正在寻找更通用的解决方案。我尝试研究 rollapply 和其他功能,但还没有成功。有没有更优雅(和有效)的方法来做到这一点?我更喜欢 tidyverse 解决方案,但 data.table 也可以。

library(tidyverse)
library(zoo)

df <- structure(list(subjectID = c("ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID2", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID3", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4", "ID4"), time = c(2L, 2L, 5L, 5L, 6L, 7L, 9L, 14L, 16L, 22L, 28L, 31L, 38L, 40L, 42L, 48L, 53L, 58L, 63L, 69L, 0L, 4L, 8L, 9L, 12L, 14L, 16L, 23L, 29L, 30L, 32L, 38L, 42L, 47L, 51L, 57L, 60L, 63L, 67L, 69L, 1L, 1L, 3L, 6L, 8L, 10L, 23L, 26L, 29L, 30L, 34L, 40L, 41L, 41L, 46L, 51L, 53L, 56L, 57L, 69L, 6L, 10L, 10L, 12L, 16L, 21L, 28L, 29L, 31L, 35L, 39L, 40L, 41L, 42L, 44L, 48L, 50L, 52L, 54L, 60L), sampleID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), incl = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -80L), groups = structure(list(    subjectID = c("ID1", "ID2", "ID3", "ID4"), .rows = structure(list(        1:20, 21:40, 41:60, 61:80), ptype = integer(0), class = c("vctrs_list_of",     "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -4L), .drop = TRUE))

tr <- 4 # Time range
gr <- 4 # Group range

df <- df %>%
  mutate(
    incl2 = c(diff(time), NA) %>%
      rollapply(gr-1, sum, fill = NA, align = "left") %>%
      `<=`(tr) %>%
      rollapply(., gr, any, na.rm = T, fill = ., align = "right")
  ) 

identical(df$incl, df$incl2)
#> [1] TRUE

df
#> # A tibble: 80 × 5
#> # Groups:   subjectID [4]
#>    subjectID  time sampleID incl  incl2
#>    <chr>     <int>    <int> <lgl> <lgl>
#>  1 ID1           2        1 TRUE  TRUE 
#>  2 ID1           2        2 TRUE  TRUE 
#>  3 ID1           5        3 TRUE  TRUE 
#>  4 ID1           5        4 TRUE  TRUE 
#>  5 ID1           6        5 TRUE  TRUE 
#>  6 ID1           7        6 TRUE  TRUE 
#>  7 ID1           9        7 TRUE  TRUE 
#>  8 ID1          14        8 FALSE FALSE
#>  9 ID1          16        9 FALSE FALSE
#> 10 ID1          22       10 FALSE FALSE
#> # … with 70 more rows

reprex package (v2.0.1)

于 2021-09-09 创建