基于多个迭代条件对事件时间数据进行分组

Grouping of Event Time Data based on multiple, iterative conditions

我有一个约 190,000 行的数据集,其中包括: 示例数据:found here:

 > df[1:100,1:6]
    AcousticTagCode      Species SiteCode         StartDetection           EndDetection Duration_min
1           5004.24 Striped Bass     RGD1 2014-10-01 23:01:12.12 2014-10-01 23:59:41.41    58.488167
2           5004.24 Striped Bass     RGD1 2014-10-02 00:00:06.06 2014-10-02 01:00:00.00    59.892167
3           5004.24 Striped Bass     RGD1 2014-10-02 01:00:01.01 2014-10-02 01:20:12.12    20.185167
4           5004.24 Striped Bass     RGD1 2014-10-02 04:14:15.15 2014-10-02 04:32:16.16    18.016833
5           5004.24 Striped Bass     RGD1 2014-10-02 22:00:06.06 2014-10-02 22:59:54.54    59.791167
6           5004.24 Striped Bass     RGD1 2014-10-02 23:00:10.10 2014-10-02 23:59:55.55    59.740667
7           5004.24 Striped Bass     RGD1 2014-10-03 00:00:08.08 2014-10-03 00:59:46.46    59.639667
8           5004.24 Striped Bass     RGD1 2014-10-03 01:00:10.10 2014-10-03 01:58:18.18    58.134667
9           5004.24 Striped Bass     RGD1 2014-10-03 02:05:05.05 2014-10-03 02:36:11.11    31.101000
10          5004.24 Striped Bass     RGD1 2014-10-03 04:01:03.03 2014-10-03 04:43:35.35    42.538667
11          5004.24 Striped Bass     RGD1 2014-10-03 06:00:15.15 2014-10-03 06:48:23.23    48.134667
12          5004.24 Striped Bass     RGD1 2014-10-03 07:02:00.00 2014-10-03 07:57:33.33    55.555500
13          5004.24 Striped Bass     RGD1 2014-10-03 08:04:27.27 2014-10-03 08:59:19.19    54.865333
14          5004.24 Striped Bass     RGD1 2014-10-03 09:01:03.03 2014-10-03 09:59:36.36    58.555500
15          5004.24 Striped Bass     RGD1 2014-10-03 10:00:33.33 2014-10-03 10:58:50.50    58.286167
16          5004.24 Striped Bass     RGD1 2014-10-03 11:00:02.02 2014-10-03 11:59:56.56    59.892167
17          5004.24 Striped Bass     RGD1 2014-10-03 12:00:10.10 2014-10-03 12:18:01.01    17.848500
18          5004.24 Striped Bass     RGD1 2014-10-03 13:08:56.56 2014-10-03 13:24:06.06    15.175167
19          5004.24 Striped Bass     RGD1 2014-10-03 14:29:00.00 2014-10-03 14:58:52.52    29.865333
20          5004.24 Striped Bass     RGD1 2014-10-03 15:00:05.05 2014-10-03 15:59:17.17    59.202000
21          5004.24 Striped Bass     RGD1 2014-10-03 16:05:47.47 2014-10-03 16:59:50.50    54.050500
22          5004.24 Striped Bass     RGD1 2014-10-03 17:00:05.05 2014-10-03 17:43:37.37    43.538667
23          5004.24 Striped Bass     RGD1 2014-10-03 18:02:10.10 2014-10-03 18:38:58.58    36.791167
24          5004.24 Striped Bass     RGD1 2014-10-03 19:03:44.44 2014-10-03 19:58:26.26    54.697000
25          5004.24 Striped Bass     RGD1 2014-10-03 20:09:42.42 2014-10-03 20:58:24.24    48.697000
26          5004.24 Striped Bass     RGD1 2014-10-03 21:00:05.05 2014-10-03 21:59:03.03    58.966333
27          5004.24 Striped Bass     RGD1 2014-10-03 22:00:23.23 2014-10-03 22:59:46.46    59.387167
28          5004.24 Striped Bass     RGD1 2014-10-03 23:00:41.41 2014-10-03 23:59:29.29    58.798000
29          5004.24 Striped Bass     RGD1 2014-10-04 09:16:18.18 2014-10-04 09:59:35.35    43.286167
30          5004.24 Striped Bass     RGD1 2014-10-04 10:00:05.05 2014-10-04 10:59:18.18    59.218833
31          5004.24 Striped Bass     RGD1 2014-10-04 11:00:05.05 2014-10-04 11:59:59.59    59.892167
32          5004.24 Striped Bass     RGD1 2014-10-04 12:00:01.01 2014-10-04 12:59:49.49    59.808000
33          5004.24 Striped Bass     RGD1 2014-10-04 13:00:23.23 2014-10-04 13:42:25.25    42.033667
34          5004.24 Striped Bass     RGD1 2014-10-04 14:00:55.55 2014-10-04 14:53:26.26    52.528667
35          5004.24 Striped Bass     RGD1 2014-10-04 15:00:32.32 2014-10-04 15:24:24.24    23.865333
36          5004.24 Striped Bass     RGD1 2014-10-04 17:20:04.04 2014-10-04 17:24:09.09     4.084167
37          5004.24 Striped Bass     RGD1 2014-10-04 18:23:54.54 2014-10-04 18:52:30.30    28.612833
38          5004.24 Striped Bass     RGD1 2014-10-04 19:04:09.09 2014-10-04 19:59:58.58    55.808000
39          5004.24 Striped Bass     RGD1 2014-10-04 20:00:02.02 2014-10-04 20:59:56.56    59.892167
40          5004.24 Striped Bass     RGD1 2014-10-04 21:00:00.00 2014-10-04 21:59:59.59    59.976333
41          5004.24 Striped Bass     RGD1 2014-10-04 22:00:03.03 2014-10-04 22:59:57.57    59.892167
42          5004.24 Striped Bass     RGD1 2014-10-04 23:00:13.13 2014-10-04 23:59:57.57    59.723833
43          5004.24 Striped Bass     RGD1 2014-10-05 00:00:00.00 2014-10-05 00:59:43.43    59.723833
44          5004.24 Striped Bass     RGD1 2014-10-05 01:01:34.34 2014-10-05 01:59:58.58    58.387167
45          5004.24 Striped Bass     RGD1 2014-10-05 02:00:02.02 2014-10-05 02:57:00.00    56.959500
46          5004.24 Striped Bass     RGD1 2014-10-05 03:08:26.26 2014-10-05 03:58:33.33    50.117833
47          5004.24 Striped Bass     RGD1 2014-10-05 04:00:59.59 2014-10-05 04:58:36.36    57.629667
48          5004.24 Striped Bass     RGD1 2014-10-05 05:03:22.22 2014-10-05 05:54:09.09    50.781167
49          5004.24 Striped Bass     RGD1 2014-10-05 06:00:40.40 2014-10-05 06:57:54.54    57.218833
50          5004.24 Striped Bass     RGD1 2014-10-05 07:11:13.13 2014-10-05 07:59:52.52    48.639667
51          5004.24 Striped Bass     RGD1 2014-10-05 08:00:11.11 2014-10-05 08:59:55.55    59.723833
52          5004.24 Striped Bass     RGD1 2014-10-05 09:00:43.43 2014-10-05 09:59:52.52    59.134667
53          5004.24 Striped Bass     RGD1 2014-10-05 10:00:22.22 2014-10-05 10:56:15.15    55.882167
54          5004.24 Striped Bass     RGD1 2014-10-05 11:02:31.31 2014-10-05 11:53:29.29    50.966333
55          5004.24 Striped Bass     RGD1 2014-10-05 13:54:22.22 2014-10-05 13:59:12.12     4.831667
56          5004.24 Striped Bass     RGD1 2014-10-05 22:00:40.40 2014-10-05 22:59:59.59    59.303000
57          5004.24 Striped Bass     RGD1 2014-10-05 23:00:03.03 2014-10-05 23:59:37.37    59.572333
58          5004.24 Striped Bass     RGD1 2014-10-06 00:00:36.36 2014-10-06 00:59:19.19    58.713833
59          5004.24 Striped Bass     RGD1 2014-10-06 01:00:00.00 2014-10-06 01:59:54.54    59.892167
60          5004.24 Striped Bass     RGD1 2014-10-06 02:00:38.38 2014-10-06 02:59:46.46    59.134667
61          5004.24 Striped Bass     RGD1 2014-10-06 03:03:03.03 2014-10-06 03:59:16.16    56.218833
62          5004.24 Striped Bass     RGD1 2014-10-06 04:00:11.11 2014-10-06 04:36:28.28    36.286167
63          5004.24 Striped Bass     RGD1 2014-10-06 05:16:11.11 2014-10-06 05:58:33.33    42.370333
64          5004.24 Striped Bass     RGD1 2014-10-06 12:00:40.40 2014-10-06 12:58:17.17    57.612833
65          5004.24 Striped Bass     RGD1 2014-10-06 18:02:17.17 2014-10-06 18:12:23.23    10.101000
66          5004.24 Striped Bass     RGD1 2014-10-06 19:44:35.35 2014-10-06 19:58:00.00    13.410833
67          5004.24 Striped Bass     RGD1 2014-10-06 20:02:00.00 2014-10-06 20:59:59.59    57.976333
68          5004.24 Striped Bass     RGD1 2014-10-06 21:00:03.03 2014-10-06 21:43:15.15    43.202000
69          5004.24 Striped Bass     RGD1 2014-10-06 22:21:58.58 2014-10-06 22:59:49.49    37.865333
70          5004.24 Striped Bass     RGD1 2014-10-06 23:00:35.35 2014-10-06 23:57:08.08    56.545500
71          5004.24 Striped Bass     RGD1 2014-10-07 00:01:01.01 2014-10-07 00:59:19.19    58.303000
72          5004.24 Striped Bass     RGD1 2014-10-07 01:01:32.32 2014-10-07 01:53:55.55    52.370333
73          5004.24 Striped Bass     RGD1 2014-10-07 02:14:45.45 2014-10-07 02:59:33.33    44.798000
74          5004.24 Striped Bass     RGD1 2014-10-07 03:15:54.54 2014-10-07 03:59:57.57    44.050500
75          5004.24 Striped Bass     RGD1 2014-10-07 04:00:05.05 2014-10-07 04:31:31.31    31.437667
76          5004.24 Striped Bass     RGD1 2014-10-07 05:33:56.56 2014-10-07 05:59:16.16    25.343500
77          5004.24 Striped Bass     RGD1 2014-10-07 06:32:00.00 2014-10-07 06:43:00.00    11.006833
78          5004.24 Striped Bass     RGD1 2014-10-07 07:02:25.25 2014-10-07 07:29:22.22    26.949500
79          5004.24 Striped Bass     RGD1 2014-10-07 08:00:43.43 2014-10-07 08:51:26.26    50.713833
80          5004.24 Striped Bass     RGD1 2014-10-07 09:04:32.32 2014-10-07 09:46:55.55    42.370333
81          5004.24 Striped Bass     RGD1 2014-10-07 10:03:05.05 2014-10-07 10:32:47.47    29.707000
82          5004.24 Striped Bass     RGD1 2014-10-07 11:52:15.15 2014-10-07 11:59:56.56     7.673333
83          5004.24 Striped Bass     RGD1 2014-10-07 12:00:02.02 2014-10-07 12:42:19.19    42.286167
84          5004.24 Striped Bass     RGD1 2014-10-07 13:03:10.10 2014-10-07 13:59:59.59    56.808000
85          5004.24 Striped Bass     RGD1 2014-10-07 20:47:56.56 2014-10-07 20:50:00.00     2.074167
86          5004.24 Striped Bass     RGD1 2014-10-07 21:27:12.12 2014-10-07 21:59:08.08    31.932667
87          5004.24 Striped Bass     RGD1 2014-10-07 22:02:49.49 2014-10-07 22:59:16.16    56.444500
88          5004.24 Striped Bass     RGD1 2014-10-07 23:00:27.27 2014-10-07 23:58:00.00    57.545500
89          5004.24 Striped Bass     RGD1 2014-10-08 00:01:07.07 2014-10-08 01:00:00.00    58.882167
90          5004.24 Striped Bass     RGD1 2014-10-08 01:00:09.09 2014-10-08 01:59:57.57    59.791167
91          5004.24 Striped Bass     RGD1 2014-10-08 02:00:05.05 2014-10-08 02:59:03.03    58.966333
92          5004.24 Striped Bass     RGD1 2014-10-08 03:04:10.10 2014-10-08 03:55:12.12    51.033667
93          5004.24 Striped Bass     RGD1 2014-10-08 05:26:26.26 2014-10-08 05:59:28.28    33.033667
94          5004.24 Striped Bass     RGD1 2014-10-08 06:02:49.49 2014-10-08 06:59:58.58    57.134667
95          5004.24 Striped Bass     RGD1 2014-10-08 07:00:02.02 2014-10-08 07:59:40.40    59.639667
96          5004.24 Striped Bass     RGD1 2014-10-08 08:00:07.07 2014-10-08 08:59:50.50    59.723833
97          5004.24 Striped Bass     RGD1 2014-10-08 09:01:13.13 2014-10-08 09:51:35.35    50.370333
98          5004.24 Striped Bass     RGD1 2014-10-08 10:04:53.53 2014-10-08 10:59:09.09    54.276167
99          5004.24 Striped Bass     RGD1 2014-10-08 11:06:27.27 2014-10-08 11:31:23.23    24.932667
100         5004.24 Striped Bass     RGD1 2014-10-08 20:03:30.30 2014-10-08 20:59:59.59    56.471333
  1. 唯一的个体标识符"AcousticTagCode"
  2. 个体的物种"Species"
  3. 观察点"SiteCode"
  4. 事件的开始时间"StartDetection"
  5. 事件的结束时间"EndDetection"
  6. 事件的持续时间(分钟)"Duration_min"

      AcousticTagCode      Species SiteCode         StartDetection           EndDetection Duration_min
    1         5004.24 Striped Bass     RGD1 2014-10-01 23:01:12.12 2014-10-01 23:59:41.41     58.48817
    2         5004.24 Striped Bass     RGD1 2014-10-02 00:00:06.06 2014-10-02 01:00:00.00     59.89217
    3         5004.24 Striped Bass     RGD1 2014-10-02 01:00:01.01 2014-10-02 01:20:12.12     20.18517
    4         5004.24 Striped Bass     RGD1 2014-10-02 04:14:15.15 2014-10-02 04:32:16.16     18.01683
    5         5004.24 Striped Bass     RGD1 2014-10-02 22:00:06.06 2014-10-02 22:59:54.54     59.79117
    6         5004.24 Striped Bass     RGD1 2014-10-02 23:00:10.10 2014-10-02 23:59:55.55     59.74067
    

在最近的一篇论文(recent paper)中,作者使用生存分析来确定个体在某一站点的连续停留时间 (CRT):他们定义了连续两次检测之间的最佳时间间隔,即最大消隐期 (MBP),在该间隔内个体很可能仍停留在该站点,只是不在检测范围内。

基本大纲是这样的:

定义 1 mbp 的初始时间间隔

mbp <- 7 # initial maximum blanking period (1 mbp), in seconds

创建一个整数列表以乘以 mbp 以测试不同时间间隔的生存分析

# Integer multipliers applied to mbp to obtain the time intervals to test.
n <- c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512)

这就是我卡住的地方。对于 n 的每个值,我需要确定是否

n*mbp > StartDetection of Event[i+1] - EndDetection of Event[i]  

如果上述条件为真,我就把各事件的持续时间累加起来,直到遇到上述条件为假的事件,或者 TagCode 或 SiteCode 发生变化为止。

例如,当 n = 1、mbp = 7 时,上面第 2 行的 EndDetection 时间是 01:00:00.00,下一个事件(第 3 行)的 StartDetection 时间是 01:00:01.01。两者相差 1.01 秒,小于 n*mbp,因此我将第 2 行和第 3 行的持续时间相加,作为新变量 CRT。问题在于有 2 个以上的连续事件都满足该条件的情况:

   AcousticTagCode      Species SiteCode         StartDetection           EndDetection Duration_min
38         5004.24 Striped Bass     RGD1 2014-10-04 19:04:09.09 2014-10-04 19:59:58.58     55.80800
39         5004.24 Striped Bass     RGD1 2014-10-04 20:00:02.02 2014-10-04 20:59:56.56     59.89217
40         5004.24 Striped Bass     RGD1 2014-10-04 21:00:00.00 2014-10-04 21:59:59.59     59.97633
41         5004.24 Striped Bass     RGD1 2014-10-04 22:00:03.03 2014-10-04 22:59:57.57     59.89217

这里,第 38-41 行中相邻事件之间的间隔都在 7 秒以内,AcousticTagCode 相同,SiteCode 也一直是 RGD1,因此需要将这些事件的持续时间全部相加,得到 CRT。

有没有一种方法可以让我在此处获取初始数据集并为 n 的每次迭代创建一个新的数据框 (df_n),其中包含以下列:

AcousticTagCode Species SiteCode CRT

使用上面的两个示例,结果看起来像下面的第 2 行和第 i 行:

head(df_1)
  AcousticTagCode      Species SiteCode    CRT
1         5004.24 Striped Bass     RGD1  58.49
2         5004.24 Striped Bass     RGD1  80.08
...
i         5004.24 Striped Bass     RGD1 235.57

此外,只要不满足条件,事件就会被视为独立事件,并且 CRT = 持续时间,如上面 table 第 1 行所示。

我的 R 技能相当初级,我确信有一种简单的方法可以做到这一点,但我不知道,而且我的搜索技能对我没有任何帮助

示例数据:found here:

听起来您想做两件事:1) 对于声学标签、物种、站点代码以及开始和结束时间的每个唯一组合,找到所有其他具有相同声学标签-物种-站点的记录,且其开始时间 <= (mbp + 原组合的结束时间),然后将持续时间相加得到 CRT;2) 对 mbp x n 的每个值重复此过程。

由于您没有提供任何示例数据,我创建了一些尽量与您的数据相似的示例数据:共 100 条记录,其中声学标签 a 可以是 1 或 2,species 可以是 'bass' 或 'trout',site 可以是 'p' 或 'q',另有一个随机值 value,以及开始时间和结束时间——每条记录的结束时间是其开始时间之后随机的 5–15 秒。

# Reproducible toy data: 100 events, each with a tag id (a), a species, a site,
# a random numeric value, and a unique start time. The sampling calls are kept
# in their original order so that the seed produces the same data as before.
set.seed(123)
df <- data.frame(
  a = sample(1:2, 100, TRUE),
  species = sample(c("bass", "trout"), 100, TRUE),
  site = sample(c("p", "q"), 100, TRUE),
  value = round(runif(100), 2),
  # 100 consecutive one-second time stamps, shuffled without replacement.
  start = sample(
    seq(c(ISOdate(2000, 1, 1, 0, 0, 0)), by = "sec", length.out = 100),
    100,
    FALSE
  ),
  stringsAsFactors = FALSE
)
# Every event ends a random whole number of seconds (5 to 15) after it starts.
df$end <- df$start + sample(c(5:15), length(df$start), TRUE)

要完成第 1 点,我认为您可以采用这种方法:使用 complete 为每个组合生成从其结束时间起 7 秒内(逐秒)的时间序列,然后将这些候选时间与您的原始数据进行连接(join)。例如,如果您有一行 1-bass-p,其结束时间为 00:00:21,那么可能的匹配项就是开始时间在 00:00:21 到 00:00:28 之间的其他 1-bass-p 行。

# Requires dplyr and tidyr to be attached, and `df` to exist with the columns
# a, species, site, value, start and end.

# Step 1: for every distinct event, enumerate each whole second from its end
# time up to 7 seconds later. `end.orig` remembers which event a candidate
# time stamp belongs to.
# `end` is deliberately NOT a grouping column: tidyr (>= 1.2.0) refuses to
# complete() a column that is also being grouped on. After distinct(), every
# group holds exactly one row, so seq() receives scalar bounds.
df1 <- df %>%
  arrange(a, species, site, end) %>%
  distinct(a, species, site, start, end) %>%
  mutate(end.orig = end) %>%
  group_by(a, species, site, start, end.orig) %>%
  complete(end = seq(from = end, to = end + 7, by = "sec")) %>%
  ungroup()

# Step 2: recover each event's own value by joining the expanded rows back to
# df on their start and end times.
df.orig <- df1 %>%
  left_join(df, by = c("a", "species", "site", "start", "end")) %>%
  filter(!is.na(value))

# Step 3: treat every candidate time stamp as a possible start time and pick up
# the events of the same a/species/site that begin exactly at that second.
df.match <- df1 %>%
  select(-start) %>%
  rename(start = end) %>%
  left_join(df, by = c("a", "species", "site", "start")) %>%
  filter(!is.na(value))

# Step 4: per originating event, add its own value to the values of the matched
# events (crt) and count the rows that contributed (n). "drop_last" spells out
# summarise()'s default regrouping, so the result stays grouped by
# a, species, site.
bind_rows(
  df.orig %>% select(a, species, site, value, end.orig),
  df.match %>% select(a, species, site, value, end.orig)
) %>%
  group_by(a, species, site, end.orig) %>%
  summarise(crt = sum(value), n = n(), .groups = "drop_last")

# A tibble: 97 x 6
# Groups:   a, species, site [8]
       a species site  end.orig              crt     n
   <int> <chr>   <chr> <dttm>              <dbl> <int>
 1     1 bass    p     2000-01-01 00:00:21  1.58     3
 2     1 bass    p     2000-01-01 00:00:26  1.38     4
 3     1 bass    p     2000-01-01 00:00:36  2.27     4
 4     1 bass    p     2000-01-01 00:00:42  1.69     3
 5     1 bass    p     2000-01-01 00:00:46  1.23     2
 6     1 bass    p     2000-01-01 00:00:55  0.84     1
 7     1 bass    p     2000-01-01 00:01:02  1.32     2
 8     1 bass    p     2000-01-01 00:01:18  0.74     2
 9     1 bass    p     2000-01-01 00:01:29  0.54     2
10     1 bass    p     2000-01-01 00:01:42  0.42     1
# ... with 87 more rows

要完成第 2 点,您可以使用 lapply 对 mbp x n 的每个值重复此过程。这将返回一个包含 10 个元素的列表(即 n x mbp 向量的长度)。

# Base blanking period in seconds, and the ten window lengths derived from it
# (7, 14, 28, ..., 3584 seconds).
mbp <- 7
n <- c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512) * mbp

# Repeat the matching once per window length and collect the ten summaries in a
# list, in the same order as `n`.
# Requires dplyr and tidyr to be attached, and `df` to exist with the columns
# a, species, site, value, start and end.
# Each distinct event is expanded into window_sec + 1 rows, so the intermediate
# table grows with the window length.
f <- lapply(n, function(window_sec) {
  # Candidate time stamps: every whole second from an event's end time up to
  # window_sec seconds later. `end` is kept out of group_by() because
  # tidyr (>= 1.2.0) refuses to complete() a grouping column; each group holds
  # exactly one row after distinct(), so seq() receives scalar bounds.
  windows <- df %>%
    arrange(a, species, site, end) %>%
    distinct(a, species, site, start, end) %>%
    mutate(end.orig = end) %>%
    group_by(a, species, site, start, end.orig) %>%
    complete(end = seq(from = end, to = end + window_sec, by = "sec")) %>%
    ungroup()

  # Each event's own value, recovered via its start and end times.
  own <- windows %>%
    left_join(df, by = c("a", "species", "site", "start", "end")) %>%
    filter(!is.na(value))

  # Events of the same a/species/site that start exactly on a candidate second.
  followers <- windows %>%
    select(-start) %>%
    rename(start = end) %>%
    left_join(df, by = c("a", "species", "site", "start")) %>%
    filter(!is.na(value))

  # Sum per originating event; "drop_last" spells out summarise()'s default
  # regrouping, so the result stays grouped by a, species, site.
  bind_rows(
    own %>% select(a, species, site, value, end.orig),
    followers %>% select(a, species, site, value, end.orig)
  ) %>%
    group_by(a, species, site, end.orig) %>%
    summarise(crt = sum(value), n = n(), .groups = "drop_last")
})