比较 R 中多个时间序列的间隔
Comparing intervals across multiple time series in R
我有两个并发时间序列 A
和 B
,都包含由开始和结束时间定义的事件 - 这是一个示例:
A.df <- structure(list(A.eventid = 1:53,
A.start = structure(c(1563219814.52, 1563219852.37, 1563220313.16, 1563220472.66, 1563220704.35, 1563220879.51, 1563221108.24, 1563221158.33, 1563221387.43, 1563221400.7, 1563221602.34, 1563221828.33, 1563222165.52, 1563222314.2, 1563222557.28, 1563222669.44, 1563222905.52, 1563223091.62, 1563223237.19, 1563223273.64, 1563223580.14, 1563223908.66, 1563224093.27, 1563224497.41, 1563224554.64, 1563224705.57, 1563225011.55, 1563225192.59, 1563225305.14, 1563225414.38, 1563225432.21, 1563225898.61, 1563226034.51, 1563226110.18, 1563226206.49, 1563226528.13, 1563226570.18, 1563226788.53, 1563227026.21, 1563227502.2, 1563227709.3, 1563227832.51, 1563228127.44, 1563228188.4, 1563228293.59, 1563228558.39, 1563228680.32, 1563228819.44, 1563229208.51, 1563229282.14, 1563229528.52, 1563229959.21, 1563230268.65), class = c("POSIXct", "POSIXt")),
A.end = structure(c(1563219846.43, 1563220304.39, 1563220470.68, 1563220702.37, 1563220877.5, 1563221102.18, 1563221151.47, 1563221379.63, 1563221389.22, 1563221600.32, 1563221819.27, 1563222157.29, 1563222312.23, 1563222555.25, 1563222667.42, 1563222894.56, 1563223079.44, 1563223230.39, 1563223273.24, 1563223578.14, 1563223900.48, 1563224089.24, 1563224493.45, 1563224550.37, 1563224699.47, 1563225005.13, 1563225188.17, 1563225293.21, 1563225412.17, 1563225417.46, 1563225894.44, 1563226025.2, 1563226108.13, 1563226204.37, 1563226517.59, 1563226562.41, 1563226780.59, 1563227022.28, 1563227493.57, 1563227705.52, 1563227830.38, 1563228125.49, 1563228184.21, 1563228286.39, 1563228546.47, 1563228677.67, 1563228816.5, 1563229198.68, 1563229273.54, 1563229526.53, 1563229952.57, 1563230257.16, 1563230742.25), class = c("POSIXct", "POSIXt"))),
row.names = 1:53, class = "data.frame")
B.df <- structure(list(B.eventid = 1:52,
B.start = structure(c(1563221811.888, 1563222153.835, 1563222156.013, 1563222220.14, 1563222289.692, 1563222305.607, 1563222611.565, 1563222631.139, 1563222636.867, 1563222763.565, 1563222774.301, 1563222848.507, 1563222849.957, 1563222853.513, 1563223225.656, 1563223302.539, 1563223326.153, 1563223328.934, 1563223590.144, 1563223592.904, 1563224035.038, 1563224692.704, 1563226451.642, 1563226454.731, 1563226819.701, 1563226824.685, 1563227278.677, 1563227770.247, 1563227773.907, 1563227800.529, 1563227804.663, 1563227809.749, 1563227813.237, 1563227819.043, 1563227829.781, 1563227973.727, 1563229396.472, 1563229454.515, 1563229473.079, 1563229488.669, 1563229521.413, 1563229542.954, 1563229553.595, 1563229565.988, 1563229569.095, 1563229618.857, 1563229791.585, 1563229936.355, 1563230339.141, 1563230734.677, 1563231667.173, 1563231978.567), class = c("POSIXct", "POSIXt")),
B.end = structure(c(1563221815.058, 1563222154.295, 1563222158.633, 1563222222.07, 1563222289.872, 1563222308.617, 1563222614.265, 1563222633.509, 1563222640.367, 1563222769.045, 1563222774.801, 1563222848.677, 1563222850.237, 1563222856.103, 1563223226.166, 1563223305.339, 1563223328.763, 1563223333.234, 1563223591.454, 1563223593.084, 1563224043.618, 1563224695.234, 1563226454.622, 1563226456.771, 1563226822.551, 1563226827.225, 1563227282.067, 1563227771.787, 1563227774.477, 1563227802.199, 1563227806.653, 1563227811.569, 1563227817.897, 1563227823.643, 1563227830.351, 1563227978.177, 1563229401.282, 1563229457.905, 1563229478.359, 1563229492.439, 1563229527.723, 1563229545.694, 1563229558.975, 1563229568.658, 1563229571.255, 1563229621.117, 1563229792.055, 1563229952.055, 1563230344.351, 1563230739.647, 1563231672.983, 1563231979.987), class = c("POSIXct", "POSIXt"))),
row.names = 1:52, class = "data.frame")
系列 A
中的事件较长,而 B
中的事件较短。
我画了示意图来帮助解释:
对于每个 A
事件,其中 ≥ 4 B
事件发生,我想比较(也显示在原理图上):
X = the mean interval between B events occurring during the A event
与
Y = the interval between the last B event occuring during the A event, and the first B event occurring after the A event
我的问题是计算 X
和 Y
。
为了计算 X
,我尝试使用 foverlaps
根据发生的 A
事件对 B
事件进行分组。但是,这不包括 B
个发生在 A 事件间隔内的事件。
此外,我尝试使用 mutate
和 lag
计算分组 B
事件之间的平均间隔失败,因为我无法将 lag
限制为仅工作在组内(即它也计算组之间的间隔)。
最后,我不确定如何有效地识别 Y
间隔的 start/end 来计算其持续时间。
我以为我的 R/coding 正在进步,但这让我有点犹豫 - 非常感谢任何帮助!
我试图想出一个可能的解决方案,减去平均计算的部分,这应该是显而易见的。首先我重命名了列名,这样更容易加入数据集:
A.df = A.df %>%
rename_all(funs(str_replace(., "A.", ""))) %>%
mutate(type="A")
B.df = B.df %>%
rename_all(funs(str_replace(., "B.", ""))) %>%
mutate(type="B")
那么整体数据,按时间排序,是:
data = bind_rows(A.df, B.df) %>%
arrange(start)
现在我添加一列,显示 A
事件最后一次开始的时间戳。向前填充此值将为每个事件显示最后一个 A
事件的时间。
data = data %>%
mutate(last.A.start=ifelse(type=='A', start, NA)) %>%
tidyr::fill(last.A.start)
最后,A
事件可以删除了。只要last.A.start
相同,B
事件就属于同一个A
事件。根据这些信息可以计算出x
和y
。
data = data %>%
filter(type == "B") %>%
mutate(
duration=end-start, # Not needed.
delta=start - lag(end),
sameA=(last.A.start == lag(last.A.start)),
x=ifelse(sameA, delta, NA),
y=ifelse(sameA, NA, delta)
)
这有帮助吗?
最佳,M
假设你的 B 事件是按时间顺序排列的,不要相互重叠并且最多只能落在 1 A.event...
解释和中间输出在下面的代码中注释。
我无法验证输出,因为您在问题中没有提供 desired/expected 输出。结果乍一看对我来说似乎是合理的..
library(data.table)
setDT(A.df); setDT(B.df)
#get time to next B
B.df[, time.to.next.B := shift(B.start, type = "lead") - B.end ][]
#get A-event that the B-events falls into
B.df[ A.df,
A.eventid := i.A.eventid,
on = .(B.start >= A.start, B.end <= A.end )][]
# B.eventid B.start B.end time.to.next.B A.eventid
# 1: 1 2019-07-15 22:16:51 2019-07-15 22:16:55 338.777 secs 11
# 2: 2 2019-07-15 22:22:33 2019-07-15 22:22:34 1.718 secs 12
# 3: 3 2019-07-15 22:22:36 2019-07-15 22:22:38 61.507 secs NA
# 4: 4 2019-07-15 22:23:40 2019-07-15 22:23:42 67.622 secs 13
# 5: 5 2019-07-15 22:24:49 2019-07-15 22:24:49 15.735 secs 13
# 6: 6 2019-07-15 22:25:05 2019-07-15 22:25:08 302.948 secs 13
# ...
#summarise by A.eventid, get number of B-events, and B.eventid of last B-event
#only get A-eventis's with 4 or more B-events
ans <- B.df[ !is.na( A.eventid),
.( B.events = .N,
last.B.eventid = max( B.eventid ),
next.B.eventid = max( B.eventid ) + 1,
mean.B.interval.within.A = mean( time.to.next.B[ B.eventid != max( B.eventid ) ] ) ),
by = .(A.eventid) ][ B.events >= 4, ]
# A.eventid B.events last.B.eventid next.B.eventid mean.B.interval.within.A
# 1: 16 5 14 15 20.879500 secs
# 2: 41 8 35 36 6.097714 secs
# 3: 50 4 40 41 26.239000 secs
# 4: 51 7 48 49 62.953500 secs
#now find the needed intervals using an update joins
ans[ B.df, start_time := i.B.end, on = .(last.B.eventid = B.eventid)]
ans[ B.df, end_time := i.B.start, on = .(next.B.eventid = B.eventid)]
# A.eventid B.events last.B.eventid next.B.eventid mean.B.interval.within.A start_time end_time
# 1: 16 5 14 15 20.879500 secs 2019-07-15 22:34:16 2019-07-15 22:40:25
# 2: 41 8 35 36 6.097714 secs 2019-07-15 23:57:10 2019-07-15 23:59:33
# 3: 50 4 40 41 26.239000 secs 2019-07-16 00:24:52 2019-07-16 00:25:21
# 4: 51 7 48 49 62.953500 secs 2019-07-16 00:32:32 2019-07-16 00:38:59
X <- ans$mean.B.interval.within.A
# Time differences in secs
# [1] 20.879500 6.097714 26.239000 62.953500
Y <- ans$end_time - ans$start_time
# Time differences in secs
# [1] 369.553 143.376 28.974 387.086
我有两个并发时间序列 A
和 B
,都包含由开始和结束时间定义的事件 - 这是一个示例:
A.df <- structure(list(A.eventid = 1:53,
A.start = structure(c(1563219814.52, 1563219852.37, 1563220313.16, 1563220472.66, 1563220704.35, 1563220879.51, 1563221108.24, 1563221158.33, 1563221387.43, 1563221400.7, 1563221602.34, 1563221828.33, 1563222165.52, 1563222314.2, 1563222557.28, 1563222669.44, 1563222905.52, 1563223091.62, 1563223237.19, 1563223273.64, 1563223580.14, 1563223908.66, 1563224093.27, 1563224497.41, 1563224554.64, 1563224705.57, 1563225011.55, 1563225192.59, 1563225305.14, 1563225414.38, 1563225432.21, 1563225898.61, 1563226034.51, 1563226110.18, 1563226206.49, 1563226528.13, 1563226570.18, 1563226788.53, 1563227026.21, 1563227502.2, 1563227709.3, 1563227832.51, 1563228127.44, 1563228188.4, 1563228293.59, 1563228558.39, 1563228680.32, 1563228819.44, 1563229208.51, 1563229282.14, 1563229528.52, 1563229959.21, 1563230268.65), class = c("POSIXct", "POSIXt")),
A.end = structure(c(1563219846.43, 1563220304.39, 1563220470.68, 1563220702.37, 1563220877.5, 1563221102.18, 1563221151.47, 1563221379.63, 1563221389.22, 1563221600.32, 1563221819.27, 1563222157.29, 1563222312.23, 1563222555.25, 1563222667.42, 1563222894.56, 1563223079.44, 1563223230.39, 1563223273.24, 1563223578.14, 1563223900.48, 1563224089.24, 1563224493.45, 1563224550.37, 1563224699.47, 1563225005.13, 1563225188.17, 1563225293.21, 1563225412.17, 1563225417.46, 1563225894.44, 1563226025.2, 1563226108.13, 1563226204.37, 1563226517.59, 1563226562.41, 1563226780.59, 1563227022.28, 1563227493.57, 1563227705.52, 1563227830.38, 1563228125.49, 1563228184.21, 1563228286.39, 1563228546.47, 1563228677.67, 1563228816.5, 1563229198.68, 1563229273.54, 1563229526.53, 1563229952.57, 1563230257.16, 1563230742.25), class = c("POSIXct", "POSIXt"))),
row.names = 1:53, class = "data.frame")
B.df <- structure(list(B.eventid = 1:52,
B.start = structure(c(1563221811.888, 1563222153.835, 1563222156.013, 1563222220.14, 1563222289.692, 1563222305.607, 1563222611.565, 1563222631.139, 1563222636.867, 1563222763.565, 1563222774.301, 1563222848.507, 1563222849.957, 1563222853.513, 1563223225.656, 1563223302.539, 1563223326.153, 1563223328.934, 1563223590.144, 1563223592.904, 1563224035.038, 1563224692.704, 1563226451.642, 1563226454.731, 1563226819.701, 1563226824.685, 1563227278.677, 1563227770.247, 1563227773.907, 1563227800.529, 1563227804.663, 1563227809.749, 1563227813.237, 1563227819.043, 1563227829.781, 1563227973.727, 1563229396.472, 1563229454.515, 1563229473.079, 1563229488.669, 1563229521.413, 1563229542.954, 1563229553.595, 1563229565.988, 1563229569.095, 1563229618.857, 1563229791.585, 1563229936.355, 1563230339.141, 1563230734.677, 1563231667.173, 1563231978.567), class = c("POSIXct", "POSIXt")),
B.end = structure(c(1563221815.058, 1563222154.295, 1563222158.633, 1563222222.07, 1563222289.872, 1563222308.617, 1563222614.265, 1563222633.509, 1563222640.367, 1563222769.045, 1563222774.801, 1563222848.677, 1563222850.237, 1563222856.103, 1563223226.166, 1563223305.339, 1563223328.763, 1563223333.234, 1563223591.454, 1563223593.084, 1563224043.618, 1563224695.234, 1563226454.622, 1563226456.771, 1563226822.551, 1563226827.225, 1563227282.067, 1563227771.787, 1563227774.477, 1563227802.199, 1563227806.653, 1563227811.569, 1563227817.897, 1563227823.643, 1563227830.351, 1563227978.177, 1563229401.282, 1563229457.905, 1563229478.359, 1563229492.439, 1563229527.723, 1563229545.694, 1563229558.975, 1563229568.658, 1563229571.255, 1563229621.117, 1563229792.055, 1563229952.055, 1563230344.351, 1563230739.647, 1563231672.983, 1563231979.987), class = c("POSIXct", "POSIXt"))),
row.names = 1:52, class = "data.frame")
系列 A
中的事件较长,而 B
中的事件较短。
我画了示意图来帮助解释:
对于每个 A
事件,其中 ≥ 4 B
事件发生,我想比较(也显示在原理图上):
X = the mean interval between B events occurring during the A event
与
Y = the interval between the last B event occuring during the A event, and the first B event occurring after the A event
我的问题是计算 X
和 Y
。
为了计算 X
,我尝试使用 foverlaps
根据发生的 A
事件对 B
事件进行分组。但是,这不包括 B
个发生在 A 事件间隔内的事件。
此外,我尝试使用 mutate
和 lag
计算分组 B
事件之间的平均间隔失败,因为我无法将 lag
限制为仅工作在组内(即它也计算组之间的间隔)。
最后,我不确定如何有效地识别 Y
间隔的 start/end 来计算其持续时间。
我以为我的 R/coding 正在进步,但这让我有点犹豫 - 非常感谢任何帮助!
我试图想出一个可能的解决方案,减去平均计算的部分,这应该是显而易见的。首先我重命名了列名,这样更容易加入数据集:
A.df = A.df %>%
rename_all(funs(str_replace(., "A.", ""))) %>%
mutate(type="A")
B.df = B.df %>%
rename_all(funs(str_replace(., "B.", ""))) %>%
mutate(type="B")
那么整体数据,按时间排序,是:
data = bind_rows(A.df, B.df) %>%
arrange(start)
现在我添加一列,显示 A
事件最后一次开始的时间戳。向前填充此值将为每个事件显示最后一个 A
事件的时间。
data = data %>%
mutate(last.A.start=ifelse(type=='A', start, NA)) %>%
tidyr::fill(last.A.start)
最后,A
事件可以删除了。只要last.A.start
相同,B
事件就属于同一个A
事件。根据这些信息可以计算出x
和y
。
data = data %>%
filter(type == "B") %>%
mutate(
duration=end-start, # Not needed.
delta=start - lag(end),
sameA=(last.A.start == lag(last.A.start)),
x=ifelse(sameA, delta, NA),
y=ifelse(sameA, NA, delta)
)
这有帮助吗? 最佳,M
假设你的 B 事件是按时间顺序排列的,不要相互重叠并且最多只能落在 1 A.event...
解释和中间输出在下面的代码中注释。
我无法验证输出,因为您在问题中没有提供 desired/expected 输出。结果乍一看对我来说似乎是合理的..
library(data.table)
setDT(A.df); setDT(B.df)
#get time to next B
B.df[, time.to.next.B := shift(B.start, type = "lead") - B.end ][]
#get A-event that the B-events falls into
B.df[ A.df,
A.eventid := i.A.eventid,
on = .(B.start >= A.start, B.end <= A.end )][]
# B.eventid B.start B.end time.to.next.B A.eventid
# 1: 1 2019-07-15 22:16:51 2019-07-15 22:16:55 338.777 secs 11
# 2: 2 2019-07-15 22:22:33 2019-07-15 22:22:34 1.718 secs 12
# 3: 3 2019-07-15 22:22:36 2019-07-15 22:22:38 61.507 secs NA
# 4: 4 2019-07-15 22:23:40 2019-07-15 22:23:42 67.622 secs 13
# 5: 5 2019-07-15 22:24:49 2019-07-15 22:24:49 15.735 secs 13
# 6: 6 2019-07-15 22:25:05 2019-07-15 22:25:08 302.948 secs 13
# ...
#summarise by A.eventid, get number of B-events, and B.eventid of last B-event
#only get A-eventis's with 4 or more B-events
ans <- B.df[ !is.na( A.eventid),
.( B.events = .N,
last.B.eventid = max( B.eventid ),
next.B.eventid = max( B.eventid ) + 1,
mean.B.interval.within.A = mean( time.to.next.B[ B.eventid != max( B.eventid ) ] ) ),
by = .(A.eventid) ][ B.events >= 4, ]
# A.eventid B.events last.B.eventid next.B.eventid mean.B.interval.within.A
# 1: 16 5 14 15 20.879500 secs
# 2: 41 8 35 36 6.097714 secs
# 3: 50 4 40 41 26.239000 secs
# 4: 51 7 48 49 62.953500 secs
#now find the needed intervals using an update joins
ans[ B.df, start_time := i.B.end, on = .(last.B.eventid = B.eventid)]
ans[ B.df, end_time := i.B.start, on = .(next.B.eventid = B.eventid)]
# A.eventid B.events last.B.eventid next.B.eventid mean.B.interval.within.A start_time end_time
# 1: 16 5 14 15 20.879500 secs 2019-07-15 22:34:16 2019-07-15 22:40:25
# 2: 41 8 35 36 6.097714 secs 2019-07-15 23:57:10 2019-07-15 23:59:33
# 3: 50 4 40 41 26.239000 secs 2019-07-16 00:24:52 2019-07-16 00:25:21
# 4: 51 7 48 49 62.953500 secs 2019-07-16 00:32:32 2019-07-16 00:38:59
X <- ans$mean.B.interval.within.A
# Time differences in secs
# [1] 20.879500 6.097714 26.239000 62.953500
Y <- ans$end_time - ans$start_time
# Time differences in secs
# [1] 369.553 143.376 28.974 387.086