Select data.table 时间间隔内的日期行
Select rows by date within interval in data.table
我想从一个 data.table 中选出落在另一个 data.table 所指定时间区间内的观测记录——这些时间区间是两个平台同时进行观测的时间段。
第一个数据 table 看起来像这样。这是一堆动物目击事件。
# Animal sightings: when each one happened, its encounter id, and what was seen.
obs <- data.table(
  sighting = as.POSIXct(
    c(
      "2018-08-12 16:30:00",
      "2018-08-12 16:35:00",
      "2018-08-12 16:38:00",
      "2107-08-13 15:13:00",
      "2107-08-13 16:13:00",
      "2017-08-14 11:12:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  encounter = c("1", "1", "1", "2", "3", "4"),
  what = c("frog", "frog", "toad", "bird", "goat", "bird")
)
观察来自 2 个平台。
# Effort periods (on.effort -> off.effort) for the two observation platforms.
platformA <- data.table(
  station = "A",
  on.effort = as.POSIXct(
    c(
      "2018-08-12 16:00:00",
      "2018-08-12 17:35:00",
      "2017-08-14 11:00:13",
      "2018-08-15 17:35:00"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c(
      "2018-08-12 16:36:00",
      "2018-08-12 18:35:00",
      "2017-08-14 12:12:13",
      "2018-08-15 18:35:00"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)
# The 2nd and 3rd off.effort values were swapped in the original data, which
# made the third period end before it started; foverlaps() rejects intervals
# whose start is later than their end.
platformB <- data.table(
  station = "B",
  on.effort = as.POSIXct(
    c(
      "2018-08-12 16:15:00",
      "2018-08-12 17:40:00",
      "2018-08-13 17:40:00",
      "2017-08-14 11:05:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c(
      "2018-08-12 16:40:00",
      "2018-08-12 18:20:00",
      "2018-08-13 17:45:00",
      "2017-08-14 12:30:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)
我首先计算了每个平台的间隔,然后将间隔相交以找出同时进行观察的时间。
# Key both effort tables on their interval columns, as foverlaps() needs.
setkeyv(platformA, c("on.effort", "off.effort"))
setkeyv(platformB, c("on.effort", "off.effort"))
# Every pair of platform A / platform B effort periods that overlap at all.
common <- foverlaps(x = platformA, y = platformB, type = "any", nomatch = 0)
# Shared part of each overlapping pair (platform B columns are unprefixed,
# platform A columns carry the "i." prefix).
common$x <- intersect(
  with(common, interval(on.effort, off.effort)),
  with(common, interval(i.on.effort, i.off.effort))
)
我想以 table 结束,它是 "obs" 的子集,并且仅包含 "common$x" 中的间隔所涵盖的行。我曾希望使用 foverlaps 来查找相交间隔中的行,并为我的观察结果创建了 "point" 间隔
# Copy the sighting time into a second column so each observation can serve
# as a zero-length ("point") interval.
set(obs, j = "sighting2", value = obs[["sighting"]])
但是 foverlaps 想要每个间隔的 "start" 和 "end" 在单独的列中,这不是间隔存储在 common$x 中的方式。
我希望我的输出看起来像这样
sighting encounter what
2018-08-12 16:30:00 1 frog
2018-08-12 16:35:00 1 frog
2017-08-14 11:12:13 4 bird
如果有任何提示,我将不胜感激。也许我可以早点更有效率?
谢谢。
我相信这能得到您想要的结果。它没有利用 data.table 的函数,完全基于 base R 运行。我不确定这在您的数据上是否会带来性能问题,但也许它能提供一种思路,帮助您写出更具 data.table 风格的解决方案。
library(data.table)

# Set up the data
obs <- data.table(
  sighting = as.POSIXct(
    c(
      "2018-08-12 16:30:00",
      "2018-08-12 16:35:00",
      "2018-08-12 16:38:00",
      "2107-08-13 15:13:00",
      "2107-08-13 16:13:00",
      "2017-08-14 11:12:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  encounter = c("1", "1", "1", "2", "3", "4"),
  what = c("frog", "frog", "toad", "bird", "goat", "bird")
)
platformA <- data.table(
  station = "A",
  on.effort = as.POSIXct(
    c("2018-08-12 16:00:00", "2018-08-12 17:35:00", "2017-08-14 11:00:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c("2018-08-12 16:36:00", "2018-08-12 18:35:00", "2017-08-14 12:12:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)
platformB <- data.table(
  station = "B",
  on.effort = as.POSIXct(
    c("2018-08-12 16:15:00", "2018-08-12 17:40:00", "2017-08-14 11:05:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c("2018-08-12 16:40:00", "2018-08-12 18:20:00", "2017-08-14 12:30:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)

# The overlap below pairs the platforms row by row, so both tables must list
# the same number of effort periods; otherwise pmax()/pmin() would recycle.
stopifnot(nrow(platformA) == nrow(platformB))

# Start and end of each shared effort interval: the later of the two starts
# (pmax) and the earlier of the two ends (pmin).
starts <- pmax(platformA$on.effort, platformB$on.effort)
ends <- pmin(platformA$off.effort, platformB$off.effort)

# For each sighting in obs check if it falls in between any of the intervals.
# vapply() pins the result to one logical per sighting, so an empty obs gives
# logical(0) instead of the empty list sapply() would return.
seen <- vapply(
  obs$sighting,
  function(x) any(x >= starts & x <= ends),
  logical(1)
)

# Subset the data
obs[seen, ]
sighting encounter what
1: 2018-08-12 16:30:00 1 frog
2: 2018-08-12 16:35:00 1 frog
3: 2017-08-14 11:12:13 4 bird
此解决方案的关键在于 starts 和 ends 的赋值。由于我们要寻找两个平台观察时间的交集,所以每个区间的开始时间取两个平台中较晚的开始时间(即最大值),结束时间取两个平台中较早的结束时间(即最小值)。通过使用 pmin 和 pmax,我们可以分别获取两个时间向量逐元素的最小值和最大值。在 x >= starts & x <= ends 的比较中,单个时间 x 会与每一对 starts[i] 和 ends[i] 逐元素比较,从而判断它是否落在某个共同区间内。
我认为即使两个平台的观察时段不同,这种方法也应该有效。沿用上面的 obs、platformA 和 platformB 数据,先求出两个平台的共同区间,做法与您计算 common 时大致相同:
# Shared effort windows: intersect each platform A interval with the
# platform B interval in the same row.
common <- intersect(
  with(platformA, interval(on.effort, off.effort)),
  with(platformB, interval(on.effort, off.effort))
)
您应该可以使用 %within%
来检查是否有任何情况下目击落在共同区间内:
# Flag each sighting that falls inside at least one interval in `common`.
# vapply() pins the result to one logical per sighting, so an empty `obs`
# yields logical(0) rather than the empty list sapply() would return.
obs$both.seen <- vapply(
  obs$sighting,
  function(s) any(s %within% common),
  logical(1)
)
或
# Same flag, added by reference; vapply() keeps the new column strictly
# logical even when `obs` has no rows.
obs[, both.seen := vapply(
  sighting,
  function(x) any(x %within% common),
  logical(1)
)]
新 obs
:
> obs
sighting encounter what both.seen
1: 2018-08-12 16:30:00 1 frog TRUE
2: 2018-08-12 16:35:00 1 frog TRUE
3: 2018-08-12 16:38:00 1 toad FALSE
4: 2107-08-13 15:13:00 2 bird FALSE
5: 2107-08-13 16:13:00 3 goat FALSE
6: 2017-08-14 11:12:13 4 bird TRUE
子集以获得您想要的输出:
# Keep only the flagged sightings, then drop the helper column.
obs <- obs[both.seen == 1]
obs[, both.seen := NULL]
> obs
sighting encounter what
1: 2018-08-12 16:30:00 1 frog
2: 2018-08-12 16:35:00 1 frog
3: 2017-08-14 11:12:13 4 bird
我想从一个 data.table 中选出落在另一个 data.table 所指定时间区间内的观测记录——这些时间区间是两个平台同时进行观测的时间段。
第一个数据 table 看起来像这样。这是一堆动物目击事件。
# Animal sightings: when each one happened, its encounter id, and what was seen.
obs <- data.table(
  sighting = as.POSIXct(
    c(
      "2018-08-12 16:30:00",
      "2018-08-12 16:35:00",
      "2018-08-12 16:38:00",
      "2107-08-13 15:13:00",
      "2107-08-13 16:13:00",
      "2017-08-14 11:12:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  encounter = c("1", "1", "1", "2", "3", "4"),
  what = c("frog", "frog", "toad", "bird", "goat", "bird")
)
观察来自 2 个平台。
# Effort periods (on.effort -> off.effort) for the two observation platforms.
platformA <- data.table(
  station = "A",
  on.effort = as.POSIXct(
    c(
      "2018-08-12 16:00:00",
      "2018-08-12 17:35:00",
      "2017-08-14 11:00:13",
      "2018-08-15 17:35:00"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c(
      "2018-08-12 16:36:00",
      "2018-08-12 18:35:00",
      "2017-08-14 12:12:13",
      "2018-08-15 18:35:00"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)
# The 2nd and 3rd off.effort values were swapped in the original data, which
# made the third period end before it started; foverlaps() rejects intervals
# whose start is later than their end.
platformB <- data.table(
  station = "B",
  on.effort = as.POSIXct(
    c(
      "2018-08-12 16:15:00",
      "2018-08-12 17:40:00",
      "2018-08-13 17:40:00",
      "2017-08-14 11:05:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c(
      "2018-08-12 16:40:00",
      "2018-08-12 18:20:00",
      "2018-08-13 17:45:00",
      "2017-08-14 12:30:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)
我首先计算了每个平台的间隔,然后将间隔相交以找出同时进行观察的时间。
# Key both effort tables on their interval columns, as foverlaps() needs.
setkeyv(platformA, c("on.effort", "off.effort"))
setkeyv(platformB, c("on.effort", "off.effort"))
# Every pair of platform A / platform B effort periods that overlap at all.
common <- foverlaps(x = platformA, y = platformB, type = "any", nomatch = 0)
# Shared part of each overlapping pair (platform B columns are unprefixed,
# platform A columns carry the "i." prefix).
common$x <- intersect(
  with(common, interval(on.effort, off.effort)),
  with(common, interval(i.on.effort, i.off.effort))
)
我想以 table 结束,它是 "obs" 的子集,并且仅包含 "common$x" 中的间隔所涵盖的行。我曾希望使用 foverlaps 来查找相交间隔中的行,并为我的观察结果创建了 "point" 间隔
# Copy the sighting time into a second column so each observation can serve
# as a zero-length ("point") interval.
set(obs, j = "sighting2", value = obs[["sighting"]])
但是 foverlaps 想要每个间隔的 "start" 和 "end" 在单独的列中,这不是间隔存储在 common$x 中的方式。
我希望我的输出看起来像这样
sighting encounter what
2018-08-12 16:30:00 1 frog
2018-08-12 16:35:00 1 frog
2017-08-14 11:12:13 4 bird
如果有任何提示,我将不胜感激。也许我可以早点更有效率? 谢谢。
我相信这能得到您想要的结果。它没有利用 data.table 的函数,完全基于 base R 运行。我不确定这在您的数据上是否会带来性能问题,但也许它能提供一种思路,帮助您写出更具 data.table 风格的解决方案。
library(data.table)

# Set up the data
obs <- data.table(
  sighting = as.POSIXct(
    c(
      "2018-08-12 16:30:00",
      "2018-08-12 16:35:00",
      "2018-08-12 16:38:00",
      "2107-08-13 15:13:00",
      "2107-08-13 16:13:00",
      "2017-08-14 11:12:13"
    ),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  encounter = c("1", "1", "1", "2", "3", "4"),
  what = c("frog", "frog", "toad", "bird", "goat", "bird")
)
platformA <- data.table(
  station = "A",
  on.effort = as.POSIXct(
    c("2018-08-12 16:00:00", "2018-08-12 17:35:00", "2017-08-14 11:00:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c("2018-08-12 16:36:00", "2018-08-12 18:35:00", "2017-08-14 12:12:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)
platformB <- data.table(
  station = "B",
  on.effort = as.POSIXct(
    c("2018-08-12 16:15:00", "2018-08-12 17:40:00", "2017-08-14 11:05:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  ),
  off.effort = as.POSIXct(
    c("2018-08-12 16:40:00", "2018-08-12 18:20:00", "2017-08-14 12:30:13"),
    format = "%Y-%m-%d %H:%M:%OS",
    tz = "America/Halifax"
  )
)

# The overlap below pairs the platforms row by row, so both tables must list
# the same number of effort periods; otherwise pmax()/pmin() would recycle.
stopifnot(nrow(platformA) == nrow(platformB))

# Start and end of each shared effort interval: the later of the two starts
# (pmax) and the earlier of the two ends (pmin).
starts <- pmax(platformA$on.effort, platformB$on.effort)
ends <- pmin(platformA$off.effort, platformB$off.effort)

# For each sighting in obs check if it falls in between any of the intervals.
# vapply() pins the result to one logical per sighting, so an empty obs gives
# logical(0) instead of the empty list sapply() would return.
seen <- vapply(
  obs$sighting,
  function(x) any(x >= starts & x <= ends),
  logical(1)
)

# Subset the data
obs[seen, ]
sighting encounter what
1: 2018-08-12 16:30:00 1 frog
2: 2018-08-12 16:35:00 1 frog
3: 2017-08-14 11:12:13 4 bird
此解决方案的关键在于 starts 和 ends 的赋值。由于我们要寻找两个平台观察时间的交集,所以每个区间的开始时间取两个平台中较晚的开始时间(即最大值),结束时间取两个平台中较早的结束时间(即最小值)。通过使用 pmin 和 pmax,我们可以分别获取两个时间向量逐元素的最小值和最大值。在 x >= starts & x <= ends 的比较中,单个时间 x 会与每一对 starts[i] 和 ends[i] 逐元素比较,从而判断它是否落在某个共同区间内。
我认为即使两个平台的观察时段不同,这种方法也应该有效。沿用上面的 obs、platformA 和 platformB 数据,先求出两个平台的共同区间,做法与您计算 common 时大致相同:
# Shared effort windows: intersect each platform A interval with the
# platform B interval in the same row.
common <- intersect(
  with(platformA, interval(on.effort, off.effort)),
  with(platformB, interval(on.effort, off.effort))
)
您应该可以使用 %within%
来检查是否有任何情况下目击落在共同区间内:
# Flag each sighting that falls inside at least one interval in `common`.
# vapply() pins the result to one logical per sighting, so an empty `obs`
# yields logical(0) rather than the empty list sapply() would return.
obs$both.seen <- vapply(
  obs$sighting,
  function(s) any(s %within% common),
  logical(1)
)
或
# Same flag, added by reference; vapply() keeps the new column strictly
# logical even when `obs` has no rows.
obs[, both.seen := vapply(
  sighting,
  function(x) any(x %within% common),
  logical(1)
)]
新 obs
:
> obs
sighting encounter what both.seen
1: 2018-08-12 16:30:00 1 frog TRUE
2: 2018-08-12 16:35:00 1 frog TRUE
3: 2018-08-12 16:38:00 1 toad FALSE
4: 2107-08-13 15:13:00 2 bird FALSE
5: 2107-08-13 16:13:00 3 goat FALSE
6: 2017-08-14 11:12:13 4 bird TRUE
子集以获得您想要的输出:
# Keep only the flagged sightings, then drop the helper column.
obs <- obs[both.seen == 1]
obs[, both.seen := NULL]
> obs
sighting encounter what
1: 2018-08-12 16:30:00 1 frog
2: 2018-08-12 16:35:00 1 frog
3: 2017-08-14 11:12:13 4 bird