完成并归档多个时间序列,开始和结束之间的所有秒数在 R 中的一个 data.table 内
Completing and filling several timeseries with all seconds between start and end within one data.table in R
我有一个 data.table
看起来像这样:
sample <- structure(list(TimeStamp = structure(c(1629036673.5, 1629036676.35,
1629036760.29, 1629036760.95, 1629036769.3, 1629037223.1, 1629037237.81,
1629037241.6, 1629037243.34), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
Hen = c("CXU", "CXU", "CXU", "CXU", "CXU", "CCE", "CCE",
"CCE", "CCE"), Units = c("M2", "M2", "HM2", "HM2", "H2",
"HM2", "HM2", "M2", "M2"), dummy = c(1L, 1L, 2L, 2L, 3L,
103L, 103L, 104L, 104L), Timing = c("Start", "End", "Start",
"End", "Start", "Start", "End", "Start", "End")), row.names = c(NA,
-9L), class = c("data.table", "data.frame"))
> sample
TimeStamp Hen Units dummy Timing
1: 2021-08-15 14:11:13 CXU M2 1 Start
2: 2021-08-15 14:11:16 CXU M2 1 End
3: 2021-08-15 14:12:40 CXU HM2 2 Start
4: 2021-08-15 14:12:40 CXU HM2 2 End
5: 2021-08-15 14:12:49 CXU H2 3 Start
6: 2021-08-15 14:20:23 CCE HM2 103 Start
7: 2021-08-15 14:20:37 CCE HM2 103 End
8: 2021-08-15 14:20:41 CCE M2 104 Start
9: 2021-08-15 14:20:43 CCE M2 104 End
我需要的是按开始和结束指示的秒数完成每个时间序列(有时只有一个开始存在,这意味着条目既是开始又是结束)并填充一些变量。我能够使用 complete
和 fill
实现这一点(使用包 tidyr
和 dplyr
):
> sample %>%
+ group_by(dummy) %>%
+ complete(TimeStamp = seq(first(TimeStamp), max(TimeStamp), by = "sec")) %>%
+ fill(Hen, Units, dummy)
# A tibble: 26 x 5
# Groups: dummy [5]
dummy TimeStamp Hen Units Timing
<int> <dttm> <chr> <chr> <chr>
1 1 2021-08-15 14:11:13 CXU M2 Start
2 1 2021-08-15 14:11:14 CXU M2 NA
3 1 2021-08-15 14:11:15 CXU M2 NA
4 1 2021-08-15 14:11:16 CXU M2 End
5 2 2021-08-15 14:12:40 CXU HM2 Start
6 2 2021-08-15 14:12:40 CXU HM2 End
7 3 2021-08-15 14:12:49 CXU H2 Start
8 103 2021-08-15 14:20:23 CCE HM2 Start
9 103 2021-08-15 14:20:24 CCE HM2 NA
10 103 2021-08-15 14:20:25 CCE HM2 NA
# ... with 16 more rows
我的问题是我使用 data.table 是有原因的,我有大量的数据,而我找到的解决方案运行得太慢了。我的想法是通过首先创建一个单独的时间序列并将两者合并来使用更多 data.table 基本方法。然后可以使用 zoo
中的 na.locf
完成填充。但是,我无法以任何有效的方式创建时间序列。有人有什么想法吗?
试试这个方法...
此外,您在时间戳中使用小数,我建议在创建序列之前四舍五入。
library(data.table)
library(zoo)
# you need to round your decimal timestamps to seconds
# use ceiling() or floor() if you prefer
sample[, TimeStamp := as.POSIXct(round(TimeStamp))]
# Create a data.table with min-max sequences
DT <- sample[, .(TimeStamp = seq(min(TimeStamp), max(TimeStamp), by = 1)), by = .(dummy)]
# Perform update join
DT[sample,
`:=`(Hen = i.Hen, Units = i.Units, Timing = i.Timing),
on = .(dummy, TimeStamp)]
# Fill down NA's Hen and Units columns
# data.table's setnafill doens not (yet?) support character columns
# so we use zoo:na.locf()
DT[, c("Hen", "Units") := lapply(.SD, zoo::na.locf),
.SDcols = c("Hen", "Units")]
# dummy TimeStamp Hen Units Timing
# 1: 1 2021-08-15 14:11:14 CXU M2 Start
# 2: 1 2021-08-15 14:11:15 CXU M2 <NA>
# 3: 1 2021-08-15 14:11:16 CXU M2 End
# 4: 2 2021-08-15 14:12:40 CXU HM2 Start
# 5: 2 2021-08-15 14:12:41 CXU HM2 End
# 6: 3 2021-08-15 14:12:49 CXU H2 Start
# 7: 103 2021-08-15 14:20:23 CCE HM2 Start
# 8: 103 2021-08-15 14:20:24 CCE HM2 <NA>
# 9: 103 2021-08-15 14:20:25 CCE HM2 <NA>
#10: 103 2021-08-15 14:20:26 CCE HM2 <NA>
#11: 103 2021-08-15 14:20:27 CCE HM2 <NA>
#12: 103 2021-08-15 14:20:28 CCE HM2 <NA>
#13: 103 2021-08-15 14:20:29 CCE HM2 <NA>
#14: 103 2021-08-15 14:20:30 CCE HM2 <NA>
#15: 103 2021-08-15 14:20:31 CCE HM2 <NA>
#16: 103 2021-08-15 14:20:32 CCE HM2 <NA>
#17: 103 2021-08-15 14:20:33 CCE HM2 <NA>
#18: 103 2021-08-15 14:20:34 CCE HM2 <NA>
#19: 103 2021-08-15 14:20:35 CCE HM2 <NA>
#20: 103 2021-08-15 14:20:36 CCE HM2 <NA>
#21: 103 2021-08-15 14:20:37 CCE HM2 <NA>
#22: 103 2021-08-15 14:20:38 CCE HM2 End
#23: 104 2021-08-15 14:20:42 CCE M2 Start
#24: 104 2021-08-15 14:20:43 CCE M2 End
# dummy TimeStamp Hen Units Timing
我有一个 data.table
看起来像这样:
sample <- structure(list(TimeStamp = structure(c(1629036673.5, 1629036676.35,
1629036760.29, 1629036760.95, 1629036769.3, 1629037223.1, 1629037237.81,
1629037241.6, 1629037243.34), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
Hen = c("CXU", "CXU", "CXU", "CXU", "CXU", "CCE", "CCE",
"CCE", "CCE"), Units = c("M2", "M2", "HM2", "HM2", "H2",
"HM2", "HM2", "M2", "M2"), dummy = c(1L, 1L, 2L, 2L, 3L,
103L, 103L, 104L, 104L), Timing = c("Start", "End", "Start",
"End", "Start", "Start", "End", "Start", "End")), row.names = c(NA,
-9L), class = c("data.table", "data.frame"))
> sample
TimeStamp Hen Units dummy Timing
1: 2021-08-15 14:11:13 CXU M2 1 Start
2: 2021-08-15 14:11:16 CXU M2 1 End
3: 2021-08-15 14:12:40 CXU HM2 2 Start
4: 2021-08-15 14:12:40 CXU HM2 2 End
5: 2021-08-15 14:12:49 CXU H2 3 Start
6: 2021-08-15 14:20:23 CCE HM2 103 Start
7: 2021-08-15 14:20:37 CCE HM2 103 End
8: 2021-08-15 14:20:41 CCE M2 104 Start
9: 2021-08-15 14:20:43 CCE M2 104 End
我需要的是按开始和结束指示的秒数完成每个时间序列(有时只有一个开始存在,这意味着条目既是开始又是结束)并填充一些变量。我能够使用 complete
和 fill
实现这一点(使用包 tidyr
和 dplyr
):
> sample %>%
+ group_by(dummy) %>%
+ complete(TimeStamp = seq(first(TimeStamp), max(TimeStamp), by = "sec")) %>%
+ fill(Hen, Units, dummy)
# A tibble: 26 x 5
# Groups: dummy [5]
dummy TimeStamp Hen Units Timing
<int> <dttm> <chr> <chr> <chr>
1 1 2021-08-15 14:11:13 CXU M2 Start
2 1 2021-08-15 14:11:14 CXU M2 NA
3 1 2021-08-15 14:11:15 CXU M2 NA
4 1 2021-08-15 14:11:16 CXU M2 End
5 2 2021-08-15 14:12:40 CXU HM2 Start
6 2 2021-08-15 14:12:40 CXU HM2 End
7 3 2021-08-15 14:12:49 CXU H2 Start
8 103 2021-08-15 14:20:23 CCE HM2 Start
9 103 2021-08-15 14:20:24 CCE HM2 NA
10 103 2021-08-15 14:20:25 CCE HM2 NA
# ... with 16 more rows
我的问题是我使用 data.table 是有原因的,我有大量的数据,而我找到的解决方案运行得太慢了。我的想法是通过首先创建一个单独的时间序列并将两者合并来使用更多 data.table 基本方法。然后可以使用 zoo
中的 na.locf
完成填充。但是,我无法以任何有效的方式创建时间序列。有人有什么想法吗?
试试这个方法... 此外,您在时间戳中使用小数,我建议在创建序列之前四舍五入。
library(data.table)
library(zoo)
# you need to round your decimal timestamps to seconds
# use ceiling() or floor() if you prefer
sample[, TimeStamp := as.POSIXct(round(TimeStamp))]
# Create a data.table with min-max sequences
DT <- sample[, .(TimeStamp = seq(min(TimeStamp), max(TimeStamp), by = 1)), by = .(dummy)]
# Perform update join
DT[sample,
`:=`(Hen = i.Hen, Units = i.Units, Timing = i.Timing),
on = .(dummy, TimeStamp)]
# Fill down NA's Hen and Units columns
# data.table's setnafill doens not (yet?) support character columns
# so we use zoo:na.locf()
DT[, c("Hen", "Units") := lapply(.SD, zoo::na.locf),
.SDcols = c("Hen", "Units")]
# dummy TimeStamp Hen Units Timing
# 1: 1 2021-08-15 14:11:14 CXU M2 Start
# 2: 1 2021-08-15 14:11:15 CXU M2 <NA>
# 3: 1 2021-08-15 14:11:16 CXU M2 End
# 4: 2 2021-08-15 14:12:40 CXU HM2 Start
# 5: 2 2021-08-15 14:12:41 CXU HM2 End
# 6: 3 2021-08-15 14:12:49 CXU H2 Start
# 7: 103 2021-08-15 14:20:23 CCE HM2 Start
# 8: 103 2021-08-15 14:20:24 CCE HM2 <NA>
# 9: 103 2021-08-15 14:20:25 CCE HM2 <NA>
#10: 103 2021-08-15 14:20:26 CCE HM2 <NA>
#11: 103 2021-08-15 14:20:27 CCE HM2 <NA>
#12: 103 2021-08-15 14:20:28 CCE HM2 <NA>
#13: 103 2021-08-15 14:20:29 CCE HM2 <NA>
#14: 103 2021-08-15 14:20:30 CCE HM2 <NA>
#15: 103 2021-08-15 14:20:31 CCE HM2 <NA>
#16: 103 2021-08-15 14:20:32 CCE HM2 <NA>
#17: 103 2021-08-15 14:20:33 CCE HM2 <NA>
#18: 103 2021-08-15 14:20:34 CCE HM2 <NA>
#19: 103 2021-08-15 14:20:35 CCE HM2 <NA>
#20: 103 2021-08-15 14:20:36 CCE HM2 <NA>
#21: 103 2021-08-15 14:20:37 CCE HM2 <NA>
#22: 103 2021-08-15 14:20:38 CCE HM2 End
#23: 104 2021-08-15 14:20:42 CCE M2 Start
#24: 104 2021-08-15 14:20:43 CCE M2 End
# dummy TimeStamp Hen Units Timing