为负事件的缺失时间测量值插入行
Insert rows for missing time measurements of a negative event
我有一个实验的数据,其中 Subject
s 对一个名为 f
的 Event
进行了评级:
df <- structure(list(Subject = c("A", "A", "A", "B", "B", "B"),
Timestamp = c("00:00:00.146 - 00:00:00.889",
"00:00:01.568 - 00:00:02.183", "00:00:03.642 - 00:00:04.522",
"00:00:00.000 - 00:00:00.660", "00:00:01.247 - 00:00:02.229",
"00:00:03.697 - 00:00:04.926"),
Starttime_ms = c(146, 1568, 3642, 0, 1247, 3697),
Endtime_ms = c(889, 2183, 4522, 660, 2229, 4926),
Duration = c(743, 615, 880, 660, 982, 1229),
Event = c("f", "f", "f", "f", "f", "f")), row.names = c(NA, 6L), class = "data.frame")
缺少的是那些中间时间跨度的相应测量值,其中Subject
没有没有 评价事件。那也是一个事件,只是一个消极的事件;我们称它为 nf
。我如何为那些缺失的时间跨度插入行以获得 完整 记录,包括正面和负面事件 f
和 nf
,如下所示?
预期:
Subject Timestamp Starttime_ms Endtime_ms Duration Event
1 A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:00.889 - 00:00:01.568 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:02.183 - 00:00:03.642 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 B 00:00:00.000 - 00:00:00.660 0 660 660 f
8 B 00:00:00.660 - 00:00:01.247 660 1247 587 nf
9 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
10 B 00:00:02.229 - 00:00:03.697 2229 3697 1468 nf
11 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
我已经尝试过 data.table
,但由于我对此并不熟练,所以这次尝试只取得了部分成功:
unq <- c(0, sort(unique(setDT(df)[, c(Starttime_ms, Endtime_ms)])))
df[.(unq[-length(unq)], unq[-1]), on=c("Starttime_ms", "Endtime_ms")]
虽然我对任何解决方案都持开放态度,但我更喜欢 dplyr
一个。
你快到了。问题是为每个组分别完成时间序列。这是会做的:
library(data.table)
# create sequence of complete periods for each Subject
unq <- setDT(df)[, {
tmp <- sort(unique(c(0, Starttime_ms, Endtime_ms)))
list(Starttime_ms = head(tmp, -1L),
Endtime_ms = tail(tmp, -1L))
}, by = Subject]
# join and complete missing values
fmt <- function(ms) format(as.POSIXct(ms / 1000, tz = "UTC", origin = "2000-01-01"), "%H:%M:%OS3")
df[unq, on = .(Subject, Starttime_ms, Endtime_ms)][
is.na(Event), c("Timestamp", "Duration", "Event") := .(
paste(fmt(Starttime_ms), fmt(Endtime_ms), sep = " - "),
Endtime_ms - Starttime_ms,
"nf")][]
Subject Timestamp Starttime_ms Endtime_ms Duration Event
1: A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2: A 00:00:00.146 - 00:00:00.889 146 889 743 f
3: A 00:00:00.889 - 00:00:01.567 889 1568 679 nf
4: A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5: A 00:00:02.182 - 00:00:03.641 2183 3642 1459 nf
6: A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7: B 00:00:00.000 - 00:00:00.660 0 660 660 f
8: B 00:00:00.659 - 00:00:01.246 660 1247 587 nf
9: B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
10: B 00:00:02.228 - 00:00:03.697 2229 3697 1468 nf
11: B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
unq
包含每个要加入的主题的所有期间:
unq
Subject Starttime_ms Endtime_ms
1: A 0 146
2: A 146 889
3: A 889 1568
4: A 1568 2183
5: A 2183 3642
6: A 3642 4522
7: B 0 660
8: B 660 1247
9: B 1247 2229
10: B 2229 3697
11: B 3697 4926
这是一个tidyverse
解决方案。
library(tidyverse)
library(lubridate)
df %>%
separate(Timestamp, c("start", "end"), sep = " - ", remove = FALSE) %>%
group_by(Subject) %>%
mutate(Starttime_ms = lag(end, default = "00:00:00.000"),
Endtime_ms = start,
Event = "nf",
Timestamp = paste(Starttime_ms, Endtime_ms, sep = " - "),
across(ends_with("_ms"), ~ hms(.x) * 1000),
Duration = Endtime_ms - Starttime_ms,
across(where(is.period), as.numeric), .keep = "unused") %>%
bind_rows(df) %>%
arrange(Starttime_ms, .by_group = TRUE) %>%
filter(Duration > 0) %>%
ungroup()
输出
# A tibble: 11 × 6
Subject Timestamp Starttime_ms Endtime_ms Duration Event
<chr> <chr> <dbl> <dbl> <dbl> <chr>
1 A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:00.889 - 00:00:01.568 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:02.183 - 00:00:03.642 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 B 00:00:00.000 - 00:00:00.660 0 660 660 f
8 B 00:00:00.660 - 00:00:01.247 660 1247 587 nf
9 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
10 B 00:00:02.229 - 00:00:03.697 2229 3697 1468 nf
11 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
这正是想要的输出:
library(dplyr)
library(tidyr)
library(lubridate)
df %>%
separate_rows(Timestamp, sep = " - ") %>%
mutate(Timestamp1 = lag(Timestamp, default = "00:00:00.000"), .before=Timestamp) %>%
mutate(Starttime_ms = as.numeric(hms(Timestamp1))*1000,
Endtime_ms = as.numeric(hms(Timestamp))*1000,
Duration = Endtime_ms-Starttime_ms) %>%
mutate(Timestamp = paste(Timestamp1, Timestamp, sep = " - ")) %>%
left_join(df, by="Timestamp") %>%
mutate(Event.y = replace_na(Event.y, "nf")) %>%
select(1, 3:6, Event.y) %>%
filter(Endtime_ms.x !=0) %>%
rename_with(., ~str_replace(., '\.\w', ''))
Subject Timestamp Starttime_ms Endtime_ms Duration Event
<chr> <chr> <dbl> <dbl> <dbl> <chr>
1 A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:00.889 - 00:00:01.568 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:02.183 - 00:00:03.642 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 B 00:00:00.000 - 00:00:00.660 0 660 660 f
8 B 00:00:00.660 - 00:00:01.247 660 1247 587 nf
9 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
10 B 00:00:02.229 - 00:00:03.697 2229 3697 1468 nf
11 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
我有一个实验的数据,其中 Subject
s 对一个名为 f
的 Event
进行了评级:
df <- structure(list(Subject = c("A", "A", "A", "B", "B", "B"),
Timestamp = c("00:00:00.146 - 00:00:00.889",
"00:00:01.568 - 00:00:02.183", "00:00:03.642 - 00:00:04.522",
"00:00:00.000 - 00:00:00.660", "00:00:01.247 - 00:00:02.229",
"00:00:03.697 - 00:00:04.926"),
Starttime_ms = c(146, 1568, 3642, 0, 1247, 3697),
Endtime_ms = c(889, 2183, 4522, 660, 2229, 4926),
Duration = c(743, 615, 880, 660, 982, 1229),
Event = c("f", "f", "f", "f", "f", "f")), row.names = c(NA, 6L), class = "data.frame")
缺少的是那些中间时间跨度的相应测量值,其中Subject
没有没有 评价事件。那也是一个事件,只是一个消极的事件;我们称它为 nf
。我如何为那些缺失的时间跨度插入行以获得 完整 记录,包括正面和负面事件 f
和 nf
,如下所示?
预期:
Subject Timestamp Starttime_ms Endtime_ms Duration Event
1 A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:00.889 - 00:00:01.568 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:02.183 - 00:00:03.642 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 B 00:00:00.000 - 00:00:00.660 0 660 660 f
8 B 00:00:00.660 - 00:00:01.247 660 1247 587 nf
9 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
10 B 00:00:02.229 - 00:00:03.697 2229 3697 1468 nf
11 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
我已经尝试过 data.table
,但由于我对此并不熟练,所以这次尝试只取得了部分成功:
unq <- c(0, sort(unique(setDT(df)[, c(Starttime_ms, Endtime_ms)])))
df[.(unq[-length(unq)], unq[-1]), on=c("Starttime_ms", "Endtime_ms")]
虽然我对任何解决方案都持开放态度,但我更喜欢 dplyr
一个。
你快到了。问题是为每个组分别完成时间序列。这是会做的:
library(data.table)
# create sequence of complete periods for each Subject
unq <- setDT(df)[, {
tmp <- sort(unique(c(0, Starttime_ms, Endtime_ms)))
list(Starttime_ms = head(tmp, -1L),
Endtime_ms = tail(tmp, -1L))
}, by = Subject]
# join and complete missing values
fmt <- function(ms) format(as.POSIXct(ms / 1000, tz = "UTC", origin = "2000-01-01"), "%H:%M:%OS3")
df[unq, on = .(Subject, Starttime_ms, Endtime_ms)][
is.na(Event), c("Timestamp", "Duration", "Event") := .(
paste(fmt(Starttime_ms), fmt(Endtime_ms), sep = " - "),
Endtime_ms - Starttime_ms,
"nf")][]
Subject Timestamp Starttime_ms Endtime_ms Duration Event 1: A 00:00:00.000 - 00:00:00.146 0 146 146 nf 2: A 00:00:00.146 - 00:00:00.889 146 889 743 f 3: A 00:00:00.889 - 00:00:01.567 889 1568 679 nf 4: A 00:00:01.568 - 00:00:02.183 1568 2183 615 f 5: A 00:00:02.182 - 00:00:03.641 2183 3642 1459 nf 6: A 00:00:03.642 - 00:00:04.522 3642 4522 880 f 7: B 00:00:00.000 - 00:00:00.660 0 660 660 f 8: B 00:00:00.659 - 00:00:01.246 660 1247 587 nf 9: B 00:00:01.247 - 00:00:02.229 1247 2229 982 f 10: B 00:00:02.228 - 00:00:03.697 2229 3697 1468 nf 11: B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
unq
包含每个要加入的主题的所有期间:
unq
Subject Starttime_ms Endtime_ms 1: A 0 146 2: A 146 889 3: A 889 1568 4: A 1568 2183 5: A 2183 3642 6: A 3642 4522 7: B 0 660 8: B 660 1247 9: B 1247 2229 10: B 2229 3697 11: B 3697 4926
这是一个tidyverse
解决方案。
library(tidyverse)
library(lubridate)
df %>%
separate(Timestamp, c("start", "end"), sep = " - ", remove = FALSE) %>%
group_by(Subject) %>%
mutate(Starttime_ms = lag(end, default = "00:00:00.000"),
Endtime_ms = start,
Event = "nf",
Timestamp = paste(Starttime_ms, Endtime_ms, sep = " - "),
across(ends_with("_ms"), ~ hms(.x) * 1000),
Duration = Endtime_ms - Starttime_ms,
across(where(is.period), as.numeric), .keep = "unused") %>%
bind_rows(df) %>%
arrange(Starttime_ms, .by_group = TRUE) %>%
filter(Duration > 0) %>%
ungroup()
输出
# A tibble: 11 × 6
Subject Timestamp Starttime_ms Endtime_ms Duration Event
<chr> <chr> <dbl> <dbl> <dbl> <chr>
1 A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:00.889 - 00:00:01.568 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:02.183 - 00:00:03.642 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 B 00:00:00.000 - 00:00:00.660 0 660 660 f
8 B 00:00:00.660 - 00:00:01.247 660 1247 587 nf
9 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
10 B 00:00:02.229 - 00:00:03.697 2229 3697 1468 nf
11 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
这正是想要的输出:
library(dplyr)
library(tidyr)
library(lubridate)
df %>%
separate_rows(Timestamp, sep = " - ") %>%
mutate(Timestamp1 = lag(Timestamp, default = "00:00:00.000"), .before=Timestamp) %>%
mutate(Starttime_ms = as.numeric(hms(Timestamp1))*1000,
Endtime_ms = as.numeric(hms(Timestamp))*1000,
Duration = Endtime_ms-Starttime_ms) %>%
mutate(Timestamp = paste(Timestamp1, Timestamp, sep = " - ")) %>%
left_join(df, by="Timestamp") %>%
mutate(Event.y = replace_na(Event.y, "nf")) %>%
select(1, 3:6, Event.y) %>%
filter(Endtime_ms.x !=0) %>%
rename_with(., ~str_replace(., '\.\w', ''))
Subject Timestamp Starttime_ms Endtime_ms Duration Event
<chr> <chr> <dbl> <dbl> <dbl> <chr>
1 A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:00.889 - 00:00:01.568 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:02.183 - 00:00:03.642 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 B 00:00:00.000 - 00:00:00.660 0 660 660 f
8 B 00:00:00.660 - 00:00:01.247 660 1247 587 nf
9 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
10 B 00:00:02.229 - 00:00:03.697 2229 3697 1468 nf
11 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f