在 R 中跨共享持续时间提取共同异常
Extracting co-anomalies across shared time durations in R
我需要从已经包含单变量异常的数据框中提取协同异常。
# Libraries
library(dplyr)
library(lubridate)
library(stringr)
# Build the example input: one row per process run with its start and finish
# timestamps, the owning process and a univariate anomaly flag ("Y"/"N").
DF <- data.frame(
  rowID = as.factor(1:8),
  Start = as_datetime(c(
    "2022-01-01 09:00:00", "2022-01-01 12:00:00", "2022-01-02 15:00:00",
    "2022-01-02 23:30:00", "2022-01-03 00:10:00", "2022-01-29 00:10:00",
    "2023-12-25 06:00:00", "2023-12-25 08:00:00"
  )),
  Finish = as_datetime(c(
    "2022-01-01 11:00:00", "2022-01-01 15:00:00", "2022-01-03 01:00:00",
    "2022-01-02 23:50:00", "2022-01-03 03:00:00", "2022-01-31 03:00:00",
    "2023-12-25 11:00:00", "2023-12-25 12:00:00"
  )),
  Process = c(
    "Process1", "Process2", "Process1", "Process2",
    "Process3", "Process3", "Process3", "Process3"
  ),
  Anomaly = c("Y", "N", "Y", "Y", "Y", "Y", "Y", "Y")
) %>%
  # Sort by start time, breaking ties on the process name
  arrange(Start, Process) %>%
  # Store each run as a lubridate interval so overlaps can be tested directly
  mutate(Interval = interval(Start, Finish)) %>%
  as_tibble()
我能够成功标记与感兴趣的过程 (Process3) 在相似时间段内发生的共同异常。
# Process whose co-anomalies we want to find
c <- 'Process3'
# Keep the rows whose interval is reported as overlapping a Process3 interval,
# derive the co-anomaly flag from the Anomaly column, and join that flag back
# onto the full table.
# NOTE(review): Interval[Process == c] is shorter than Interval, so
# int_overlaps() presumably pairs the two vectors element by element with
# recycling instead of testing each row against every Process3 interval --
# TODO confirm.
Result <- DF %>%
  filter(int_overlaps(Interval, Interval[Process == c])) %>%
  mutate(coAnomaly = ifelse(Anomaly == 'Y', 'Y', 'N')) %>%
  left_join(DF, ., by = 'rowID') %>%
  # Keep only the left-hand (".x") copies of the duplicated columns and the flag
  select(contains('.x'), coAnomaly) %>%
  rename_with(function(col_name) str_remove(col_name, '.x'))
代码正确地标记了 Process3 与其他进程之间的共同异常,但在检测 Process3 与其自身的重叠时出错。
第 6 行被错误地标记:该异常既没有与另一个 Process3 同时发生,也没有与任何其他进程同时发生。
我正在尝试正确标记:
- 哪些进程与其他进程同时发生(在 LHS 之间)
- 哪些其他进程与 Process3 共同发生(在 RHS 之间)
- 哪些 Process3 与 Process3 共同发生(Within)
您可以使用 rowwise()
尝试此方法:
# Row by row, keep the rows whose interval overlaps at least one *other* row
# (different rowID) belonging to the process of interest `c`, derive the
# co-anomaly flag from Anomaly, and join it back onto DF. Rows that were
# filtered out get NA in coAnomaly.
left_join(
  DF,
  DF %>%
    rowwise() %>%
    filter(
      any(
        int_overlaps(
          Interval,
          DF$Interval[which(DF$rowID != rowID & DF$Process == c)]
        )
      )
    ) %>%
    mutate(coAnomaly = ifelse(Anomaly == 'Y', 'Y', 'N')) %>%
    select(rowID, coAnomaly)
)
输出:
rowID Start Finish Process Anomaly Interval coAnomaly
<fct> <dttm> <dttm> <chr> <chr> <Interval> <chr>
1 1 2022-01-01 09:00:00 2022-01-01 11:00:00 Process1 Y 2022-01-01 09:00:00 UTC--2022-01-01 11:00:00 UTC NA
2 2 2022-01-01 12:00:00 2022-01-01 15:00:00 Process2 N 2022-01-01 12:00:00 UTC--2022-01-01 15:00:00 UTC NA
3 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC Y
4 4 2022-01-02 23:30:00 2022-01-02 23:50:00 Process2 Y 2022-01-02 23:30:00 UTC--2022-01-02 23:50:00 UTC NA
5 5 2022-01-03 00:10:00 2022-01-03 03:00:00 Process3 Y 2022-01-03 00:10:00 UTC--2022-01-03 03:00:00 UTC NA
6 6 2022-01-29 00:10:00 2022-01-31 03:00:00 Process3 Y 2022-01-29 00:10:00 UTC--2022-01-31 03:00:00 UTC NA
更新:根据 OP 的额外要求(区分 Between/Within)以及新的数据框:
# Stack two row-wise filtered subsets of DF, labelling how each row overlaps
# the process of interest `c`.
rbind(
  # Rows of the process of interest that overlap a different row of that
  # same process
  DF %>%
    filter(Process == c) %>%
    rowwise() %>%
    filter(
      any(
        int_overlaps(
          Interval,
          DF$Interval[which(DF$rowID != rowID & DF$Process == c)]
        )
      )
    ) %>%
    mutate(coAnomaly = "within"),
  # Rows of any other process that overlap a row of the process of interest
  DF %>%
    filter(Process != c) %>%
    rowwise() %>%
    filter(
      any(
        int_overlaps(
          Interval,
          DF$Interval[which(DF$rowID != rowID & DF$Process == c)]
        )
      )
    ) %>%
    mutate(coAnomaly = "between")
)
输出:
rowID Start Finish Process Anomaly Interval coAnomaly
<fct> <dttm> <dttm> <chr> <chr> <Interval> <chr>
1 7 2023-12-25 06:00:00 2023-12-25 11:00:00 Process3 Y 2023-12-25 06:00:00 UTC--2023-12-25 11:00:00 UTC within
2 8 2023-12-25 08:00:00 2023-12-25 12:00:00 Process3 Y 2023-12-25 08:00:00 UTC--2023-12-25 12:00:00 UTC within
3 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC between
所有类型的重叠:
这是另一种方法,它不依赖于指定感兴趣的 Process(即不需要 c = "Process3")。
- 创建一个小函数,接收一个区间和一个 id,返回一个包含重叠 ID (oID) 和重叠进程 (oProcess) 的 tibble
#' Find the rows whose interval overlaps a given interval
#'
#' @param interval A lubridate interval to test against every row of `data`.
#' @param id The `rowID` of the row that `interval` came from; that row is
#'   dropped from the result so a row never matches itself.
#' @param data Table to search. Must have `Interval`, `rowID` and `Process`
#'   columns. Defaults to the global `DF`, which keeps existing two-argument
#'   calls working.
#' @return The matching rows of `data`, reduced to two columns: `oID` (the
#'   overlapping row's `rowID`) and `oProcess` (its `Process`).
get_overlap_IDs <- function(interval, id, data = DF) {
  data %>%
    filter(int_overlaps(interval, data$Interval)) %>%
    filter(rowID != id) %>%
    select(oID = rowID, oProcess = Process)
}
- 使用 rowwise 逐行应用该函数,然后 unnest
# Look up the overlapping rows for every row of DF, then expand the nested
# result so that each (row, overlapping row) pair gets its own output row.
# unnest() lives in tidyr, which the library() calls in this file do not
# attach, so it is called with an explicit namespace.
DF %>%
  rowwise() %>%
  mutate(keys = list(get_overlap_IDs(Interval, rowID))) %>%
  tidyr::unnest(keys)
输出:
rowID Start Finish Process Anomaly Interval oID oProcess
<fct> <dttm> <dttm> <chr> <chr> <Interval> <fct> <chr>
1 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC 4 Process2
2 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC 5 Process3
3 4 2022-01-02 23:30:00 2022-01-02 23:50:00 Process2 Y 2022-01-02 23:30:00 UTC--2022-01-02 23:50:00 UTC 3 Process1
4 5 2022-01-03 00:10:00 2022-01-03 03:00:00 Process3 Y 2022-01-03 00:10:00 UTC--2022-01-03 03:00:00 UTC 3 Process1
5 7 2023-12-25 06:00:00 2023-12-25 11:00:00 Process3 Y 2023-12-25 06:00:00 UTC--2023-12-25 11:00:00 UTC 8 Process3
6 8 2023-12-25 08:00:00 2023-12-25 12:00:00 Process3 Y 2023-12-25 08:00:00 UTC--2023-12-25 12:00:00 UTC 7 Process3
我需要从已经包含单变量异常的数据框中提取协同异常。
# Libraries
library(dplyr)
library(lubridate)
library(stringr)
# Build the example input: one row per process run with its start and finish
# timestamps, the owning process and a univariate anomaly flag ("Y"/"N").
DF <- data.frame(
  rowID = as.factor(1:8),
  Start = as_datetime(c(
    "2022-01-01 09:00:00", "2022-01-01 12:00:00", "2022-01-02 15:00:00",
    "2022-01-02 23:30:00", "2022-01-03 00:10:00", "2022-01-29 00:10:00",
    "2023-12-25 06:00:00", "2023-12-25 08:00:00"
  )),
  Finish = as_datetime(c(
    "2022-01-01 11:00:00", "2022-01-01 15:00:00", "2022-01-03 01:00:00",
    "2022-01-02 23:50:00", "2022-01-03 03:00:00", "2022-01-31 03:00:00",
    "2023-12-25 11:00:00", "2023-12-25 12:00:00"
  )),
  Process = c(
    "Process1", "Process2", "Process1", "Process2",
    "Process3", "Process3", "Process3", "Process3"
  ),
  Anomaly = c("Y", "N", "Y", "Y", "Y", "Y", "Y", "Y")
) %>%
  # Sort by start time, breaking ties on the process name
  arrange(Start, Process) %>%
  # Store each run as a lubridate interval so overlaps can be tested directly
  mutate(Interval = interval(Start, Finish)) %>%
  as_tibble()
我能够成功标记与感兴趣的过程 (Process3) 在相似时间段内发生的共同异常。
# Process whose co-anomalies we want to find
c <- 'Process3'
# Keep the rows whose interval is reported as overlapping a Process3 interval,
# derive the co-anomaly flag from the Anomaly column, and join that flag back
# onto the full table.
# NOTE(review): Interval[Process == c] is shorter than Interval, so
# int_overlaps() presumably pairs the two vectors element by element with
# recycling instead of testing each row against every Process3 interval --
# TODO confirm.
Result <- DF %>%
  filter(int_overlaps(Interval, Interval[Process == c])) %>%
  mutate(coAnomaly = ifelse(Anomaly == 'Y', 'Y', 'N')) %>%
  left_join(DF, ., by = 'rowID') %>%
  # Keep only the left-hand (".x") copies of the duplicated columns and the flag
  select(contains('.x'), coAnomaly) %>%
  rename_with(function(col_name) str_remove(col_name, '.x'))
代码正确地标记了 Process3 与其他进程之间的共同异常,但在检测 Process3 与其自身的重叠时出错。
第 6 行被错误地标记:该异常既没有与另一个 Process3 同时发生,也没有与任何其他进程同时发生。
我正在尝试正确标记:
- 哪些进程与其他进程同时发生(在 LHS 之间)
- 哪些其他进程与 Process3 共同发生(在 RHS 之间)
- 哪些 Process3 与 Process3 共同发生(Within)
您可以使用 rowwise()
尝试此方法:
# Row by row, keep the rows whose interval overlaps at least one *other* row
# (different rowID) belonging to the process of interest `c`, derive the
# co-anomaly flag from Anomaly, and join it back onto DF. Rows that were
# filtered out get NA in coAnomaly.
left_join(
  DF,
  DF %>%
    rowwise() %>%
    filter(
      any(
        int_overlaps(
          Interval,
          DF$Interval[which(DF$rowID != rowID & DF$Process == c)]
        )
      )
    ) %>%
    mutate(coAnomaly = ifelse(Anomaly == 'Y', 'Y', 'N')) %>%
    select(rowID, coAnomaly)
)
输出:
rowID Start Finish Process Anomaly Interval coAnomaly
<fct> <dttm> <dttm> <chr> <chr> <Interval> <chr>
1 1 2022-01-01 09:00:00 2022-01-01 11:00:00 Process1 Y 2022-01-01 09:00:00 UTC--2022-01-01 11:00:00 UTC NA
2 2 2022-01-01 12:00:00 2022-01-01 15:00:00 Process2 N 2022-01-01 12:00:00 UTC--2022-01-01 15:00:00 UTC NA
3 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC Y
4 4 2022-01-02 23:30:00 2022-01-02 23:50:00 Process2 Y 2022-01-02 23:30:00 UTC--2022-01-02 23:50:00 UTC NA
5 5 2022-01-03 00:10:00 2022-01-03 03:00:00 Process3 Y 2022-01-03 00:10:00 UTC--2022-01-03 03:00:00 UTC NA
6 6 2022-01-29 00:10:00 2022-01-31 03:00:00 Process3 Y 2022-01-29 00:10:00 UTC--2022-01-31 03:00:00 UTC NA
更新:根据 OP 的额外要求(区分 Between/Within)以及新的数据框:
# Stack two row-wise filtered subsets of DF, labelling how each row overlaps
# the process of interest `c`.
rbind(
  # Rows of the process of interest that overlap a different row of that
  # same process
  DF %>%
    filter(Process == c) %>%
    rowwise() %>%
    filter(
      any(
        int_overlaps(
          Interval,
          DF$Interval[which(DF$rowID != rowID & DF$Process == c)]
        )
      )
    ) %>%
    mutate(coAnomaly = "within"),
  # Rows of any other process that overlap a row of the process of interest
  DF %>%
    filter(Process != c) %>%
    rowwise() %>%
    filter(
      any(
        int_overlaps(
          Interval,
          DF$Interval[which(DF$rowID != rowID & DF$Process == c)]
        )
      )
    ) %>%
    mutate(coAnomaly = "between")
)
输出:
rowID Start Finish Process Anomaly Interval coAnomaly
<fct> <dttm> <dttm> <chr> <chr> <Interval> <chr>
1 7 2023-12-25 06:00:00 2023-12-25 11:00:00 Process3 Y 2023-12-25 06:00:00 UTC--2023-12-25 11:00:00 UTC within
2 8 2023-12-25 08:00:00 2023-12-25 12:00:00 Process3 Y 2023-12-25 08:00:00 UTC--2023-12-25 12:00:00 UTC within
3 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC between
所有类型的重叠:
这是另一种方法,它不依赖于指定感兴趣的 Process(即不需要 c = "Process3")。
- 创建一个小函数,接收一个区间和一个 id,返回一个包含重叠 ID (oID) 和重叠进程 (oProcess) 的 tibble
#' Find the rows whose interval overlaps a given interval
#'
#' @param interval A lubridate interval to test against every row of `data`.
#' @param id The `rowID` of the row that `interval` came from; that row is
#'   dropped from the result so a row never matches itself.
#' @param data Table to search. Must have `Interval`, `rowID` and `Process`
#'   columns. Defaults to the global `DF`, which keeps existing two-argument
#'   calls working.
#' @return The matching rows of `data`, reduced to two columns: `oID` (the
#'   overlapping row's `rowID`) and `oProcess` (its `Process`).
get_overlap_IDs <- function(interval, id, data = DF) {
  data %>%
    filter(int_overlaps(interval, data$Interval)) %>%
    filter(rowID != id) %>%
    select(oID = rowID, oProcess = Process)
}
- 使用 rowwise 逐行应用该函数,然后 unnest
# Look up the overlapping rows for every row of DF, then expand the nested
# result so that each (row, overlapping row) pair gets its own output row.
# unnest() lives in tidyr, which the library() calls in this file do not
# attach, so it is called with an explicit namespace.
DF %>%
  rowwise() %>%
  mutate(keys = list(get_overlap_IDs(Interval, rowID))) %>%
  tidyr::unnest(keys)
输出:
rowID Start Finish Process Anomaly Interval oID oProcess
<fct> <dttm> <dttm> <chr> <chr> <Interval> <fct> <chr>
1 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC 4 Process2
2 3 2022-01-02 15:00:00 2022-01-03 01:00:00 Process1 Y 2022-01-02 15:00:00 UTC--2022-01-03 01:00:00 UTC 5 Process3
3 4 2022-01-02 23:30:00 2022-01-02 23:50:00 Process2 Y 2022-01-02 23:30:00 UTC--2022-01-02 23:50:00 UTC 3 Process1
4 5 2022-01-03 00:10:00 2022-01-03 03:00:00 Process3 Y 2022-01-03 00:10:00 UTC--2022-01-03 03:00:00 UTC 3 Process1
5 7 2023-12-25 06:00:00 2023-12-25 11:00:00 Process3 Y 2023-12-25 06:00:00 UTC--2023-12-25 11:00:00 UTC 8 Process3
6 8 2023-12-25 08:00:00 2023-12-25 12:00:00 Process3 Y 2023-12-25 08:00:00 UTC--2023-12-25 12:00:00 UTC 7 Process3