基于 POSIXct 值合并两个数据集

Combine two datasets, based on POSIXct values

我正在努力将两个数据集相互组合。

Dataset1 contains a "before time" column, an "after time" column, and a "channel" column.

Dataset2 contains just one "time" column, plus a "channel" column as well.

我想使用以下逻辑向 Dataset1 添加一个二进制列 (Yes/No): 如果 Dataset2 中有一行,其中通道 == 通道,并且时间在 "before" 和 "after" 时间内,我想要 "YES"。否则 "NO"。

数据1

ID   Channel   before_time   after_time 
1       A1  2019-09-02 20:13:00 2019-09-02 20:33:00
2       B1  2019-09-02 20:03:00 2019-09-02 20:23:00
3       C1  2019-09-02 20:23:00 2019-09-02 20:43:00
4       D1  2019-09-02 20:23:00 2019-09-02 20:43:00

Data2

ID_B     Channel_B    Time_B
Hallo       A1        2019-09-02 20:23:00
Hi          B2        2019-09-02 20:05:00
Hoi         C1        2019-09-02 22:23:00

期望输出

ID   Channel   before_time   after_time                     Available
1       A1  2019-09-02 20:13:00 2019-09-02 20:33:00         Yes  # Channel == Channel, Time between before & after
2       B1  2019-09-02 20:03:00 2019-09-02 20:23:00          No  # Channel != Channel
3       C1  2019-09-02 20:23:00 2019-09-02 20:43:00          No  # Time is not between before & after
4       D1  2019-09-02 20:23:00 2019-09-02 20:43:00          No  # There is no matching data where channel is D1

期望输出 2(注释解决方案)

从第二个数据集 (Data2) 添加额外的列。

ID   Channel   before_time   after_time                     Available   ID_B     
1       A1  2019-09-02 20:13:00 2019-09-02 20:33:00          Yes        Hallo       
2       B1  2019-09-02 20:03:00 2019-09-02 20:23:00          No         x 
3       C1  2019-09-02 20:23:00 2019-09-02 20:43:00          No         x
4       D1  2019-09-02 20:23:00 2019-09-02 20:43:00          No         x

可重现示例(数据):

# Reproducible example, first dataset: one time window
# (before_time .. after_time) per channel.
ID <- c("1", "2", "3", "4")
# The closing quote on "D1" was missing, which made the snippet unparseable.
channel <- c("A1", "B1", "C1", "D1")
#startdate <- as.POSIXct(c("2019-09-02 20:23:00", "2019-09-02 20:13:00", "2019-09-02 20:33:00", "2019-09-02 20:33:00"))
before_time <- as.POSIXct(c("2019-09-02 20:13:00", "2019-09-02 20:03:00", "2019-09-02 20:23:00", "2019-09-02 20:23:00"))
after_time  <- as.POSIXct(c("2019-09-02 20:33:00", "2019-09-02 20:23:00", "2019-09-02 20:43:00","2019-09-02 20:43:00"))
data1 <- data.frame(ID, channel,   before_time, after_time)
# View() opens a data viewer, so only call it in an interactive session;
# this keeps the example runnable via Rscript.
if (interactive()) {
  View(data1)
}


# Reproducible example, second dataset: one event time per channel.
ID_B <- c("Hallo", "Hi", "Hoi")
channel_B <- c("A1", "B2", "C1")
Time_B <- as.POSIXct(c("2019-09-02 20:23:00", "2019-09-02 20:05:00", "2019-09-02 22:23:00"))
data2 <- data.frame(ID_B, channel_B, Time_B)
# View() opens a data viewer, so only call it in an interactive session;
# this keeps the example runnable via Rscript.
if (interactive()) {
  View(data2)
}

这个解决方案应该有效:

library(dplyr)

# cbind() pairs rows purely by position and requires equal row counts.
# data1 has one more row than data2, so only the first nrow(data2) rows of
# data1 can be paired this way (this matches the 3-row output shown below).
df <- cbind(data1[seq_len(nrow(data2)), ], data2)

# Normalise column types so the comparisons below operate on plain
# integer / character vectors rather than factors.
df <- df %>%
  mutate(
    ID = as.integer(ID),
    channel = as.character(channel),
    ID_B = as.character(ID_B),
    channel_B = as.character(channel_B)
  )

# Flag a row as available when the paired data2 row is on the same channel
# and its time lies within [before_time, after_time]; then drop all helper
# columns that came from data2.
df %>%
  mutate(
    available = ifelse(
      channel == channel_B & Time_B >= before_time & Time_B <= after_time,
      "yes", "no"
    )
  ) %>%
  select(-ID_B, -Time_B, -channel_B)


 # A tibble: 3 x 5
 ID channel before_time         after_time          available
 <int> <chr>   <dttm>              <dttm>              <chr>    
 1     1 A1      2019-09-02 20:13:00 2019-09-02 20:33:00 yes      
 2     2 B1      2019-09-02 20:03:00 2019-09-02 20:23:00 no       
 3     3 C1      2019-09-02 20:23:00 2019-09-02 20:43:00 no  

正如 arg0naut91 所提到的,这里是 data.table 中的非等值连接:

library(data.table)
# setDT() converts the data frames to data.tables by reference (no copy).
setDT(data1)
setDT(data2)
# Non-equi join: for each row of data1 (.SD), look up the data2 rows on the
# same channel whose Time_B lies within [before_time, after_time].
# by = .EACHI evaluates j once per data1 row: `.N > 0` flags whether any
# data2 row matched, and ID_B carries the matched id (NA when nothing matched).
# `(1L:3L) := NULL` then drops the three join columns so that only the two
# computed columns remain to be assigned as "Available" and "ID_B".
# NOTE(review): a data1 row with several matches in data2 would presumably
# produce several result rows that no longer line up with data1 -- confirm
# that matches are unique per row before relying on this.
data1[, c("Available", "ID_B") :=
        data2[.SD, on=.(channel_B=channel, Time_B>=before_time, Time_B<=after_time), 
            by=.EACHI, .(.N > 0, ID_B)][, (1L:3L) := NULL]
    ]

输出:

   ID channel         before_time          after_time Available  ID_B
1:  1      A1 2019-09-02 20:13:00 2019-09-02 20:33:00      TRUE Hallo
2:  2      B1 2019-09-02 20:03:00 2019-09-02 20:23:00     FALSE  <NA>
3:  3      C1 2019-09-02 20:23:00 2019-09-02 20:43:00     FALSE  <NA>
4:  4      D1 2019-09-02 20:23:00 2019-09-02 20:43:00     FALSE  <NA>

这是一个 base R 解决方案,使用 merge + ifelse,即

# Base R approach:
#   1. left-merge data1 with data2 on the channel (data2[-1] drops ID_B),
#      keeping every row of data1 (all.x = TRUE);
#   2. flag rows whose matched Time_B lies within [before_time, after_time];
#      unmatched rows have Time_B == NA and are flagged "No";
#   3. drop the helper column Time_B.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
dfout <- subset(
  within(
    merge(data1, data2[-1], by.x = "channel", by.y = "channel_B", all.x = TRUE),
    Available <- ifelse(
      !is.na(Time_B) & Time_B >= before_time & Time_B <= after_time,
      "Yes", "No"
    )
  ),
  select = -Time_B
)

这样

> dfout
  channel ID         before_time          after_time Available
1      A1  1 2019-09-02 20:13:00 2019-09-02 20:33:00       Yes
2      B1  2 2019-09-02 20:03:00 2019-09-02 20:23:00        No
3      C1  3 2019-09-02 20:23:00 2019-09-02 20:43:00        No
4      D1  4 2019-09-02 20:23:00 2019-09-02 20:43:00        No

查看 sqldf 也可能有意义,例如:

library(sqldf)

# LEFT JOIN keeps every row of data1; a data2 row only joins when it is on
# the same channel AND its time falls inside the window, so a NULL ID_B
# means "no matching event" for that row.
# SQL string literals use single quotes: double-quoted tokens are identifiers
# in standard SQL and are only treated as strings by a SQLite legacy fallback.
sqldf("SELECT t1.ID, t1.channel,
      t1.before_time, t1.after_time,
      CASE WHEN t2.ID_B IS NULL THEN 'No' ELSE 'Yes' END Available
      FROM data1 t1 LEFT JOIN data2 t2 ON t1.channel = t2.channel_B
      AND t2.Time_B BETWEEN t1.before_time AND t1.after_time")

输出:

  ID channel         before_time          after_time Available
1  1      A1 2019-09-02 20:13:00 2019-09-02 20:33:00       Yes
2  2      B1 2019-09-02 20:03:00 2019-09-02 20:23:00        No
3  3      C1 2019-09-02 20:23:00 2019-09-02 20:43:00        No
4  4      D1 2019-09-02 20:23:00 2019-09-02 20:43:00        No

这也是 dplyr 中的一种方法:

library(dplyr)

# Keep every row of data1 and attach the data2 row(s) of the same channel;
# a row is "No" when nothing was attached (Time_B is NA) or the attached
# time falls outside [before_time, after_time], otherwise "Yes".
# Finally drop the columns that came from data2.
data1 %>%
  left_join(data2, by = c(channel = "channel_B")) %>%
  mutate(
    Available = ifelse(
      is.na(Time_B) | Time_B < before_time | Time_B > after_time,
      "No",
      "Yes"
    )
  ) %>%
  select(-ends_with("_B"))

添加附加列:

# sqldf

# Same LEFT JOIN as before, additionally returning the matched ID_B
# (or the placeholder 'x' when no data2 row matched).
# SQL string literals use single quotes: double-quoted tokens are identifiers
# in standard SQL and are only treated as strings by a SQLite legacy fallback.
sqldf("SELECT t1.ID, t1.channel,
      t1.before_time, t1.after_time,
      CASE WHEN t2.ID_B IS NULL THEN 'No' ELSE 'Yes' END Available,
      CASE WHEN t2.ID_B IS NULL THEN 'x' ELSE t2.ID_B END ID_B
      FROM data1 t1 LEFT JOIN data2 t2 ON t1.channel = t2.channel_B
      AND t2.Time_B BETWEEN t1.before_time AND t1.after_time")

# dplyr

# Keep every row of data1 and attach the data2 row(s) of the same channel.
# Available is "No" when nothing was attached or the attached time is outside
# [before_time, after_time]; ID_B is kept only for available rows and
# replaced by the placeholder "x" otherwise. Time_B is dropped at the end.
data1 %>%
  left_join(data2, by = c(channel = "channel_B")) %>%
  mutate(
    Available = ifelse(
      is.na(Time_B) | Time_B < before_time | Time_B > after_time,
      "No",
      "Yes"
    ),
    ID_B = ifelse(Available != "Yes", "x", as.character(ID_B))
  ) %>%
  select(-Time_B)