基于 POSIXct 值合并两个数据集
Combine two datasets, based on POSIXct values
我正在努力将两个数据集相互组合。
Dataset1 contains a "before time", an "after time", and a "channel" column.
Dataset2 contains just one "time" column and a "channel" column as well.
我想使用以下逻辑向 Dataset1 添加一个二进制列 (Yes/No):
如果 Dataset2 中有一行,其中通道 == 通道,并且时间在 "before" 和 "after" 时间内,我想要 "YES"。否则 "NO"。
数据1
ID Channel before_time after_time
1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00
2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00
3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00
4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00
Data2
ID_B Channel_B Time_B
Hallo A1 2019-09-02 20:23:00
Hi B2 2019-09-02 20:05:00
Hoi C1 2019-09-02 22:23:00
期望输出
ID Channel before_time after_time Available
1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes # Channel == Channel, Time between before & after
2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 No # Channel != Channel
3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 No # Time is not between before & after
4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 No # There is no matching data where channel is D1
期望输出 2(注释解决方案)
从第二个数据集 (Data2) 添加额外的列。
ID Channel before_time after_time Available ID_B
1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes Hallo
2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 No x
3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 No x
4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 No x
可重现示例(数据):
# Reproducible example data.
# data1: one row per ID with a channel and a [before_time, after_time] window.
ID <- c("1", "2", "3", "4")
channel <- c("A1", "B1", "C1", "D1")
#startdate <- as.POSIXct(c("2019-09-02 20:23:00", "2019-09-02 20:13:00", "2019-09-02 20:33:00", "2019-09-02 20:33:00"))
before_time <- as.POSIXct(c("2019-09-02 20:13:00", "2019-09-02 20:03:00", "2019-09-02 20:23:00", "2019-09-02 20:23:00"))
after_time <- as.POSIXct(c("2019-09-02 20:33:00", "2019-09-02 20:23:00", "2019-09-02 20:43:00", "2019-09-02 20:43:00"))
data1 <- data.frame(ID, channel, before_time, after_time)
# View() opens a viewer window, so only call it in an interactive session.
if (interactive()) View(data1)

# data2: one row per event with an identifier, a channel and a single time.
ID_B <- c("Hallo", "Hi", "Hoi")
channel_B <- c("A1", "B2", "C1")
Time_B <- as.POSIXct(c("2019-09-02 20:23:00", "2019-09-02 20:05:00", "2019-09-02 22:23:00"))
data2 <- data.frame(ID_B, channel_B, Time_B)
if (interactive()) View(data2)
这个解决方案应该有效:
library(dplyr)

# cbind() pairs rows by position and requires equal row counts, so both
# frames are trimmed to their common length first (data1 has 4 rows, data2 3).
# NOTE(review): this compares row i of data1 only with row i of data2; it is
# not a join, so a match sitting on a different row is missed.
n_common <- min(nrow(data1), nrow(data2))
df <- cbind(data1[seq_len(n_common), ], data2[seq_len(n_common), ])

# Normalise column types so the comparisons below work on plain values.
df <- df %>%
  mutate(
    ID = as.integer(ID),
    channel = as.character(channel),
    ID_B = as.character(ID_B),
    channel_B = as.character(channel_B)
  )

# Flag rows whose paired data2 row has the same channel and a time inside
# the window, then drop every data2 column from the result.
df %>%
  mutate(
    available = ifelse(
      channel == channel_B & Time_B >= before_time & Time_B <= after_time,
      "yes", "no"
    )
  ) %>%
  select(-ID_B, -Time_B, -channel_B)
# A tibble: 3 x 5
ID channel before_time after_time available
<int> <chr> <dttm> <dttm> <chr>
1 1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 yes
2 2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 no
3 3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 no
正如 arg0naut91 所提到的,这里是 data.table
中的非等值连接:
library(data.table)
# Convert both data frames to data.tables by reference.
setDT(data1)
setDT(data2)
# Add two columns to data1 by reference, taken from a non-equi join against
# data2: rows pair up when channel_B equals channel and Time_B lies within
# [before_time, after_time].
data1[, c("Available", "ID_B") :=
# by=.EACHI evaluates j once per row of data1 (.SD): .N > 0 flags whether any
# data2 row matched, and ID_B carries the matched identifier.
data2[.SD, on=.(channel_B=channel, Time_B>=before_time, Time_B<=after_time),
# The join result leads with the three join columns; (1L:3L) := NULL drops
# them so only the flag and ID_B remain for the assignment.
by=.EACHI, .(.N > 0, ID_B)][, (1L:3L) := NULL]
# NOTE(review): presumably assumes at most one data2 match per data1 row,
# otherwise the result would have more rows than data1 -- confirm.
]
输出:
ID channel before_time after_time Available ID_B
1: 1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 TRUE Hallo
2: 2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 FALSE <NA>
3: 3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 FALSE <NA>
4: 4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 FALSE <NA>
这是一个基本的 R 解决方案,使用 merge
+ ifelse
,即
# Left-merge data1 with the channel/time columns of data2, flag rows whose
# Time_B falls inside [before_time, after_time], then drop Time_B.
# The data2 columns are picked by name from a plain data.frame so the result
# does not depend on column order or on data2 having been converted with
# setDT() (where `[-1]` would drop a row instead of a column).
dfout <- subset(
  within(
    merge(
      data1,
      as.data.frame(data2)[c("channel_B", "Time_B")],
      by.x = "channel", by.y = "channel_B", all.x = TRUE
    ),
    # Channels without a data2 row get Time_B = NA and are reported as "No".
    Available <- ifelse(
      !is.na(Time_B) & Time_B >= before_time & Time_B <= after_time,
      "Yes", "No"
    )
  ),
  select = -Time_B
)
这样
> dfout
channel ID before_time after_time Available
1 A1 1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes
2 B1 2 2019-09-02 20:03:00 2019-09-02 20:23:00 No
3 C1 3 2019-09-02 20:23:00 2019-09-02 20:43:00 No
4 D1 4 2019-09-02 20:23:00 2019-09-02 20:43:00 No
查看 sqldf
也可能有意义,例如:
library(sqldf)
# Left-join data1 to data2 on equal channel with Time_B inside the
# [before_time, after_time] window; a data1 row without a match has a NULL
# ID_B and is reported as 'No'.
# SQL string literals use single quotes: SQLite accepts double-quoted strings
# only as a legacy fallback when no identifier of that name exists.
sqldf("SELECT t1.ID, t1.channel,
              t1.before_time, t1.after_time,
              CASE WHEN t2.ID_B IS NULL THEN 'No' ELSE 'Yes' END AS Available
       FROM data1 t1 LEFT JOIN data2 t2 ON t1.channel = t2.channel_B
       AND t2.Time_B BETWEEN t1.before_time AND t1.after_time")
输出:
ID channel before_time after_time Available
1 1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes
2 2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 No
3 3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 No
4 4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 No
这也是 dplyr
中的一种方法:
library(dplyr)

# Left-join on channel, mark each row "Yes" when a joined Time_B exists and
# lies inside [before_time, after_time], then remove the data2 columns.
data1 %>%
  left_join(data2, by = c("channel" = "channel_B")) %>%
  mutate(
    Available = if_else(
      is.na(Time_B) | Time_B < before_time | Time_B > after_time,
      "No", "Yes"
    )
  ) %>%
  select(-ends_with("_B"))
添加附加列:
# sqldf
# Same windowed left join as a plain Yes/No query, additionally returning
# ID_B, with 'x' substituted where no data2 row matched.
# SQL string literals use single quotes: SQLite accepts double-quoted strings
# only as a legacy fallback when no identifier of that name exists.
sqldf("SELECT t1.ID, t1.channel,
              t1.before_time, t1.after_time,
              CASE WHEN t2.ID_B IS NULL THEN 'No' ELSE 'Yes' END AS Available,
              CASE WHEN t2.ID_B IS NULL THEN 'x' ELSE t2.ID_B END AS ID_B
       FROM data1 t1 LEFT JOIN data2 t2 ON t1.channel = t2.channel_B
       AND t2.Time_B BETWEEN t1.before_time AND t1.after_time")
# dplyr
# Left-join on channel, derive the Yes/No flag from the time window, and keep
# ID_B only for matching rows ("x" otherwise); Time_B is dropped at the end.
data1 %>%
  left_join(data2, by = c("channel" = "channel_B")) %>%
  mutate(
    Available = if_else(
      is.na(Time_B) | Time_B < before_time | Time_B > after_time,
      "No", "Yes"
    ),
    ID_B = if_else(Available == "Yes", as.character(ID_B), "x")
  ) %>%
  select(-Time_B)
我正在努力将两个数据集相互组合。
Dataset1 contains a "before time", an "after time", and a "channel" column.
Dataset2 contains just one "time" column and a "channel" column as well.
我想使用以下逻辑向 Dataset1 添加一个二进制列 (Yes/No): 如果 Dataset2 中有一行,其中通道 == 通道,并且时间在 "before" 和 "after" 时间内,我想要 "YES"。否则 "NO"。
数据1
ID Channel before_time after_time
1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00
2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00
3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00
4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00
Data2
ID_B Channel_B Time_B
Hallo A1 2019-09-02 20:23:00
Hi B2 2019-09-02 20:05:00
Hoi C1 2019-09-02 22:23:00
期望输出
ID Channel before_time after_time Available
1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes # Channel == Channel, Time between before & after
2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 No # Channel != Channel
3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 No # Time is not between before & after
4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 No # There is no matching data where channel is D1
期望输出 2(注释解决方案)
从第二个数据集 (Data2) 添加额外的列。
ID Channel before_time after_time Available ID_B
1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes Hallo
2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 No x
3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 No x
4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 No x
可重现示例(数据):
# Reproducible example data.
# data1: one row per ID with a channel and a [before_time, after_time] window.
ID <- c("1", "2", "3", "4")
channel <- c("A1", "B1", "C1", "D1")
#startdate <- as.POSIXct(c("2019-09-02 20:23:00", "2019-09-02 20:13:00", "2019-09-02 20:33:00", "2019-09-02 20:33:00"))
before_time <- as.POSIXct(c("2019-09-02 20:13:00", "2019-09-02 20:03:00", "2019-09-02 20:23:00", "2019-09-02 20:23:00"))
after_time <- as.POSIXct(c("2019-09-02 20:33:00", "2019-09-02 20:23:00", "2019-09-02 20:43:00", "2019-09-02 20:43:00"))
data1 <- data.frame(ID, channel, before_time, after_time)
# View() opens a viewer window, so only call it in an interactive session.
if (interactive()) View(data1)

# data2: one row per event with an identifier, a channel and a single time.
ID_B <- c("Hallo", "Hi", "Hoi")
channel_B <- c("A1", "B2", "C1")
Time_B <- as.POSIXct(c("2019-09-02 20:23:00", "2019-09-02 20:05:00", "2019-09-02 22:23:00"))
data2 <- data.frame(ID_B, channel_B, Time_B)
if (interactive()) View(data2)
这个解决方案应该有效:
library(dplyr)

# cbind() pairs rows by position and requires equal row counts, so both
# frames are trimmed to their common length first (data1 has 4 rows, data2 3).
# NOTE(review): this compares row i of data1 only with row i of data2; it is
# not a join, so a match sitting on a different row is missed.
n_common <- min(nrow(data1), nrow(data2))
df <- cbind(data1[seq_len(n_common), ], data2[seq_len(n_common), ])

# Normalise column types so the comparisons below work on plain values.
df <- df %>%
  mutate(
    ID = as.integer(ID),
    channel = as.character(channel),
    ID_B = as.character(ID_B),
    channel_B = as.character(channel_B)
  )

# Flag rows whose paired data2 row has the same channel and a time inside
# the window, then drop every data2 column from the result.
df %>%
  mutate(
    available = ifelse(
      channel == channel_B & Time_B >= before_time & Time_B <= after_time,
      "yes", "no"
    )
  ) %>%
  select(-ID_B, -Time_B, -channel_B)
# A tibble: 3 x 5
ID channel before_time after_time available
<int> <chr> <dttm> <dttm> <chr>
1 1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 yes
2 2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 no
3 3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 no
正如 arg0naut91 所提到的,这里是 data.table
中的非等值连接:
library(data.table)
# Convert both data frames to data.tables by reference.
setDT(data1)
setDT(data2)
# Add two columns to data1 by reference, taken from a non-equi join against
# data2: rows pair up when channel_B equals channel and Time_B lies within
# [before_time, after_time].
data1[, c("Available", "ID_B") :=
# by=.EACHI evaluates j once per row of data1 (.SD): .N > 0 flags whether any
# data2 row matched, and ID_B carries the matched identifier.
data2[.SD, on=.(channel_B=channel, Time_B>=before_time, Time_B<=after_time),
# The join result leads with the three join columns; (1L:3L) := NULL drops
# them so only the flag and ID_B remain for the assignment.
by=.EACHI, .(.N > 0, ID_B)][, (1L:3L) := NULL]
# NOTE(review): presumably assumes at most one data2 match per data1 row,
# otherwise the result would have more rows than data1 -- confirm.
]
输出:
ID channel before_time after_time Available ID_B
1: 1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 TRUE Hallo
2: 2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 FALSE <NA>
3: 3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 FALSE <NA>
4: 4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 FALSE <NA>
这是一个基本的 R 解决方案,使用 merge
+ ifelse
,即
# Left-merge data1 with the channel/time columns of data2, flag rows whose
# Time_B falls inside [before_time, after_time], then drop Time_B.
# The data2 columns are picked by name from a plain data.frame so the result
# does not depend on column order or on data2 having been converted with
# setDT() (where `[-1]` would drop a row instead of a column).
dfout <- subset(
  within(
    merge(
      data1,
      as.data.frame(data2)[c("channel_B", "Time_B")],
      by.x = "channel", by.y = "channel_B", all.x = TRUE
    ),
    # Channels without a data2 row get Time_B = NA and are reported as "No".
    Available <- ifelse(
      !is.na(Time_B) & Time_B >= before_time & Time_B <= after_time,
      "Yes", "No"
    )
  ),
  select = -Time_B
)
这样
> dfout
channel ID before_time after_time Available
1 A1 1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes
2 B1 2 2019-09-02 20:03:00 2019-09-02 20:23:00 No
3 C1 3 2019-09-02 20:23:00 2019-09-02 20:43:00 No
4 D1 4 2019-09-02 20:23:00 2019-09-02 20:43:00 No
查看 sqldf
也可能有意义,例如:
library(sqldf)
# Left-join data1 to data2 on equal channel with Time_B inside the
# [before_time, after_time] window; a data1 row without a match has a NULL
# ID_B and is reported as 'No'.
# SQL string literals use single quotes: SQLite accepts double-quoted strings
# only as a legacy fallback when no identifier of that name exists.
sqldf("SELECT t1.ID, t1.channel,
              t1.before_time, t1.after_time,
              CASE WHEN t2.ID_B IS NULL THEN 'No' ELSE 'Yes' END AS Available
       FROM data1 t1 LEFT JOIN data2 t2 ON t1.channel = t2.channel_B
       AND t2.Time_B BETWEEN t1.before_time AND t1.after_time")
输出:
ID channel before_time after_time Available
1 1 A1 2019-09-02 20:13:00 2019-09-02 20:33:00 Yes
2 2 B1 2019-09-02 20:03:00 2019-09-02 20:23:00 No
3 3 C1 2019-09-02 20:23:00 2019-09-02 20:43:00 No
4 4 D1 2019-09-02 20:23:00 2019-09-02 20:43:00 No
这也是 dplyr
中的一种方法:
library(dplyr)

# Left-join on channel, mark each row "Yes" when a joined Time_B exists and
# lies inside [before_time, after_time], then remove the data2 columns.
data1 %>%
  left_join(data2, by = c("channel" = "channel_B")) %>%
  mutate(
    Available = if_else(
      is.na(Time_B) | Time_B < before_time | Time_B > after_time,
      "No", "Yes"
    )
  ) %>%
  select(-ends_with("_B"))
添加附加列:
# sqldf
# Same windowed left join as a plain Yes/No query, additionally returning
# ID_B, with 'x' substituted where no data2 row matched.
# SQL string literals use single quotes: SQLite accepts double-quoted strings
# only as a legacy fallback when no identifier of that name exists.
sqldf("SELECT t1.ID, t1.channel,
              t1.before_time, t1.after_time,
              CASE WHEN t2.ID_B IS NULL THEN 'No' ELSE 'Yes' END AS Available,
              CASE WHEN t2.ID_B IS NULL THEN 'x' ELSE t2.ID_B END AS ID_B
       FROM data1 t1 LEFT JOIN data2 t2 ON t1.channel = t2.channel_B
       AND t2.Time_B BETWEEN t1.before_time AND t1.after_time")
# dplyr
# Left-join on channel, derive the Yes/No flag from the time window, and keep
# ID_B only for matching rows ("x" otherwise); Time_B is dropped at the end.
data1 %>%
  left_join(data2, by = c("channel" = "channel_B")) %>%
  mutate(
    Available = if_else(
      is.na(Time_B) | Time_B < before_time | Time_B > after_time,
      "No", "Yes"
    ),
    ID_B = if_else(Available == "Yes", as.character(ID_B), "x")
  ) %>%
  select(-Time_B)