使用 dplyr 进行具有自适应范围的按行操作
Rowwise operation with adaptive range using dplyr
,我想计算 colocation(即两个人同时出现)给定智能卡数据的实例。这是一个由十条记录组成的虚构样本:
library(lubridate)
smartcard <- c(1,2,3,2,1,2,4,4,1,1)
boarding_stop <- c("C23", "C14", "C23", "C23", "C23", "C14", "C14", "C23", "C14", "C23")
boarding_time <- as.times(c("07:24:01", "07:26:18", "07:37:19", "08:29:22", "08:34:10", "15:55:23",
"16:20:22", "17:07:31", "17:13:34", "17:35:52"))
colocation <- data.frame(smartcard, boarding_time, boarding_stop)
colocation
smartcard boarding_time boarding_stop
1 1 07:24:01 C23
2 2 07:26:18 C14
3 3 07:37:19 C23
4 2 08:29:22 C23
5 1 08:34:10 C23
6 2 15:55:23 C14
7 4 16:20:22 C14
8 4 17:07:31 C23
9 1 17:13:34 C14
10 1 17:35:52 C23
给定 30 分钟的共置缓冲区(即到达 07:24 的乘客 1 将 与另一位乘客 07:54 之前到达的乘客共置 ,我想记录成对乘客满足这个条件的所有实例,记录boarding_stop
、boarding_time
和他们的smartcard
ID。
例如,我会发现乘客 1 和 3 在 07:37:19 的 C23
并置。最终,我想要
形式的输出
boarding_stop boarding_time smartcard1 smartcard2
C23 07:37:19 1 3
C23 08:34:10 2 1
C23 07:35:52 4 1
C14 16:20:22 2 4
我之前的尝试是通过几个 for
循环来编写代码,这些循环查找单独的旅行信息对,并确定两次旅行是否在半小时间隔内在火车站进行了记录。一旦找到,然后附加一个新行,其中包含有关时间、智能卡乘客和位置的信息。
Output<- read.table(text = "boarding_stop boarding_time smartcard1 smartcard2", header = TRUE)
for s in unique(colocaion$boarding_stop):
for i in 1:nrow(colocation):
for j in 1:nrow(colocation):
if colocation$boarding_time[[j,2]] <= colocation$boarding_time[[i,2]] + "00:30:00" &
colocation$boarding_time[[j,2]] >= colocation$boarding_time[[i,2]]:
Output %>% add_row(boarding_stop = colocation$boarding_stop[[j,3]],
boarding_time = colocation$boarding_time[[j,2]],
smartcard1 = colocation$smartcard[[i,1]],
smartcard2 = colocation$smartcard[[j,1]])
end
end
end
我最初使用 dplyr
的方法涉及 group_by
首先对唯一站点进行分组。但是由于半小时的缓冲时间会随着每对行程而变化,我认为我不能简单地 mutate
和 summarise
来捕获 colocation。感谢@Matt 在 中的回答。对此的任何帮助将不胜感激。
编辑:dplyr
解决方案
#Change to timestamp and create time range
dt <- dt %>%
mutate(boarding_time = parse_date_time(boarding_time,orders = "HMS"),
boardtime_time_plus=boarding_time+hm("00:30"),
boardtime_time_minus=boarding_time-hm("00:30"))
# cartesian join within each boarding_stop and then filter
dt %>%
mutate(fake_col=TRUE) %>%
left_join(dt %>% mutate(fake_col=TRUE),by=c("fake_col","boarding_stop")) %>%
group_by(boarding_stop) %>%
ungroup() %>%
filter(smartcard.x!=smartcard.y,boardtime_time_minus.x<=boarding_time.y,boardtime_time_plus.x>=boarding_time.y) %>%
select(boarding_stop,boarding_time=boarding_time.x,smartcard1=smartcard.x,smartcard2=smartcard.y) %>%
group_by(paste0(boarding_stop,"-",(smartcard1+smartcard2))) %>%
filter(boarding_time==max(boarding_time)) %>%
ungroup() %>%
mutate(boarding_time=format(boarding_time,"%H:%M:%S")) %>%
select(-5)
#> # A tibble: 4 x 4
#> boarding_stop boarding_time smartcard1 smartcard2
#> <chr> <chr> <int> <int>
#> 1 C23 07:37:19 3 1
#> 2 C23 08:34:10 1 2
#> 3 C14 16:20:22 4 2
#> 4 C23 17:35:52 1 4
这是一个data.table
解决方案。我不熟悉 dplyr
所以我想你需要玩 filter
才能做到这一点。
library(data.table)
library(lubridate)
dt <- fread('smartcard boarding_time boarding_stop
1 07:24:01 C23
2 07:26:18 C14
3 07:37:19 C23
2 08:29:22 C23
1 08:34:10 C23
2 15:55:23 C14
4 16:20:22 C14
4 17:07:31 C23
1 17:13:34 C14
1 17:35:52 C23')
#Change to timestamp
dt[,boarding_time:=parse_date_time(boarding_time,orders = "HMS")]
#Create time range
dt[,`:=`(boardtime_time_plus=boarding_time+hm("00:30"),
boardtime_time_minus=boarding_time-hm("00:30"))]
#non equal join and excluding joined on itself
dtd <- dt[dt,on=.(boarding_stop,boardtime_time_minus<=boarding_time,boardtime_time_plus>=boarding_time)][smartcard!=i.smartcard,]
# a bit format and select the max datetime for each combination
# there definitely should have elegant way to do this but i havent figured out
dtd[,.(boarding_stop,boarding_time = format(boarding_time,"%H:%M:%S"),smartcard1=smartcard,smartcard2=i.smartcard)][
dtd[,.I[boarding_time==max(boarding_time)],by=.(paste0(boarding_stop,"-",(smartcard1+smartcard2)))]$V1,]
#> boarding_stop boarding_time smartcard1 smartcard2
#> 1: C23 07:37:19 3 1
#> 2: C23 08:34:10 1 2
#> 3: C14 16:20:22 4 2
#> 4: C23 17:35:52 1 4
由 reprex package (v0.3.0)
于 2020-04-25 创建
library(lubridate)
smartcard <- c(1,2,3,2,1,2,4,4,1,1)
boarding_stop <- c("C23", "C14", "C23", "C23", "C23", "C14", "C14", "C23", "C14", "C23")
boarding_time <- as.times(c("07:24:01", "07:26:18", "07:37:19", "08:29:22", "08:34:10", "15:55:23",
"16:20:22", "17:07:31", "17:13:34", "17:35:52"))
colocation <- data.frame(smartcard, boarding_time, boarding_stop)
colocation
smartcard boarding_time boarding_stop
1 1 07:24:01 C23
2 2 07:26:18 C14
3 3 07:37:19 C23
4 2 08:29:22 C23
5 1 08:34:10 C23
6 2 15:55:23 C14
7 4 16:20:22 C14
8 4 17:07:31 C23
9 1 17:13:34 C14
10 1 17:35:52 C23
给定 30 分钟的共置缓冲区(即到达 07:24 的乘客 1 将 与另一位乘客 07:54 之前到达的乘客共置 ,我想记录成对乘客满足这个条件的所有实例,记录boarding_stop
、boarding_time
和他们的smartcard
ID。
例如,我会发现乘客 1 和 3 在 07:37:19 的 C23
并置。最终,我想要
boarding_stop boarding_time smartcard1 smartcard2
C23 07:37:19 1 3
C23 08:34:10 2 1
C23 07:35:52 4 1
C14 16:20:22 2 4
我之前的尝试是通过几个 for
循环来编写代码,这些循环查找单独的旅行信息对,并确定两次旅行是否在半小时间隔内在火车站进行了记录。一旦找到,然后附加一个新行,其中包含有关时间、智能卡乘客和位置的信息。
Output<- read.table(text = "boarding_stop boarding_time smartcard1 smartcard2", header = TRUE)
for s in unique(colocaion$boarding_stop):
for i in 1:nrow(colocation):
for j in 1:nrow(colocation):
if colocation$boarding_time[[j,2]] <= colocation$boarding_time[[i,2]] + "00:30:00" &
colocation$boarding_time[[j,2]] >= colocation$boarding_time[[i,2]]:
Output %>% add_row(boarding_stop = colocation$boarding_stop[[j,3]],
boarding_time = colocation$boarding_time[[j,2]],
smartcard1 = colocation$smartcard[[i,1]],
smartcard2 = colocation$smartcard[[j,1]])
end
end
end
我最初使用 dplyr
的方法涉及 group_by
首先对唯一站点进行分组。但是由于半小时的缓冲时间会随着每对行程而变化,我认为我不能简单地 mutate
和 summarise
来捕获 colocation。感谢@Matt 在
编辑:dplyr
解决方案
#Change to timestamp and create time range
dt <- dt %>%
mutate(boarding_time = parse_date_time(boarding_time,orders = "HMS"),
boardtime_time_plus=boarding_time+hm("00:30"),
boardtime_time_minus=boarding_time-hm("00:30"))
# cartesian join within each boarding_stop and then filter
dt %>%
mutate(fake_col=TRUE) %>%
left_join(dt %>% mutate(fake_col=TRUE),by=c("fake_col","boarding_stop")) %>%
group_by(boarding_stop) %>%
ungroup() %>%
filter(smartcard.x!=smartcard.y,boardtime_time_minus.x<=boarding_time.y,boardtime_time_plus.x>=boarding_time.y) %>%
select(boarding_stop,boarding_time=boarding_time.x,smartcard1=smartcard.x,smartcard2=smartcard.y) %>%
group_by(paste0(boarding_stop,"-",(smartcard1+smartcard2))) %>%
filter(boarding_time==max(boarding_time)) %>%
ungroup() %>%
mutate(boarding_time=format(boarding_time,"%H:%M:%S")) %>%
select(-5)
#> # A tibble: 4 x 4
#> boarding_stop boarding_time smartcard1 smartcard2
#> <chr> <chr> <int> <int>
#> 1 C23 07:37:19 3 1
#> 2 C23 08:34:10 1 2
#> 3 C14 16:20:22 4 2
#> 4 C23 17:35:52 1 4
这是一个data.table
解决方案。我不熟悉 dplyr
所以我想你需要玩 filter
才能做到这一点。
library(data.table)
library(lubridate)
dt <- fread('smartcard boarding_time boarding_stop
1 07:24:01 C23
2 07:26:18 C14
3 07:37:19 C23
2 08:29:22 C23
1 08:34:10 C23
2 15:55:23 C14
4 16:20:22 C14
4 17:07:31 C23
1 17:13:34 C14
1 17:35:52 C23')
#Change to timestamp
dt[,boarding_time:=parse_date_time(boarding_time,orders = "HMS")]
#Create time range
dt[,`:=`(boardtime_time_plus=boarding_time+hm("00:30"),
boardtime_time_minus=boarding_time-hm("00:30"))]
#non equal join and excluding joined on itself
dtd <- dt[dt,on=.(boarding_stop,boardtime_time_minus<=boarding_time,boardtime_time_plus>=boarding_time)][smartcard!=i.smartcard,]
# a bit format and select the max datetime for each combination
# there definitely should have elegant way to do this but i havent figured out
dtd[,.(boarding_stop,boarding_time = format(boarding_time,"%H:%M:%S"),smartcard1=smartcard,smartcard2=i.smartcard)][
dtd[,.I[boarding_time==max(boarding_time)],by=.(paste0(boarding_stop,"-",(smartcard1+smartcard2)))]$V1,]
#> boarding_stop boarding_time smartcard1 smartcard2
#> 1: C23 07:37:19 3 1
#> 2: C23 08:34:10 1 2
#> 3: C14 16:20:22 4 2
#> 4: C23 17:35:52 1 4
由 reprex package (v0.3.0)
于 2020-04-25 创建