按间隔切割并在 R 中聚合超过一个月
cut by interval and aggregate over one month in R
我有给定的数据 - 2013 年 10 月从特定站点开始的所有自行车旅行。我想计算十分钟时间间隔内发生的旅行次数。总共应该有 144 行,其中包含整个月在该时间间隔内发生的所有行程的总和。如何切割 data.frame 然后按间隔聚合(以便在 00:00:01 和 00:10:00 之间发生的行程计入第二行,在 00:10:01 和 [=19= 之间] 算在第三行,依此类推...)?
head(one.station)
tripduration starttime stoptime start.station.id start.station.name
59 803 2013-10-01 00:11:49 2013-10-01 00:25:12 521 8 Ave & W 31 St
208 445 2013-10-01 00:40:05 2013-10-01 00:47:30 521 8 Ave & W 31 St
359 643 2013-10-01 01:25:57 2013-10-01 01:36:40 521 8 Ave & W 31 St
635 388 2013-10-01 05:30:30 2013-10-01 05:36:58 521 8 Ave & W 31 St
661 314 2013-10-01 05:38:00 2013-10-01 05:43:14 521 8 Ave & W 31 St
768 477 2013-10-01 05:54:49 2013-10-01 06:02:46 521 8 Ave & W 31 St
start.station.latitude start.station.longitude end.station.id end.station.name
59 40.75045 -73.99481 2003 1 Ave & E 18 St
208 40.75045 -73.99481 505 6 Ave & W 33 St
359 40.75045 -73.99481 508 W 46 St & 11 Ave
635 40.75045 -73.99481 459 W 20 St & 11 Ave
661 40.75045 -73.99481 462 W 22 St & 10 Ave
768 40.75045 -73.99481 457 Broadway & W 58 St
end.station.latitude end.station.longitude bikeid usertype birth.year gender
59 40.73416 -73.98024 15139 Subscriber 1985 1
208 40.74901 -73.98848 20538 Subscriber 1990 2
359 40.76341 -73.99667 19935 Customer \N 0
635 40.74674 -74.00776 14781 Subscriber 1955 1
661 40.74692 -74.00452 17976 Subscriber 1982 1
768 40.76695 -73.98169 19022 Subscriber 1973 1
所以输出看起来像这样
output
interval total_trips
1 00:00:00 0
2 00:10:00 1
3 00:20:00 2
4 00:30:00 3
5 00:40:00 4
lubridate
库可以提供一种解决方案。它有一个很好的区间重叠逻辑函数。下面使用 lapply
遍历数据中提供的间隔,然后相应地对它们进行分桶。
library(lubridate)
start_times <- as.POSIXlt(
c("2013-10-01 00:11:49"
,"2013-10-01 00:40:05"
,"2013-10-01 01:25:57"
,"2013-10-01 05:30:30"
,"2013-10-01 05:38:00"
,"2013-10-01 05:54:49")
)
stop_times <- as.POSIXlt(
c("2013-10-01 00:25:12"
,"2013-10-01 00:47:30"
,"2013-10-01 01:36:40"
,"2013-10-01 05:36:58"
,"2013-10-01 05:43:14"
,"2013-10-01 06:02:46")
)
start_bucket <- seq(as.POSIXct("2013-10-01 00:00:00"), as.POSIXct("2013-10-01 06:0:00"), by = 600)
end_bucket <- start_bucket + 600
bucket_interval <- interval(start_bucket, end_bucket)
data_interval <- interval(start_times, stop_times)
int_list <- lapply(data_interval, function(x) ifelse(int_overlaps(x, bucket_interval),1,0))
rides_per_bucket <- rowSums(do.call(cbind, int_list))
out_df <- data.frame(bucket_interval, rides_per_bucket)
out_df
bucket_interval rides_per_bucket
1 2013-10-01 00:00:00 PDT--2013-10-01 00:10:00 PDT 0
2 2013-10-01 00:10:00 PDT--2013-10-01 00:20:00 PDT 1
3 2013-10-01 00:20:00 PDT--2013-10-01 00:30:00 PDT 1
4 2013-10-01 00:30:00 PDT--2013-10-01 00:40:00 PDT 0
5 2013-10-01 00:40:00 PDT--2013-10-01 00:50:00 PDT 1
6 2013-10-01 00:50:00 PDT--2013-10-01 01:00:00 PDT 0
7 2013-10-01 01:00:00 PDT--2013-10-01 01:10:00 PDT 0
8 2013-10-01 01:10:00 PDT--2013-10-01 01:20:00 PDT 0
9 2013-10-01 01:20:00 PDT--2013-10-01 01:30:00 PDT 1
10 2013-10-01 01:30:00 PDT--2013-10-01 01:40:00 PDT 1
11 2013-10-01 01:40:00 PDT--2013-10-01 01:50:00 PDT 0
12 2013-10-01 01:50:00 PDT--2013-10-01 02:00:00 PDT 0
13 2013-10-01 02:00:00 PDT--2013-10-01 02:10:00 PDT 0
14 2013-10-01 02:10:00 PDT--2013-10-01 02:20:00 PDT 0
15 2013-10-01 02:20:00 PDT--2013-10-01 02:30:00 PDT 0
16 2013-10-01 02:30:00 PDT--2013-10-01 02:40:00 PDT 0
17 2013-10-01 02:40:00 PDT--2013-10-01 02:50:00 PDT 0
18 2013-10-01 02:50:00 PDT--2013-10-01 03:00:00 PDT 0
19 2013-10-01 03:00:00 PDT--2013-10-01 03:10:00 PDT 0
20 2013-10-01 03:10:00 PDT--2013-10-01 03:20:00 PDT 0
21 2013-10-01 03:20:00 PDT--2013-10-01 03:30:00 PDT 0
22 2013-10-01 03:30:00 PDT--2013-10-01 03:40:00 PDT 0
23 2013-10-01 03:40:00 PDT--2013-10-01 03:50:00 PDT 0
24 2013-10-01 03:50:00 PDT--2013-10-01 04:00:00 PDT 0
25 2013-10-01 04:00:00 PDT--2013-10-01 04:10:00 PDT 0
26 2013-10-01 04:10:00 PDT--2013-10-01 04:20:00 PDT 0
27 2013-10-01 04:20:00 PDT--2013-10-01 04:30:00 PDT 0
28 2013-10-01 04:30:00 PDT--2013-10-01 04:40:00 PDT 0
29 2013-10-01 04:40:00 PDT--2013-10-01 04:50:00 PDT 0
30 2013-10-01 04:50:00 PDT--2013-10-01 05:00:00 PDT 0
31 2013-10-01 05:00:00 PDT--2013-10-01 05:10:00 PDT 0
32 2013-10-01 05:10:00 PDT--2013-10-01 05:20:00 PDT 0
33 2013-10-01 05:20:00 PDT--2013-10-01 05:30:00 PDT 0
34 2013-10-01 05:30:00 PDT--2013-10-01 05:40:00 PDT 2
35 2013-10-01 05:40:00 PDT--2013-10-01 05:50:00 PDT 1
36 2013-10-01 05:50:00 PDT--2013-10-01 06:00:00 PDT 1
37 2013-10-01 06:00:00 PDT--2013-10-01 06:10:00 PDT 1
此处仅使用开始时间:
library(lubridate)
library(dplyr)
tripduration <- floor(runif(6) * 1000)
start_times <- as.POSIXlt(
c("2013-10-01 00:11:49"
,"2013-10-01 00:40:05"
,"2013-10-01 01:25:57"
,"2013-10-01 05:30:30"
,"2013-10-01 05:38:00"
,"2013-10-01 05:54:49")
)
time_bucket <- start_times - minutes(minute(start_times) %% 10) - seconds(second(start_times))
df <- data.frame(tripduration, start_times, time_bucket)
summarized <- df %>%
group_by(time_bucket) %>%
summarize(trip_count = n())
summarized <- as.data.frame(summarized)
out_buckets <- data.frame(out_buckets = seq(as.POSIXlt("2013-10-01 00:00:00"), as.POSIXct("2013-10-01 06:0:00"), by = 600))
out <- left_join(out_buckets, summarized, by = c("out_buckets" = "time_bucket"))
out$trip_count[is.na(out$trip_count)] <- 0
out
out_buckets trip_count
1 2013-10-01 00:00:00 0
2 2013-10-01 00:10:00 1
3 2013-10-01 00:20:00 0
4 2013-10-01 00:30:00 0
5 2013-10-01 00:40:00 1
6 2013-10-01 00:50:00 0
7 2013-10-01 01:00:00 0
8 2013-10-01 01:10:00 0
9 2013-10-01 01:20:00 1
10 2013-10-01 01:30:00 0
11 2013-10-01 01:40:00 0
12 2013-10-01 01:50:00 0
13 2013-10-01 02:00:00 0
14 2013-10-01 02:10:00 0
15 2013-10-01 02:20:00 0
16 2013-10-01 02:30:00 0
17 2013-10-01 02:40:00 0
18 2013-10-01 02:50:00 0
19 2013-10-01 03:00:00 0
20 2013-10-01 03:10:00 0
21 2013-10-01 03:20:00 0
22 2013-10-01 03:30:00 0
23 2013-10-01 03:40:00 0
24 2013-10-01 03:50:00 0
25 2013-10-01 04:00:00 0
26 2013-10-01 04:10:00 0
27 2013-10-01 04:20:00 0
28 2013-10-01 04:30:00 0
29 2013-10-01 04:40:00 0
30 2013-10-01 04:50:00 0
31 2013-10-01 05:00:00 0
32 2013-10-01 05:10:00 0
33 2013-10-01 05:20:00 0
34 2013-10-01 05:30:00 2
35 2013-10-01 05:40:00 0
36 2013-10-01 05:50:00 1
37 2013-10-01 06:00:00 0
我有给定的数据 - 2013 年 10 月从特定站点开始的所有自行车旅行。我想计算十分钟时间间隔内发生的旅行次数。总共应该有 144 行,其中包含整个月在该时间间隔内发生的所有行程的总和。如何切割 data.frame 然后按间隔聚合(以便在 00:00:01 和 00:10:00 之间发生的行程计入第二行,在 00:10:01 和 [=19= 之间] 算在第三行,依此类推...)?
head(one.station) tripduration starttime stoptime start.station.id start.station.name 59 803 2013-10-01 00:11:49 2013-10-01 00:25:12 521 8 Ave & W 31 St 208 445 2013-10-01 00:40:05 2013-10-01 00:47:30 521 8 Ave & W 31 St 359 643 2013-10-01 01:25:57 2013-10-01 01:36:40 521 8 Ave & W 31 St 635 388 2013-10-01 05:30:30 2013-10-01 05:36:58 521 8 Ave & W 31 St 661 314 2013-10-01 05:38:00 2013-10-01 05:43:14 521 8 Ave & W 31 St 768 477 2013-10-01 05:54:49 2013-10-01 06:02:46 521 8 Ave & W 31 St start.station.latitude start.station.longitude end.station.id end.station.name 59 40.75045 -73.99481 2003 1 Ave & E 18 St 208 40.75045 -73.99481 505 6 Ave & W 33 St 359 40.75045 -73.99481 508 W 46 St & 11 Ave 635 40.75045 -73.99481 459 W 20 St & 11 Ave 661 40.75045 -73.99481 462 W 22 St & 10 Ave 768 40.75045 -73.99481 457 Broadway & W 58 St end.station.latitude end.station.longitude bikeid usertype birth.year gender 59 40.73416 -73.98024 15139 Subscriber 1985 1 208 40.74901 -73.98848 20538 Subscriber 1990 2 359 40.76341 -73.99667 19935 Customer \N 0 635 40.74674 -74.00776 14781 Subscriber 1955 1 661 40.74692 -74.00452 17976 Subscriber 1982 1 768 40.76695 -73.98169 19022 Subscriber 1973 1
所以输出看起来像这样
output interval total_trips 1 00:00:00 0 2 00:10:00 1 3 00:20:00 2 4 00:30:00 3 5 00:40:00 4
lubridate
库可以提供一种解决方案。它有一个很好的区间重叠逻辑函数。下面使用 lapply
遍历数据中提供的间隔,然后相应地对它们进行分桶。
library(lubridate)
start_times <- as.POSIXlt(
c("2013-10-01 00:11:49"
,"2013-10-01 00:40:05"
,"2013-10-01 01:25:57"
,"2013-10-01 05:30:30"
,"2013-10-01 05:38:00"
,"2013-10-01 05:54:49")
)
stop_times <- as.POSIXlt(
c("2013-10-01 00:25:12"
,"2013-10-01 00:47:30"
,"2013-10-01 01:36:40"
,"2013-10-01 05:36:58"
,"2013-10-01 05:43:14"
,"2013-10-01 06:02:46")
)
start_bucket <- seq(as.POSIXct("2013-10-01 00:00:00"), as.POSIXct("2013-10-01 06:0:00"), by = 600)
end_bucket <- start_bucket + 600
bucket_interval <- interval(start_bucket, end_bucket)
data_interval <- interval(start_times, stop_times)
int_list <- lapply(data_interval, function(x) ifelse(int_overlaps(x, bucket_interval),1,0))
rides_per_bucket <- rowSums(do.call(cbind, int_list))
out_df <- data.frame(bucket_interval, rides_per_bucket)
out_df
bucket_interval rides_per_bucket
1 2013-10-01 00:00:00 PDT--2013-10-01 00:10:00 PDT 0
2 2013-10-01 00:10:00 PDT--2013-10-01 00:20:00 PDT 1
3 2013-10-01 00:20:00 PDT--2013-10-01 00:30:00 PDT 1
4 2013-10-01 00:30:00 PDT--2013-10-01 00:40:00 PDT 0
5 2013-10-01 00:40:00 PDT--2013-10-01 00:50:00 PDT 1
6 2013-10-01 00:50:00 PDT--2013-10-01 01:00:00 PDT 0
7 2013-10-01 01:00:00 PDT--2013-10-01 01:10:00 PDT 0
8 2013-10-01 01:10:00 PDT--2013-10-01 01:20:00 PDT 0
9 2013-10-01 01:20:00 PDT--2013-10-01 01:30:00 PDT 1
10 2013-10-01 01:30:00 PDT--2013-10-01 01:40:00 PDT 1
11 2013-10-01 01:40:00 PDT--2013-10-01 01:50:00 PDT 0
12 2013-10-01 01:50:00 PDT--2013-10-01 02:00:00 PDT 0
13 2013-10-01 02:00:00 PDT--2013-10-01 02:10:00 PDT 0
14 2013-10-01 02:10:00 PDT--2013-10-01 02:20:00 PDT 0
15 2013-10-01 02:20:00 PDT--2013-10-01 02:30:00 PDT 0
16 2013-10-01 02:30:00 PDT--2013-10-01 02:40:00 PDT 0
17 2013-10-01 02:40:00 PDT--2013-10-01 02:50:00 PDT 0
18 2013-10-01 02:50:00 PDT--2013-10-01 03:00:00 PDT 0
19 2013-10-01 03:00:00 PDT--2013-10-01 03:10:00 PDT 0
20 2013-10-01 03:10:00 PDT--2013-10-01 03:20:00 PDT 0
21 2013-10-01 03:20:00 PDT--2013-10-01 03:30:00 PDT 0
22 2013-10-01 03:30:00 PDT--2013-10-01 03:40:00 PDT 0
23 2013-10-01 03:40:00 PDT--2013-10-01 03:50:00 PDT 0
24 2013-10-01 03:50:00 PDT--2013-10-01 04:00:00 PDT 0
25 2013-10-01 04:00:00 PDT--2013-10-01 04:10:00 PDT 0
26 2013-10-01 04:10:00 PDT--2013-10-01 04:20:00 PDT 0
27 2013-10-01 04:20:00 PDT--2013-10-01 04:30:00 PDT 0
28 2013-10-01 04:30:00 PDT--2013-10-01 04:40:00 PDT 0
29 2013-10-01 04:40:00 PDT--2013-10-01 04:50:00 PDT 0
30 2013-10-01 04:50:00 PDT--2013-10-01 05:00:00 PDT 0
31 2013-10-01 05:00:00 PDT--2013-10-01 05:10:00 PDT 0
32 2013-10-01 05:10:00 PDT--2013-10-01 05:20:00 PDT 0
33 2013-10-01 05:20:00 PDT--2013-10-01 05:30:00 PDT 0
34 2013-10-01 05:30:00 PDT--2013-10-01 05:40:00 PDT 2
35 2013-10-01 05:40:00 PDT--2013-10-01 05:50:00 PDT 1
36 2013-10-01 05:50:00 PDT--2013-10-01 06:00:00 PDT 1
37 2013-10-01 06:00:00 PDT--2013-10-01 06:10:00 PDT 1
此处仅使用开始时间:
library(lubridate)
library(dplyr)
tripduration <- floor(runif(6) * 1000)
start_times <- as.POSIXlt(
c("2013-10-01 00:11:49"
,"2013-10-01 00:40:05"
,"2013-10-01 01:25:57"
,"2013-10-01 05:30:30"
,"2013-10-01 05:38:00"
,"2013-10-01 05:54:49")
)
time_bucket <- start_times - minutes(minute(start_times) %% 10) - seconds(second(start_times))
df <- data.frame(tripduration, start_times, time_bucket)
summarized <- df %>%
group_by(time_bucket) %>%
summarize(trip_count = n())
summarized <- as.data.frame(summarized)
out_buckets <- data.frame(out_buckets = seq(as.POSIXlt("2013-10-01 00:00:00"), as.POSIXct("2013-10-01 06:0:00"), by = 600))
out <- left_join(out_buckets, summarized, by = c("out_buckets" = "time_bucket"))
out$trip_count[is.na(out$trip_count)] <- 0
out
out_buckets trip_count
1 2013-10-01 00:00:00 0
2 2013-10-01 00:10:00 1
3 2013-10-01 00:20:00 0
4 2013-10-01 00:30:00 0
5 2013-10-01 00:40:00 1
6 2013-10-01 00:50:00 0
7 2013-10-01 01:00:00 0
8 2013-10-01 01:10:00 0
9 2013-10-01 01:20:00 1
10 2013-10-01 01:30:00 0
11 2013-10-01 01:40:00 0
12 2013-10-01 01:50:00 0
13 2013-10-01 02:00:00 0
14 2013-10-01 02:10:00 0
15 2013-10-01 02:20:00 0
16 2013-10-01 02:30:00 0
17 2013-10-01 02:40:00 0
18 2013-10-01 02:50:00 0
19 2013-10-01 03:00:00 0
20 2013-10-01 03:10:00 0
21 2013-10-01 03:20:00 0
22 2013-10-01 03:30:00 0
23 2013-10-01 03:40:00 0
24 2013-10-01 03:50:00 0
25 2013-10-01 04:00:00 0
26 2013-10-01 04:10:00 0
27 2013-10-01 04:20:00 0
28 2013-10-01 04:30:00 0
29 2013-10-01 04:40:00 0
30 2013-10-01 04:50:00 0
31 2013-10-01 05:00:00 0
32 2013-10-01 05:10:00 0
33 2013-10-01 05:20:00 0
34 2013-10-01 05:30:00 2
35 2013-10-01 05:40:00 0
36 2013-10-01 05:50:00 1
37 2013-10-01 06:00:00 0