根据 R 中的日期和小时以 15 分钟的间隔聚合数据
Aggregating data at 15 minutes interval based on date and hour in R
我有数据如下
# Sample observations: timestamps stored as "YYYY-MM-DD H:MM" text, paired
# one-to-one with a Distance_m reading.
Time <- c(
  "2021-08-30 7:24", "2021-08-30 7:30", "2021-08-30 7:54",
  "2021-08-30 8:16", "2021-08-30 8:27", "2021-08-30 8:22",
  "2021-08-31 2:39", "2021-08-31 2:44", "2021-08-31 2:50",
  "2021-08-31 2:56", "2021-08-31 7:42", "2021-08-31 7:45",
  "2021-08-31 7:50", "2021-08-31 6:02"
)
Distance_m <- c(
  162, 162, 162, 162, 162, 162, 162,
  157, 150, 137, 122, 102, 78, 42
)
# Column names are spelled out; they match the vector names used above.
df <- data.frame(Time = Time, Distance_m = Distance_m)
df
Time Distance_m
1 2021-08-30 7:24 162
2 2021-08-30 7:30 162
3 2021-08-30 7:54 162
4 2021-08-30 8:16 162
5 2021-08-30 8:27 162
6 2021-08-30 8:22 162
7 2021-08-31 2:39 162
8 2021-08-31 2:44 157
9 2021-08-31 2:50 150
10 2021-08-31 2:56 137
11 2021-08-31 7:42 122
12 2021-08-31 7:45 102
13 2021-08-31 7:50 78
14 2021-08-31 6:02 42
我想根据日期和小时以 15 分钟为间隔对 Distance_m 求和。
我期待如下输出
Date Hour Time Distance_m
2021-08-30 7 54 486
2021-08-30 8 30 486
2021-08-31 2 56 606
2021-08-31 6 2 344
到目前为止我已经试过了
# Asker's attempt: bucket rows into 15-minute slots and sum Distance_m per slot.
# NOTE(review): %>%, mutate, group_by, summarize, last and hm are used
# unqualified; presumably dplyr and lubridate are attached elsewhere -- confirm.
# Split the "date time" text at the space into separate Date and Time columns.
df <- tidyr::separate(df, Time, c("Date", "Time"), sep = " ")
df1<- df %>%
# Parse the "H:MM" text so that hour() and minute() can be applied to it.
mutate(Time = hm(Time)) %>%
# ttt = minutes elapsed since midnight.
mutate(ttt= (lubridate::minute(Time) + lubridate::hour(Time) * 60)) %>%
# tt = index of the 15-minute slot within the day.
mutate(tt = floor(ttt/15) ) %>%
# Grouping uses tt only; Date is not a grouping key, so rows from different
# dates that share a slot index (e.g. 08-30 7:30 and 08-31 7:42, both slot 30)
# are summed into one row.
group_by(tt) %>%
# Keep the last Date/Time seen in each group next to the summed distance.
summarize(Date = last(Date),Time = last(Time), Distance_m = sum(Distance_m))
但是输出有点乱。我希望在处理大量数据时找到一种有效的方法。
谢谢
虽然没有完全给出您的预期结果,但也许可用。
您可以看看这是否符合您的需求。
library(data.table)
# Convert df to a data.table so the := assignments below update it in place.
# NOTE(review): start from the freshly created df; Time must still be text here.
setDT(df)
# Parse the "YYYY-MM-DD H:MM" text. ymd_hm is namespace-qualified because only
# data.table is attached in this snippet.
df[, Time := lubridate::ymd_hm(Time)]
# Snap every timestamp to its nearest quarter hour and use that as group key.
df[, groups := lubridate::round_date(Time, "15 minutes")]
# Total distance per quarter-hour group.
df[, .(Distance_m_sum = sum(Distance_m)), by = groups]
groups Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:15:00 324
4: 2021-08-30 08:30:00 162
5: 2021-08-31 02:45:00 469
6: 2021-08-31 03:00:00 137
7: 2021-08-31 07:45:00 302
8: 2021-08-31 06:00:00 42
更多扩展示例
我认为你必须先明确如何划分每个 15 分钟区间(一刻钟)。lubridate 提供了三种取整方式:round_date、floor_date 和 ceiling_date。重新审视我上面的例子后,我会选择 floor_date,因为 2021-08-30 7:24 应归入 7:15–7:30 这一组。下面列出三种方式的结果:
library(data.table)
# Convert df to a data.table so the := assignments below update it in place.
# NOTE(review): start from the freshly created df; Time must still be text here.
setDT(df)
# ymd_hm is namespace-qualified because only data.table is attached in this
# snippet.
df[, Time := lubridate::ymd_hm(Time)]
# One candidate bucket column per rounding rule: nearest, previous and next
# quarter hour.
df[, round_date := lubridate::round_date(Time, "15 minutes")]
df[, floor_date := lubridate::floor_date(Time, "15 minutes")]
df[, ceiling_date := lubridate::ceiling_date(Time, "15 minutes")]
# Totals when each timestamp is assigned to its nearest quarter hour.
df[, .(Distance_m_sum = sum(Distance_m)), by = round_date]
round_date Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:15:00 324
4: 2021-08-30 08:30:00 162
5: 2021-08-31 02:45:00 469
6: 2021-08-31 03:00:00 137
7: 2021-08-31 07:45:00 302
8: 2021-08-31 06:00:00 42
# Totals when each timestamp is assigned to the quarter hour it falls in.
df[, list(Distance_m_sum = sum(Distance_m)), by = "floor_date"]
floor_date Distance_m_sum
1: 2021-08-30 07:15:00 162
2: 2021-08-30 07:30:00 162
3: 2021-08-30 07:45:00 162
4: 2021-08-30 08:15:00 486
5: 2021-08-31 02:30:00 319
6: 2021-08-31 02:45:00 287
7: 2021-08-31 07:30:00 122
8: 2021-08-31 07:45:00 180
9: 2021-08-31 06:00:00 42
# Totals when each timestamp is assigned to the next quarter-hour boundary.
df[, list(Distance_m_sum = sum(Distance_m)), by = "ceiling_date"]
ceiling_date Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:30:00 486
4: 2021-08-31 02:45:00 319
5: 2021-08-31 03:00:00 287
6: 2021-08-31 07:45:00 224
7: 2021-08-31 08:00:00 78
8: 2021-08-31 06:15:00 42
Base R 方案:使用 cut 以 15 分钟为间隔划分数据,再用 aggregate 汇总数据。
注意:cut 的分组区间从数据中最早的时间点(此处为 07:24)开始计算,而不是对齐到整点的刻钟。
# Parse the timestamp text as UTC date-times.
df$Time <- as.POSIXct(df$Time, format = "%Y-%m-%d %H:%M", tz = "UTC")
# Label each row with its 15-minute bin (Time_cut), then total Distance_m
# per bin.
aggregate(
  Distance_m ~ Time_cut,
  data = transform(df, Time_cut = cut(Time, breaks = "15 mins")),
  FUN = sum
)
# Time_cut Distance_m
#1 2021-08-30 07:24:00 324
#2 2021-08-30 07:54:00 162
#3 2021-08-30 08:09:00 324
#4 2021-08-30 08:24:00 162
#5 2021-08-31 02:39:00 469
#6 2021-08-31 02:54:00 137
#7 2021-08-31 05:54:00 42
#8 2021-08-31 07:39:00 302
您可能会对 1900 这部分感到疑惑:这是因为 ggplot 在分面(facet)时仍然会考虑完整的日期,所以无法按一天中的时刻把各天的数据整齐地对齐。此外,由于各天的数据落在不同的日期上,分面时也很难统一设定坐标轴范围的起点和终点。另一种做法是像您提到的那样把日期和时间拆开,但这样灵活性会降低,而且会失去连续的时间刻度。
# Self-contained example: sum Distance_m per 15-minute floor bucket and draw
# one facet per calendar day on a shared time-of-day axis.
library(data.table)
library(ggplot2)
# lubridate functions are called with the lubridate:: prefix, so the package
# only needs to be installed, not attached.

Time <- c(
  "2021-08-30 7:24", "2021-08-30 7:30", "2021-08-30 7:54",
  "2021-08-30 8:16", "2021-08-30 8:27", "2021-08-30 8:22",
  "2021-08-31 2:39", "2021-08-31 2:44", "2021-08-31 2:50",
  "2021-08-31 2:56", "2021-08-31 7:42", "2021-08-31 7:45",
  "2021-08-31 7:50", "2021-08-31 6:02"
)
Distance_m <- c(
  162, 162, 162, 162, 162, 162, 162,
  157, 150, 137, 122, 102, 78, 42
)
df <- data.frame(Time, Distance_m)

setDT(df)
df[, Time := lubridate::ymd_hm(Time)]
df[, floor_date := lubridate::floor_date(Time, "15 minutes")]
# From here on df holds one row per 15-minute bucket.
df <- df[, .(Distance_m_sum = sum(Distance_m)), by = floor_date]

# Every bucket is re-dated to 1900-01-01 so that all facets share an x axis
# that depends on the time of day only. The clock part is taken with an
# explicit format() pattern instead of cutting characters out of an implicit
# text conversion, which can omit the time for values at exactly midnight.
# Columns are referenced bare inside aes() rather than through df$.
ggplot(
  df,
  aes(
    x = lubridate::ymd_hms(
      paste("1900-01-01", format(floor_date, "%H:%M:%S"))
    ),
    y = Distance_m_sum,
    group = as.Date(floor_date)
  )
) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  facet_wrap(as.Date(floor_date) ~ ., ncol = 1) +
  labs(title = "Daily Distance_m") +
  # Force the axis to span the whole dummy day, midnight to midnight.
  expand_limits(
    x = c(
      lubridate::ymd_h("1900-01-01 00"),
      lubridate::ymd_h("1900-01-02 00")
    )
  ) +
  scale_x_datetime(
    date_breaks = "60 min",
    date_minor_breaks = "15 min",
    date_labels = "%H:%M",
    expand = c(0, 0)
  )
我有数据如下
# Sample observations: timestamps stored as "YYYY-MM-DD H:MM" text, paired
# one-to-one with a Distance_m reading.
Time <- c(
  "2021-08-30 7:24", "2021-08-30 7:30", "2021-08-30 7:54",
  "2021-08-30 8:16", "2021-08-30 8:27", "2021-08-30 8:22",
  "2021-08-31 2:39", "2021-08-31 2:44", "2021-08-31 2:50",
  "2021-08-31 2:56", "2021-08-31 7:42", "2021-08-31 7:45",
  "2021-08-31 7:50", "2021-08-31 6:02"
)
Distance_m <- c(
  162, 162, 162, 162, 162, 162, 162,
  157, 150, 137, 122, 102, 78, 42
)
# Column names are spelled out; they match the vector names used above.
df <- data.frame(Time = Time, Distance_m = Distance_m)
df
Time Distance_m
1 2021-08-30 7:24 162
2 2021-08-30 7:30 162
3 2021-08-30 7:54 162
4 2021-08-30 8:16 162
5 2021-08-30 8:27 162
6 2021-08-30 8:22 162
7 2021-08-31 2:39 162
8 2021-08-31 2:44 157
9 2021-08-31 2:50 150
10 2021-08-31 2:56 137
11 2021-08-31 7:42 122
12 2021-08-31 7:45 102
13 2021-08-31 7:50 78
14 2021-08-31 6:02 42
我想根据日期和小时以 15 分钟为间隔对 Distance_m 求和。
我期待如下输出
Date Hour Time Distance_m
2021-08-30 7 54 486
2021-08-30 8 30 486
2021-08-31 2 56 606
2021-08-31 6 2 344
到目前为止我已经试过了
# Asker's attempt: bucket rows into 15-minute slots and sum Distance_m per slot.
# NOTE(review): %>%, mutate, group_by, summarize, last and hm are used
# unqualified; presumably dplyr and lubridate are attached elsewhere -- confirm.
# Split the "date time" text at the space into separate Date and Time columns.
df <- tidyr::separate(df, Time, c("Date", "Time"), sep = " ")
df1<- df %>%
# Parse the "H:MM" text so that hour() and minute() can be applied to it.
mutate(Time = hm(Time)) %>%
# ttt = minutes elapsed since midnight.
mutate(ttt= (lubridate::minute(Time) + lubridate::hour(Time) * 60)) %>%
# tt = index of the 15-minute slot within the day.
mutate(tt = floor(ttt/15) ) %>%
# Grouping uses tt only; Date is not a grouping key, so rows from different
# dates that share a slot index (e.g. 08-30 7:30 and 08-31 7:42, both slot 30)
# are summed into one row.
group_by(tt) %>%
# Keep the last Date/Time seen in each group next to the summed distance.
summarize(Date = last(Date),Time = last(Time), Distance_m = sum(Distance_m))
但是输出有点乱。我希望在处理大量数据时找到一种有效的方法。
谢谢
虽然没有完全给出您的预期结果,但也许可用。 您可以看看这是否符合您的需求。
library(data.table)
# Convert df to a data.table so the := assignments below update it in place.
# NOTE(review): start from the freshly created df; Time must still be text here.
setDT(df)
# Parse the "YYYY-MM-DD H:MM" text. ymd_hm is namespace-qualified because only
# data.table is attached in this snippet.
df[, Time := lubridate::ymd_hm(Time)]
# Snap every timestamp to its nearest quarter hour and use that as group key.
df[, groups := lubridate::round_date(Time, "15 minutes")]
# Total distance per quarter-hour group.
df[, .(Distance_m_sum = sum(Distance_m)), by = groups]
groups Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:15:00 324
4: 2021-08-30 08:30:00 162
5: 2021-08-31 02:45:00 469
6: 2021-08-31 03:00:00 137
7: 2021-08-31 07:45:00 302
8: 2021-08-31 06:00:00 42
更多扩展示例
我认为你必须先明确如何划分每个 15 分钟区间(一刻钟)。lubridate 提供了三种取整方式:round_date、floor_date 和 ceiling_date。重新审视我上面的例子后,我会选择 floor_date,因为 2021-08-30 7:24 应归入 7:15–7:30 这一组。下面列出三种方式的结果:
library(data.table)
# Convert df to a data.table so the := assignments below update it in place.
# NOTE(review): start from the freshly created df; Time must still be text here.
setDT(df)
# ymd_hm is namespace-qualified because only data.table is attached in this
# snippet.
df[, Time := lubridate::ymd_hm(Time)]
# One candidate bucket column per rounding rule: nearest, previous and next
# quarter hour.
df[, round_date := lubridate::round_date(Time, "15 minutes")]
df[, floor_date := lubridate::floor_date(Time, "15 minutes")]
df[, ceiling_date := lubridate::ceiling_date(Time, "15 minutes")]
# Totals when each timestamp is assigned to its nearest quarter hour.
df[, .(Distance_m_sum = sum(Distance_m)), by = round_date]
round_date Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:15:00 324
4: 2021-08-30 08:30:00 162
5: 2021-08-31 02:45:00 469
6: 2021-08-31 03:00:00 137
7: 2021-08-31 07:45:00 302
8: 2021-08-31 06:00:00 42
# Totals when each timestamp is assigned to the quarter hour it falls in.
df[, list(Distance_m_sum = sum(Distance_m)), by = "floor_date"]
floor_date Distance_m_sum
1: 2021-08-30 07:15:00 162
2: 2021-08-30 07:30:00 162
3: 2021-08-30 07:45:00 162
4: 2021-08-30 08:15:00 486
5: 2021-08-31 02:30:00 319
6: 2021-08-31 02:45:00 287
7: 2021-08-31 07:30:00 122
8: 2021-08-31 07:45:00 180
9: 2021-08-31 06:00:00 42
# Totals when each timestamp is assigned to the next quarter-hour boundary.
df[, list(Distance_m_sum = sum(Distance_m)), by = "ceiling_date"]
ceiling_date Distance_m_sum
1: 2021-08-30 07:30:00 324
2: 2021-08-30 08:00:00 162
3: 2021-08-30 08:30:00 486
4: 2021-08-31 02:45:00 319
5: 2021-08-31 03:00:00 287
6: 2021-08-31 07:45:00 224
7: 2021-08-31 08:00:00 78
8: 2021-08-31 06:15:00 42
Base R 方案:使用 cut 以 15 分钟为间隔划分数据,再用 aggregate 汇总数据。
注意:cut 的分组区间从数据中最早的时间点(此处为 07:24)开始计算,而不是对齐到整点的刻钟。
# Parse the timestamp text as UTC date-times.
df$Time <- as.POSIXct(df$Time, format = "%Y-%m-%d %H:%M", tz = "UTC")
# Label each row with its 15-minute bin (Time_cut), then total Distance_m
# per bin.
aggregate(
  Distance_m ~ Time_cut,
  data = transform(df, Time_cut = cut(Time, breaks = "15 mins")),
  FUN = sum
)
# Time_cut Distance_m
#1 2021-08-30 07:24:00 324
#2 2021-08-30 07:54:00 162
#3 2021-08-30 08:09:00 324
#4 2021-08-30 08:24:00 162
#5 2021-08-31 02:39:00 469
#6 2021-08-31 02:54:00 137
#7 2021-08-31 05:54:00 42
#8 2021-08-31 07:39:00 302
您可能会对 1900 这部分感到疑惑:这是因为 ggplot 在分面(facet)时仍然会考虑完整的日期,所以无法按一天中的时刻把各天的数据整齐地对齐。此外,由于各天的数据落在不同的日期上,分面时也很难统一设定坐标轴范围的起点和终点。另一种做法是像您提到的那样把日期和时间拆开,但这样灵活性会降低,而且会失去连续的时间刻度。
# Self-contained example: sum Distance_m per 15-minute floor bucket and draw
# one facet per calendar day on a shared time-of-day axis.
library(data.table)
library(ggplot2)
# lubridate functions are called with the lubridate:: prefix, so the package
# only needs to be installed, not attached.

Time <- c(
  "2021-08-30 7:24", "2021-08-30 7:30", "2021-08-30 7:54",
  "2021-08-30 8:16", "2021-08-30 8:27", "2021-08-30 8:22",
  "2021-08-31 2:39", "2021-08-31 2:44", "2021-08-31 2:50",
  "2021-08-31 2:56", "2021-08-31 7:42", "2021-08-31 7:45",
  "2021-08-31 7:50", "2021-08-31 6:02"
)
Distance_m <- c(
  162, 162, 162, 162, 162, 162, 162,
  157, 150, 137, 122, 102, 78, 42
)
df <- data.frame(Time, Distance_m)

setDT(df)
df[, Time := lubridate::ymd_hm(Time)]
df[, floor_date := lubridate::floor_date(Time, "15 minutes")]
# From here on df holds one row per 15-minute bucket.
df <- df[, .(Distance_m_sum = sum(Distance_m)), by = floor_date]

# Every bucket is re-dated to 1900-01-01 so that all facets share an x axis
# that depends on the time of day only. The clock part is taken with an
# explicit format() pattern instead of cutting characters out of an implicit
# text conversion, which can omit the time for values at exactly midnight.
# Columns are referenced bare inside aes() rather than through df$.
ggplot(
  df,
  aes(
    x = lubridate::ymd_hms(
      paste("1900-01-01", format(floor_date, "%H:%M:%S"))
    ),
    y = Distance_m_sum,
    group = as.Date(floor_date)
  )
) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  facet_wrap(as.Date(floor_date) ~ ., ncol = 1) +
  labs(title = "Daily Distance_m") +
  # Force the axis to span the whole dummy day, midnight to midnight.
  expand_limits(
    x = c(
      lubridate::ymd_h("1900-01-01 00"),
      lubridate::ymd_h("1900-01-02 00")
    )
  ) +
  scale_x_datetime(
    date_breaks = "60 min",
    date_minor_breaks = "15 min",
    date_labels = "%H:%M",
    expand = c(0, 0)
  )