dplyr 总结字符时间变量
dplyr summarise character time variable
我有这样的数据
ID Time
456 0:00:01
456 0:02:05
123 0:00:14
756 0:03:47
756 0:01:56
756 0:00:01
其中 Time 是 typeof = character
我需要按 ID 对时间列求和,所以我最终得到:
ID Time Total_Time
456 0:00:01 0:02:06
456 0:02:05 0:02:06
123 0:00:14 0:00:14
756 0:03:47 0:05:44
756 0:01:56 0:05:44
756 0:00:01 0:05:44
我知道可以使用 dplyr 来聚合,但是当我运行:
# Asker's failing attempt: Time is a character column (see above), and sum()
# is not defined for character input, so this call raises an error.
df$Total_Time <- df %>% group_by(ID) %>% summarise(Freq = sum(Time))
我得到一个错误,可能是因为时间是一个字符?
我想到可以用 lubridate::hms 把这些字符串转换成数字,但还没找到用 format(.., format="%H:%M:%S") 把结果再转换回来的正确方法,所以这里给出我在各种相关场合使用的两个函数:
## Convert "H:M:S" strings to seconds, e.g. "01:23:45" -> 5025 and
## "00:17:14.842" -> 1034.842.
##
## x: character vector of colon-separated hour, minute and second fields.
## Returns an unnamed numeric vector with one element per element of x.
time2num <- function(x) {
  secs_per_field <- c(60 * 60, 60, 1)
  # Weight the hour/minute/second fields of one split string and add them up.
  fields_to_secs <- function(fields) {
    sum(as.numeric(fields) * secs_per_field)
  }
  fields <- strsplit(x, ":")
  vapply(fields, fields_to_secs, FUN.VALUE = numeric(1), USE.NAMES = FALSE)
}
## and back again
#' Format a number of seconds as an "HH:MM:SS[.fff]" string.
#'
#' @param x Numeric vector of durations in seconds. `NA` elements yield
#'   `NA_character_`.
#' @param digits.secs Number of decimal places for the seconds field. Defaults
#'   to the "digits.secs" option, or 3 when that option is unset. If `NA`, the
#'   number of places is picked heuristically from the values in `x`.
#' @return Character vector with one element per element of `x`.
num2time <- function(x, digits.secs = getOption("digits.secs", 3)) {
  hr <- as.integer(x %/% 3600)
  min <- as.integer((x - 3600 * hr) %/% 60)
  sec <- (x - 3600 * hr - 60 * min)
  if (anyNA(digits.secs)) {
    # a mostly-arbitrary determination of significant digits,
    # motivated by @Roland
    for (candidate in 1:6) {
      digits.secs <- candidate
      # na.rm = TRUE: an NA in `x` must not turn the condition into NA,
      # which would make `if` fail.
      lossy <- any(
        abs(signif(sec, candidate) - sec) > (10^(-3 - candidate)),
        na.rm = TRUE
      )
      if (!lossy) {
        digits.secs <- candidate - 1L
        break
      }
    }
  }
  sec <- sprintf(paste0("%02.", digits.secs[[1]], "f"), sec)
  # With decimals present the "%02" width is already exceeded, so a
  # single-digit whole part (e.g. "5.500") needs its leading zero added here.
  # The dot must be escaped as "\\." -- a lone "\." is not a valid R string.
  sec <- paste0(ifelse(grepl("^[0-9]\\.", sec), "0", ""), sec)
  out <- sprintf("%02i:%02i:%s", hr, min, sec)
  out[is.na(x)] <- NA_character_
  out
}
有了这些,
library(dplyr)
# Convert each ID's times to seconds, sum them, and format the group total
# back to a string; mutate() repeats that total on every row of the group.
# `df` is the asker's data frame (the sample data in this post is named `dat`).
df %>%
  group_by(ID) %>%
  # `digits.secs` is spelled out in full instead of relying on partial
  # argument matching of `digits`.
  mutate(Freq = num2time(sum(time2num(Time)), digits.secs = 0)) %>%
  ungroup()
# # A tibble: 6 x 3
# ID Time Freq
# <int> <chr> <chr>
# 1 456 0:00:01 00:02:06
# 2 456 0:02:05 00:02:06
# 3 123 0:00:14 00:00:14
# 4 756 0:03:47 00:05:44
# 5 756 0:01:56 00:05:44
# 6 756 0:00:01 00:05:44
数据
# Sample data from the question: six rows of ID plus a Time column stored as
# "H:MM:SS" character strings.
dat <- data.frame(
  ID = c(456L, 456L, 123L, 756L, 756L, 756L),
  Time = c("0:00:01", "0:02:05", "0:00:14", "0:03:47", "0:01:56", "0:00:01"),
  stringsAsFactors = FALSE
)
@r2evans 整理的方案很棒。不过,既然我已经把下面这个写好了,这里再提供另一个可行的方案:
library(hms)
library(tidyverse)
# Build example data: six IDs with two rows each, and twelve distinct times
# ("01:10:09" through "01:21:09") parsed into hms values.
df1 <- data.frame(
  ID = rep(c(111, 222, 444, 666, 777, 888), each = 2),
  Time = as_hms(paste0("01:", 10:21, ":09"))
)
# ID Time
# 1 111 01:10:09
# 2 111 01:11:09
# 3 222 01:12:09
# 4 222 01:13:09
# 5 444 01:14:09
# 6 444 01:15:09
# 7 666 01:16:09
# 8 666 01:17:09
# 9 777 01:18:09
# 10 777 01:19:09
# 11 888 01:20:09
# 12 888 01:21:09
# One row per ID: add up that ID's hms times and convert the sum back to hms.
df1 %>%
  group_by(ID) %>%
  summarise(Total_Time = as_hms(sum(Time)))
# # A tibble: 6 × 2
# ID Total_Time
# <dbl> <time>
# 1 111 02:21:18
# 2 222 02:25:18
# 3 444 02:29:18
# 4 666 02:33:18
# 5 777 02:37:18
# 6 888 02:41:18
我有这样的数据
ID Time
456 0:00:01
456 0:02:05
123 0:00:14
756 0:03:47
756 0:01:56
756 0:00:01
其中 Time 是 typeof = character
我需要按 ID 对时间列求和,所以我最终得到:
ID Time Total_Time
456 0:00:01 0:02:06
456 0:02:05 0:02:06
123 0:00:14 0:00:14
756 0:03:47 0:05:44
756 0:01:56 0:05:44
756 0:00:01 0:05:44
我知道可以使用 dplyr 来聚合,但是当我运行:
# Asker's failing attempt: Time is a character column (see above), and sum()
# is not defined for character input, so this call raises an error.
df$Total_Time <- df %>% group_by(ID) %>% summarise(Freq = sum(Time))
我得到一个错误,可能是因为时间是一个字符?
我想到可以用 lubridate::hms 把这些字符串转换成数字,但还没找到用 format(.., format="%H:%M:%S") 把结果再转换回来的正确方法,所以这里给出我在各种相关场合使用的两个函数:
## Convert "H:M:S" strings to seconds, e.g. "01:23:45" -> 5025 and
## "00:17:14.842" -> 1034.842.
##
## x: character vector of colon-separated hour, minute and second fields.
## Returns an unnamed numeric vector with one element per element of x.
time2num <- function(x) {
  secs_per_field <- c(60 * 60, 60, 1)
  # Weight the hour/minute/second fields of one split string and add them up.
  fields_to_secs <- function(fields) {
    sum(as.numeric(fields) * secs_per_field)
  }
  fields <- strsplit(x, ":")
  vapply(fields, fields_to_secs, FUN.VALUE = numeric(1), USE.NAMES = FALSE)
}
## and back again
#' Format a number of seconds as an "HH:MM:SS[.fff]" string.
#'
#' @param x Numeric vector of durations in seconds. `NA` elements yield
#'   `NA_character_`.
#' @param digits.secs Number of decimal places for the seconds field. Defaults
#'   to the "digits.secs" option, or 3 when that option is unset. If `NA`, the
#'   number of places is picked heuristically from the values in `x`.
#' @return Character vector with one element per element of `x`.
num2time <- function(x, digits.secs = getOption("digits.secs", 3)) {
  hr <- as.integer(x %/% 3600)
  min <- as.integer((x - 3600 * hr) %/% 60)
  sec <- (x - 3600 * hr - 60 * min)
  if (anyNA(digits.secs)) {
    # a mostly-arbitrary determination of significant digits,
    # motivated by @Roland
    for (candidate in 1:6) {
      digits.secs <- candidate
      # na.rm = TRUE: an NA in `x` must not turn the condition into NA,
      # which would make `if` fail.
      lossy <- any(
        abs(signif(sec, candidate) - sec) > (10^(-3 - candidate)),
        na.rm = TRUE
      )
      if (!lossy) {
        digits.secs <- candidate - 1L
        break
      }
    }
  }
  sec <- sprintf(paste0("%02.", digits.secs[[1]], "f"), sec)
  # With decimals present the "%02" width is already exceeded, so a
  # single-digit whole part (e.g. "5.500") needs its leading zero added here.
  # The dot must be escaped as "\\." -- a lone "\." is not a valid R string.
  sec <- paste0(ifelse(grepl("^[0-9]\\.", sec), "0", ""), sec)
  out <- sprintf("%02i:%02i:%s", hr, min, sec)
  out[is.na(x)] <- NA_character_
  out
}
有了这些,
library(dplyr)
# Convert each ID's times to seconds, sum them, and format the group total
# back to a string; mutate() repeats that total on every row of the group.
# `df` is the asker's data frame (the sample data in this post is named `dat`).
df %>%
  group_by(ID) %>%
  # `digits.secs` is spelled out in full instead of relying on partial
  # argument matching of `digits`.
  mutate(Freq = num2time(sum(time2num(Time)), digits.secs = 0)) %>%
  ungroup()
# # A tibble: 6 x 3
# ID Time Freq
# <int> <chr> <chr>
# 1 456 0:00:01 00:02:06
# 2 456 0:02:05 00:02:06
# 3 123 0:00:14 00:00:14
# 4 756 0:03:47 00:05:44
# 5 756 0:01:56 00:05:44
# 6 756 0:00:01 00:05:44
数据
# Sample data from the question: six rows of ID plus a Time column stored as
# "H:MM:SS" character strings.
dat <- data.frame(
  ID = c(456L, 456L, 123L, 756L, 756L, 756L),
  Time = c("0:00:01", "0:02:05", "0:00:14", "0:03:47", "0:01:56", "0:00:01"),
  stringsAsFactors = FALSE
)
@r2evans 整理的方案很棒。不过,既然我已经把下面这个写好了,这里再提供另一个可行的方案:
library(hms)
library(tidyverse)
# Build example data: six IDs with two rows each, and twelve distinct times
# ("01:10:09" through "01:21:09") parsed into hms values.
df1 <- data.frame(
  ID = rep(c(111, 222, 444, 666, 777, 888), each = 2),
  Time = as_hms(paste0("01:", 10:21, ":09"))
)
# ID Time
# 1 111 01:10:09
# 2 111 01:11:09
# 3 222 01:12:09
# 4 222 01:13:09
# 5 444 01:14:09
# 6 444 01:15:09
# 7 666 01:16:09
# 8 666 01:17:09
# 9 777 01:18:09
# 10 777 01:19:09
# 11 888 01:20:09
# 12 888 01:21:09
# One row per ID: add up that ID's hms times and convert the sum back to hms.
df1 %>%
  group_by(ID) %>%
  summarise(Total_Time = as_hms(sum(Time)))
# # A tibble: 6 × 2
# ID Total_Time
# <dbl> <time>
# 1 111 02:21:18
# 2 222 02:25:18
# 3 444 02:29:18
# 4 666 02:33:18
# 5 777 02:37:18
# 6 888 02:41:18