dplyr 汇总字符型时间变量

dplyr summarise character time variable

我有这样的数据

ID   Time
456  0:00:01
456  0:02:05
123  0:00:14
756  0:03:47
756  0:01:56
756  0:00:01

其中 Time 列的类型(typeof)是 character

我需要按 ID 对时间列求和,所以我最终得到:

ID   Time     Total_Time
456  0:00:01  0:02:06
456  0:02:05  0:02:06
123  0:00:14  0:00:14
756  0:03:47  0:05:44
756  0:01:56  0:05:44
756  0:00:01  0:05:44

我知道可以使用 dplyr 来聚合,但是当我运行:

# Attempt from the question: sum Time within each ID and store the result on df.
# Time is a character column and sum() does not accept character input, so
# this call errors.
df$Total_Time <- df %>% group_by(ID) %>% summarise(Freq = sum(Time))

我得到一个错误,可能是因为时间是一个字符?

我能想到用 lubridate::hms 把这些字符串转成数字,但还没找到用 format(.., format="%H:%M:%S") 把结果转回字符串的正确方法,所以这里给出我在各种类似场景下使用的两个函数:

## Convert clock strings to seconds, e.g. "01:23:45" -> 5025 and
## "00:17:14.842" -> 1034.842.
##
## x: character vector (factors are coerced) of colon-separated times in
##    "H:M:S", "M:S" or "S" form.
## Returns a numeric vector of seconds with one element per element of x.
## NA input yields NA; strings with more than three fields yield NA.
time2num <- function(x) {
  weights <- c(60 * 60, 60, 1)
  vapply(strsplit(as.character(x), ":", fixed = TRUE), function(y) {
    if (length(y) > length(weights)) {
      return(NA_real_)
    }
    # Right-align the fields against the weights so that "M:S" and "S" are
    # not silently recycled against the hour/minute multipliers.
    sum(as.numeric(y) * utils::tail(weights, length(y)))
  }, numeric(1), USE.NAMES = FALSE)
}

## Convert seconds back to "HH:MM:SS[.fff]" strings (inverse of the
## string-to-seconds conversion).
##
## x: numeric vector of seconds.
## digits.secs: number of decimal places for the seconds field; only the first
##    element is used. If NA, a number of digits is guessed from the data.
## Returns a character vector the same length as x; NA in x yields
## NA_character_.
num2time <- function(x, digits.secs = getOption("digits.secs", 3)) {
  # Break seconds into integer hours, integer minutes and leftover seconds.
  split_hms <- function(secs) {
    hr <- as.integer(secs %/% 3600)
    mins <- as.integer((secs - 3600 * hr) %/% 60)
    list(hr = hr, mins = mins, sec = secs - 3600 * hr - 60 * mins)
  }

  if (anyNA(digits.secs)) {
    sec <- split_hms(x)$sec
    # a mostly-arbitrary determination of significant digits,
    # motivated by @Roland
    for (digits.secs in 1:6) {
      # na.rm = TRUE so that NA elements of x cannot turn the condition
      # into `if (NA)`, which is an error.
      too_coarse <- any(
        abs(signif(sec, digits.secs) - sec) > (10^(-3 - digits.secs)),
        na.rm = TRUE
      )
      if (too_coarse) next
      digits.secs <- digits.secs - 1L
      break
    }
  }
  digits <- digits.secs[[1]]

  # Round before splitting so that e.g. 59.9996 at 3 digits carries into the
  # minute ("00:01:00.000") instead of printing "00:00:60.000".
  parts <- split_hms(round(x, digits))

  sec <- sprintf(paste0("%02.", digits, "f"), parts$sec)
  # "%02.Nf" does not pad the integer part once decimals are present, so
  # left-pad single-digit seconds such as "5.500" by hand.
  sec <- paste0(ifelse(grepl("^[0-9]\\.", sec), "0", ""), sec)
  out <- sprintf("%02i:%02i:%s", parts$hr, parts$mins, sec)
  out[is.na(x)] <- NA_character_
  out
}

有了这些,

library(dplyr)

# Per-ID total: parse Time to seconds, sum within each ID, then format the sum
# back as "HH:MM:SS". mutate() (rather than summarise()) keeps every original
# row and repeats the group total on each of them.
# Uses `dat` (the sample data frame defined in this answer), not `df`, which
# in a fresh session is the stats::df function.
dat %>%
  group_by(ID) %>%
  mutate(Freq = num2time(sum(time2num(Time)), digits.secs = 0)) %>%
  ungroup()
# # A tibble: 6 x 3
#      ID Time    Freq    
#   <int> <chr>   <chr>   
# 1   456 0:00:01 00:02:06
# 2   456 0:02:05 00:02:06
# 3   123 0:00:14 00:00:14
# 4   756 0:03:47 00:05:44
# 5   756 0:01:56 00:05:44
# 6   756 0:00:01 00:05:44

数据

# Sample data from the question: six rows of integer IDs with character
# "H:MM:SS" durations.
dat <- data.frame(
  ID = c(456L, 456L, 123L, 756L, 756L, 756L),
  Time = c("0:00:01", "0:02:05", "0:00:14", "0:03:47", "0:01:56", "0:00:01"),
  stringsAsFactors = FALSE
)

@r2evans 的回答很棒。不过,既然我已经把代码写好了,这里再给出另一个可行的方案:

library(hms)
library(tidyverse)

# Example data: six IDs with two rows each, and an hms Time column running
# from 01:10:09 to 01:21:09 in one-minute steps.
df1 <- data.frame(
  ID = rep(c(111, 222, 444, 666, 777, 888), each = 2),
  Time = as_hms(sprintf("01:%d:09", 10:21))
)
#     ID     Time
# 1  111 01:10:09
# 2  111 01:11:09
# 3  222 01:12:09
# 4  222 01:13:09
# 5  444 01:14:09
# 6  444 01:15:09
# 7  666 01:16:09
# 8  666 01:17:09
# 9  777 01:18:09
# 10 777 01:19:09
# 11 888 01:20:09
# 12 888 01:21:09 


# Total duration per ID: sum Time within each group and convert the summed
# value back to hms so it prints as HH:MM:SS.
df1 %>%
  group_by(ID) %>%
  summarise(Total_Time = as_hms(sum(Time)))
# # A tibble: 6 × 2
#      ID Total_Time
#   <dbl> <time>    
# 1   111 02:21:18  
# 2   222 02:25:18  
# 3   444 02:29:18  
# 4   666 02:33:18  
# 5   777 02:37:18  
# 6   888 02:41:18