是否有 R 函数来解析包含不同格式日期的向量?
Is there an R function to parse vector containing dates in different formats?
我正在处理一个包含 468006 个元素的向量,每个元素代表一个 date/time,采用两种格式之一
向量片段如下:
> result_date_time_vector[1810:1820]
[1] "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27"
[7] "1/3/2021" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57"
> class(result_date_time_vector)
[1] "character"
我想删除有关时间的信息,然后将元素转换为单一一致的格式。
我尝试了 for 循环,但过程非常缓慢(但没有收到任何错误或警告)
> # Pre-allocate an all-NA Date vector, then parse the elements one at a time.
> fixed_result_date_time <- rep(NA, length(result_date_time_vector))
> class(fixed_result_date_time) <- "Date"
> # seq_along() stays empty for a zero-length input, unlike 1:length(x).
> for (n in seq_along(result_date_time_vector)) {
    if (is.na(result_date_time_vector[n])) {
      # Missing input stays NA in the result.
      next
    } else if (str_detect(result_date_time_vector[n], "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}")) {
      # "YYYY-MM-DD HH:MM:SS" element: parse as date-time, keep only the date.
      fixed_result_date_time[n] <- as_date(ymd_hms(result_date_time_vector[n], tz = "America/New_York"))
    } else {
      # Every other element is parsed as month/day/year.
      fixed_result_date_time[n] <- as_date(mdy(result_date_time_vector[n], tz = "America/New_York"))
    }
  }
我也尝试了 ifelse 函数,过程很快(但收到了很多警告)。
> # Both ymd_hms() and mdy() below are called on the whole vector, so each one
> # is also handed the elements written in the other format.
> fixed_result_date_time <- ifelse(str_detect(result_date_time_vector, "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}"),
                                   as_date(ymd_hms(result_date_time_vector, tz = "America/New_York")),
                                   as_date(mdy(result_date_time_vector, tz = "America/New_York")))
Warning: 20963 failed to parse.
Warning: 447043 failed to parse.
> class(fixed_result_date_time) <- "Date"
输入向量中,m/d/y 格式的元素有 20963 个,y-m-d h:m:s 格式的元素有 447043 个。
是否有更有效的方法可以在没有警告的情况下完成相同的操作?
#' Convert a vector of mixed-format date strings to `Date`.
#'
#' Strings made only of digits and dots are treated as day counts from
#' `origin`. Every other non-missing element is tried against each entry of
#' `fmts`, in order, and keeps the first format that parses it.
#'
#' `strptime()` lets `%Y` consume fewer than four digits, so "1/3/2021" would
#' be accepted by "%Y/%m/%d" as year 1, month 3, day 20. To prevent that, a
#' format built only from `%Y`, `%m`, `%d` and plain separators is applied
#' only to strings whose field widths match it (four-digit year). Formats
#' using anything else are applied without this pre-check.
#'
#' @param dates Character vector of dates; `NA` elements stay `NA`.
#' @param fmts `strptime()`-style formats, tried in the order given.
#' @param origin Origin passed to `as.Date()` for numeric strings.
#' @return A `Date` vector the same length as `dates`.
conv_dates <- function(dates,
                       fmts = c("%Y-%m-%d", "%Y/%m/%d", "%d/%m/%Y", "%m/%d/%Y"),
                       origin = "1900-01-01") {
  # Translate `fmt` into a start-anchored regex, or return NA when the format
  # contains something this translation does not cover.
  strict_pattern <- function(fmt) {
    if (is.na(fmt)) {
      return(NA_character_)
    }
    tokens <- regmatches(fmt, gregexpr("%.|[^%]+", fmt))[[1]]
    if (!identical(paste(tokens, collapse = ""), fmt)) {
      return(NA_character_)
    }
    fields <- c(
      "%Y" = " *[0-9]{4}",
      "%m" = " *[0-9]{1,2}",
      "%d" = " *[0-9]{1,2}"
    )
    is_spec <- startsWith(tokens, "%")
    literals <- tokens[!is_spec]
    if (!all(tokens[is_spec] %in% names(fields))) {
      return(NA_character_)
    }
    # Whitespace has special meaning in strptime formats, and a backslash
    # could terminate the \Q...\E quoting used below.
    if (any(grepl("[[:space:]]", literals)) ||
      any(grepl("\\", literals, fixed = TRUE))) {
      return(NA_character_)
    }
    parts <- character(length(tokens))
    parts[is_spec] <- fields[tokens[is_spec]]
    parts[!is_spec] <- paste0("\\Q", literals, "\\E")
    # Anchored at the start only: trailing text (e.g. a time of day) is
    # ignored by as.Date() and must not block the match.
    paste0("^", paste(parts, collapse = ""))
  }

  out <- rep(as.Date(NA), length(dates))
  present <- !is.na(dates)

  numeric_like <- present & grepl("^[.0-9]+$", dates)
  if (any(numeric_like)) {
    # Strings such as "1.2.3" pass the regex but are not numbers; they stay NA
    # here and fall through to the format loop.
    out[numeric_like] <- suppressWarnings(
      as.Date(as.numeric(dates[numeric_like]), origin = origin)
    )
  }

  for (fmt in fmts) {
    pending <- present & is.na(out)
    if (!any(pending)) break
    guard <- strict_pattern(fmt)
    if (!is.na(guard)) {
      pending[pending] <- grepl(guard, dates[pending], perl = TRUE)
      if (!any(pending)) next
    }
    out[pending] <- as.Date(dates[pending], format = fmt)
  }
  out
}

result_date_time_vector <- c(
  "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
  "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
  "1/3/2021", "2021-01-03 13:12:57", "2021-01-03 13:12:57",
  "2021-01-03 13:12:57", "2021-01-03 13:12:57"
)
# The data only uses y-m-d and m/d/y, so pass exactly those formats; with the
# default order "1/3/2021" would be read as d/m/Y.
conv_dates(gsub(" .*", "", result_date_time_vector), fmts = c("%Y-%m-%d", "%m/%d/%Y"))
#  [1] "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03"
#  [9] "2021-01-03" "2021-01-03" "2021-01-03"
lubridate 包中有函数 parse_date_time:
orders <- c("ymd HMS", "mdy")
lubridate::parse_date_time(x, orders = orders)
# [1] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
# [3] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
# [5] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
# [7] "2021-01-03 00:00:00 UTC" "2021-01-03 13:12:57 UTC"
# [9] "2021-01-03 13:12:57 UTC" "2021-01-03 13:12:57 UTC"
#[11] "2021-01-03 13:12:57 UTC"
数据
x<-'"2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27"
"1/3/2021" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57"'
x <- scan(text=x, what = character())
使用 anytime 包中的函数 anytime:
anytime::anytime(dates)
[1] "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST"
[4] "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST"
[7] "2021-01-03 00:00:00 PST" "2021-01-03 13:12:57 PST" "2021-01-03 13:12:57 PST"
[10] "2021-01-03 13:12:57 PST" "2021-01-03 13:12:57 PST"
数据:
dates <- c("2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
"2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
"1/3/2021 ", "2021-01-03 13:12:57", "2021-01-03 13:12:57", "2021-01-03 13:12:57",
"2021-01-03 13:12:57")
library(lubridate)
# Parse both formats in one vectorised call, then keep only the date part.
# `vector` is a base R function, not data, so the question's input is used.
date(parse_date_time(result_date_time_vector, orders = c('ymd HMS', 'mdy')))
[1] "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03"
[7] "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03"
您也可以尝试 parsedate 软件包:
> result_date_time_vector <- c("2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "1/3/2021", "2021-01-03 13:12:57", "2021-01-03 13:12:57", "2021-01-03 13:12:57", "2021-01-03 13:12:57")
> parsedate::parse_date(result_date_time_vector)
[1] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
[4] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
[7] "2021-01-03 00:00:00 UTC" "2021-01-03 13:12:57 UTC" "2021-01-03 13:12:57 UTC"
[10] "2021-01-03 13:12:57 UTC" "2021-01-03 13:12:57 UTC"
我正在处理一个包含 468006 个元素的向量,每个元素代表一个 date/time,采用两种格式之一
向量片段如下:
> result_date_time_vector[1810:1820]
[1] "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27"
[7] "1/3/2021" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57"
> class(result_date_time_vector)
[1] "character"
我想删除有关时间的信息,然后将元素转换为单一一致的格式。
我尝试了 for 循环,但过程非常缓慢(但没有收到任何错误或警告)
> # Pre-allocate an all-NA Date vector, then parse the elements one at a time.
> fixed_result_date_time <- rep(NA, length(result_date_time_vector))
> class(fixed_result_date_time) <- "Date"
> # seq_along() stays empty for a zero-length input, unlike 1:length(x).
> for (n in seq_along(result_date_time_vector)) {
    if (is.na(result_date_time_vector[n])) {
      # Missing input stays NA in the result.
      next
    } else if (str_detect(result_date_time_vector[n], "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}")) {
      # "YYYY-MM-DD HH:MM:SS" element: parse as date-time, keep only the date.
      fixed_result_date_time[n] <- as_date(ymd_hms(result_date_time_vector[n], tz = "America/New_York"))
    } else {
      # Every other element is parsed as month/day/year.
      fixed_result_date_time[n] <- as_date(mdy(result_date_time_vector[n], tz = "America/New_York"))
    }
  }
我也尝试了 ifelse 函数,过程很快(但收到了很多警告)。
> # Both ymd_hms() and mdy() below are called on the whole vector, so each one
> # is also handed the elements written in the other format.
> fixed_result_date_time <- ifelse(str_detect(result_date_time_vector, "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}"),
                                   as_date(ymd_hms(result_date_time_vector, tz = "America/New_York")),
                                   as_date(mdy(result_date_time_vector, tz = "America/New_York")))
Warning: 20963 failed to parse.
Warning: 447043 failed to parse.
> class(fixed_result_date_time) <- "Date"
输入向量中,m/d/y 格式的元素有 20963 个,y-m-d h:m:s 格式的元素有 447043 个。
是否有更有效的方法可以在没有警告的情况下完成相同的操作?
#' Convert a vector of mixed-format date strings to `Date`.
#'
#' Strings made only of digits and dots are treated as day counts from
#' `origin`. Every other non-missing element is tried against each entry of
#' `fmts`, in order, and keeps the first format that parses it.
#'
#' `strptime()` lets `%Y` consume fewer than four digits, so "1/3/2021" would
#' be accepted by "%Y/%m/%d" as year 1, month 3, day 20. To prevent that, a
#' format built only from `%Y`, `%m`, `%d` and plain separators is applied
#' only to strings whose field widths match it (four-digit year). Formats
#' using anything else are applied without this pre-check.
#'
#' @param dates Character vector of dates; `NA` elements stay `NA`.
#' @param fmts `strptime()`-style formats, tried in the order given.
#' @param origin Origin passed to `as.Date()` for numeric strings.
#' @return A `Date` vector the same length as `dates`.
conv_dates <- function(dates,
                       fmts = c("%Y-%m-%d", "%Y/%m/%d", "%d/%m/%Y", "%m/%d/%Y"),
                       origin = "1900-01-01") {
  # Translate `fmt` into a start-anchored regex, or return NA when the format
  # contains something this translation does not cover.
  strict_pattern <- function(fmt) {
    if (is.na(fmt)) {
      return(NA_character_)
    }
    tokens <- regmatches(fmt, gregexpr("%.|[^%]+", fmt))[[1]]
    if (!identical(paste(tokens, collapse = ""), fmt)) {
      return(NA_character_)
    }
    fields <- c(
      "%Y" = " *[0-9]{4}",
      "%m" = " *[0-9]{1,2}",
      "%d" = " *[0-9]{1,2}"
    )
    is_spec <- startsWith(tokens, "%")
    literals <- tokens[!is_spec]
    if (!all(tokens[is_spec] %in% names(fields))) {
      return(NA_character_)
    }
    # Whitespace has special meaning in strptime formats, and a backslash
    # could terminate the \Q...\E quoting used below.
    if (any(grepl("[[:space:]]", literals)) ||
      any(grepl("\\", literals, fixed = TRUE))) {
      return(NA_character_)
    }
    parts <- character(length(tokens))
    parts[is_spec] <- fields[tokens[is_spec]]
    parts[!is_spec] <- paste0("\\Q", literals, "\\E")
    # Anchored at the start only: trailing text (e.g. a time of day) is
    # ignored by as.Date() and must not block the match.
    paste0("^", paste(parts, collapse = ""))
  }

  out <- rep(as.Date(NA), length(dates))
  present <- !is.na(dates)

  numeric_like <- present & grepl("^[.0-9]+$", dates)
  if (any(numeric_like)) {
    # Strings such as "1.2.3" pass the regex but are not numbers; they stay NA
    # here and fall through to the format loop.
    out[numeric_like] <- suppressWarnings(
      as.Date(as.numeric(dates[numeric_like]), origin = origin)
    )
  }

  for (fmt in fmts) {
    pending <- present & is.na(out)
    if (!any(pending)) break
    guard <- strict_pattern(fmt)
    if (!is.na(guard)) {
      pending[pending] <- grepl(guard, dates[pending], perl = TRUE)
      if (!any(pending)) next
    }
    out[pending] <- as.Date(dates[pending], format = fmt)
  }
  out
}

result_date_time_vector <- c(
  "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
  "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
  "1/3/2021", "2021-01-03 13:12:57", "2021-01-03 13:12:57",
  "2021-01-03 13:12:57", "2021-01-03 13:12:57"
)
# The data only uses y-m-d and m/d/y, so pass exactly those formats; with the
# default order "1/3/2021" would be read as d/m/Y.
conv_dates(gsub(" .*", "", result_date_time_vector), fmts = c("%Y-%m-%d", "%m/%d/%Y"))
#  [1] "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03"
#  [9] "2021-01-03" "2021-01-03" "2021-01-03"
lubridate 包中有函数 parse_date_time:
orders <- c("ymd HMS", "mdy")
lubridate::parse_date_time(x, orders = orders)
# [1] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
# [3] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
# [5] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
# [7] "2021-01-03 00:00:00 UTC" "2021-01-03 13:12:57 UTC"
# [9] "2021-01-03 13:12:57 UTC" "2021-01-03 13:12:57 UTC"
#[11] "2021-01-03 13:12:57 UTC"
数据
x<-'"2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27" "2021-01-03 02:22:27"
"1/3/2021" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57" "2021-01-03 13:12:57"'
x <- scan(text=x, what = character())
使用 anytime 包中的函数 anytime:
anytime::anytime(dates)
[1] "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST"
[4] "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST" "2021-01-03 02:22:27 PST"
[7] "2021-01-03 00:00:00 PST" "2021-01-03 13:12:57 PST" "2021-01-03 13:12:57 PST"
[10] "2021-01-03 13:12:57 PST" "2021-01-03 13:12:57 PST"
数据:
dates <- c("2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
"2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27",
"1/3/2021 ", "2021-01-03 13:12:57", "2021-01-03 13:12:57", "2021-01-03 13:12:57",
"2021-01-03 13:12:57")
library(lubridate)
# Parse both formats in one vectorised call, then keep only the date part.
# `vector` is a base R function, not data, so the question's input is used.
date(parse_date_time(result_date_time_vector, orders = c('ymd HMS', 'mdy')))
[1] "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03"
[7] "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03" "2021-01-03"
您也可以尝试 parsedate 软件包:
> result_date_time_vector <- c("2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "2021-01-03 02:22:27", "1/3/2021", "2021-01-03 13:12:57", "2021-01-03 13:12:57", "2021-01-03 13:12:57", "2021-01-03 13:12:57")
> parsedate::parse_date(result_date_time_vector)
[1] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
[4] "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC" "2021-01-03 02:22:27 UTC"
[7] "2021-01-03 00:00:00 UTC" "2021-01-03 13:12:57 UTC" "2021-01-03 13:12:57 UTC"
[10] "2021-01-03 13:12:57 UTC" "2021-01-03 13:12:57 UTC"