在 R 中将大量字符串批量转换为类似日期格式的字符串
Convert a large scale characters to date-format-like characters in r
我有一个包含 1000 万行的数据框 df
。我想将 "birthday" 列的字符格式从 "xxxxxxxx" 转换为 "xxxx-xx-xx"。例如。从“20051023”到“2005-10-23”。我可以使用 df$birthday <- lapply(df$birthday, as.Date, "%Y%m%d")
来做到这一点,但它会浪费大量内存和计算时间来进行数据转换。但是,我只想将其转换为类似日期的字符,而不是日期类型。所以我使用 stringi
包,因为它是用 C 语言编写的。不幸的是,df$birthday <- stri_join(stri_sub(df$birthday, from=c(1,5,7), to=c(4,6,8)), collapse = "-")
不起作用,因为该函数不支持矢量输入。有什么办法可以解决这个问题吗?非常感谢。
使用 sub() 即可。
# Insert dashes into "YYYYMMDD" strings with one vectorised regex
# substitution; the result stays a character vector (no Date conversion).
date <- c("20051023", "20151023")
# Backslashes must be doubled inside an R string literal: "\\d" reaches the
# regex engine as \d, and "\\1" as the back-reference \1.
sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", date)
# [1] "2005-10-23" "2015-10-23"
as.Date
适用于向量
# as.Date() is vectorised, so the whole column is parsed in a single call;
# format() then turns the Date values back into "YYYY-MM-DD" strings.
df$birthday <- format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m-%d")
矢量化函数比 apply 系列函数(lapply/sapply)快得多
# Benchmark: per-element lapply()/sapply() calls to as.Date() versus one
# vectorised as.Date() call on the whole column.
library(microbenchmark)
# Number of rows in the synthetic test column.
n <- 1e3
# Column `birthday` holds the same "YYYYMMDD" string repeated n times.
df <- data.frame(birthday = rep("20051023", n))
microbenchmark(
# as.Date() called once per element via lapply()
lapply(df$birthday, as.Date, "%Y%m%d"),
# as.Date() called once per element via sapply()
sapply(df$birthday, as.Date, "%Y%m%d"),
# a single as.Date() call on the whole vector
as.Date(df$birthday, "%Y%m%d")
)
Unit: microseconds
expr min lq mean median uq max neval cld
lapply(df$birthday, as.Date, "%Y%m%d") 22833.624 25340.118 29064.7188 28406.154 32346.245 58522.360 100 b
sapply(df$birthday, as.Date, "%Y%m%d") 24048.493 26252.660 29797.9074 28437.156 33119.381 47966.133 100 b
as.Date(df$birthday, "%Y%m%d") 431.469 447.719 481.5221 461.189 475.086 1984.158 100 a
当然,正则表达式甚至更快。
# Benchmark three ways of producing "YYYY-MM-DD" strings from "YYYYMMDD":
# Date + as.character(), Date + format(), and a plain regex substitution.
microbenchmark(
  as.character(as.Date(df$birthday, "%Y%m%d")),
  # Output format corrected to "%Y-%m-%d" (it was mistyped as "%Y-%m%-d");
  # the timing table that follows was recorded with the mistyped format.
  format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m-%d"),
  # Backslashes are doubled because the pattern is an R string literal.
  sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df$birthday)
)
Unit: microseconds
expr min lq mean
as.character(as.Date(df$birthday, "%Y%m%d")) 4923.189 5057.462 5390.313
format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m%-d") 3428.657 3553.736 3697.660
sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df$birthday) 713.699 739.997 815.737
median uq max neval cld
5150.0420 5394.4265 8225.270 100 c
3594.7875 3665.9865 5753.200 100 b
763.0885 783.1865 2433.585 100 a
sub()
适用于矩阵,但不适用于 data.frame。因此需要先用 as.matrix() 进行转换。
# sub() accepts a matrix but not a data.frame, so the date columns are
# converted with as.matrix() and the result is assigned back into them.
df <- as.data.frame(matrix("20051023", ncol = 3, nrow = 3))
df$ID <- seq_len(nrow(df))
# Backslashes are doubled because the pattern is an R string literal.
df[, 1:3] <- sub(
  "^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3",
  as.matrix(df[, 1:3])
)
矩阵方案比 for 循环更快。需要循环处理的列数越多,差距越大。
# Benchmark: one sub() call on the matrix of all date columns versus a
# for loop that calls sub() once per column.
df <- as.data.frame(matrix("20051023", ncol = 20, nrow = 3))
df$ID <- seq_len(nrow(df))
library(microbenchmark)
# NOTE(review): both expressions assign back into df, so it looks like only
# the first evaluation sees unconverted "YYYYMMDD" values - confirm.
microbenchmark(
  # Backslashes are doubled because the patterns are R string literals.
  matrix = df[, seq_len(ncol(df) - 1)] <- sub(
    "^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3",
    as.matrix(df[, seq_len(ncol(df) - 1)])
  ),
  forloop = {
    for (i in seq_len(ncol(df) - 1)) {
      df[, i] <- sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df[, i])
    }
  }
)
Unit: microseconds
expr min lq mean median uq max neval cld
matrix 460.555 476.805 504.3012 494.1235 507.594 1122.522 100 a
forloop 1554.425 1590.774 1677.3038 1625.8390 1670.312 3563.845 100 b
我有一个包含 1000 万行的数据框 df
。我想将 "birthday" 列的字符格式从 "xxxxxxxx" 转换为 "xxxx-xx-xx"。例如。从“20051023”到“2005-10-23”。我可以使用 df$birthday <- lapply(df$birthday, as.Date, "%Y%m%d")
来做到这一点,但它会浪费大量内存和计算时间来进行数据转换。但是,我只想将其转换为类似日期的字符,而不是日期类型。所以我使用 stringi
包,因为它是用 C 语言编写的。不幸的是,df$birthday <- stri_join(stri_sub(df$birthday, from=c(1,5,7), to=c(4,6,8)), collapse = "-")
不起作用,因为该函数不支持矢量输入。有什么办法可以解决这个问题吗?非常感谢。
使用 sub() 即可。
# Insert dashes into "YYYYMMDD" strings with one vectorised regex
# substitution; the result stays a character vector (no Date conversion).
date <- c("20051023", "20151023")
# Backslashes must be doubled inside an R string literal: "\\d" reaches the
# regex engine as \d, and "\\1" as the back-reference \1.
sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", date)
# [1] "2005-10-23" "2015-10-23"
as.Date
适用于向量
# as.Date() is vectorised, so the whole column is parsed in a single call;
# format() then turns the Date values back into "YYYY-MM-DD" strings.
df$birthday <- format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m-%d")
矢量化函数比 apply 系列函数(lapply/sapply)快得多
# Benchmark: per-element lapply()/sapply() calls to as.Date() versus one
# vectorised as.Date() call on the whole column.
library(microbenchmark)
# Number of rows in the synthetic test column.
n <- 1e3
# Column `birthday` holds the same "YYYYMMDD" string repeated n times.
df <- data.frame(birthday = rep("20051023", n))
microbenchmark(
# as.Date() called once per element via lapply()
lapply(df$birthday, as.Date, "%Y%m%d"),
# as.Date() called once per element via sapply()
sapply(df$birthday, as.Date, "%Y%m%d"),
# a single as.Date() call on the whole vector
as.Date(df$birthday, "%Y%m%d")
)
Unit: microseconds
expr min lq mean median uq max neval cld
lapply(df$birthday, as.Date, "%Y%m%d") 22833.624 25340.118 29064.7188 28406.154 32346.245 58522.360 100 b
sapply(df$birthday, as.Date, "%Y%m%d") 24048.493 26252.660 29797.9074 28437.156 33119.381 47966.133 100 b
as.Date(df$birthday, "%Y%m%d") 431.469 447.719 481.5221 461.189 475.086 1984.158 100 a
当然,正则表达式甚至更快。
# Benchmark three ways of producing "YYYY-MM-DD" strings from "YYYYMMDD":
# Date + as.character(), Date + format(), and a plain regex substitution.
microbenchmark(
  as.character(as.Date(df$birthday, "%Y%m%d")),
  # Output format corrected to "%Y-%m-%d" (it was mistyped as "%Y-%m%-d");
  # the timing table that follows was recorded with the mistyped format.
  format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m-%d"),
  # Backslashes are doubled because the pattern is an R string literal.
  sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df$birthday)
)
Unit: microseconds
expr min lq mean
as.character(as.Date(df$birthday, "%Y%m%d")) 4923.189 5057.462 5390.313
format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m%-d") 3428.657 3553.736 3697.660
sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df$birthday) 713.699 739.997 815.737
median uq max neval cld
5150.0420 5394.4265 8225.270 100 c
3594.7875 3665.9865 5753.200 100 b
763.0885 783.1865 2433.585 100 a
sub()
适用于矩阵,但不适用于 data.frame。因此需要先用 as.matrix() 进行转换。
# sub() accepts a matrix but not a data.frame, so the date columns are
# converted with as.matrix() and the result is assigned back into them.
df <- as.data.frame(matrix("20051023", ncol = 3, nrow = 3))
df$ID <- seq_len(nrow(df))
# Backslashes are doubled because the pattern is an R string literal.
df[, 1:3] <- sub(
  "^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3",
  as.matrix(df[, 1:3])
)
矩阵方案比 for 循环更快。需要循环处理的列数越多,差距越大。
# Benchmark: one sub() call on the matrix of all date columns versus a
# for loop that calls sub() once per column.
df <- as.data.frame(matrix("20051023", ncol = 20, nrow = 3))
df$ID <- seq_len(nrow(df))
library(microbenchmark)
# NOTE(review): both expressions assign back into df, so it looks like only
# the first evaluation sees unconverted "YYYYMMDD" values - confirm.
microbenchmark(
  # Backslashes are doubled because the patterns are R string literals.
  matrix = df[, seq_len(ncol(df) - 1)] <- sub(
    "^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3",
    as.matrix(df[, seq_len(ncol(df) - 1)])
  ),
  forloop = {
    for (i in seq_len(ncol(df) - 1)) {
      df[, i] <- sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df[, i])
    }
  }
)
Unit: microseconds
expr min lq mean median uq max neval cld
matrix 460.555 476.805 504.3012 494.1235 507.594 1122.522 100 a
forloop 1554.425 1590.774 1677.3038 1625.8390 1670.312 3563.845 100 b