在 base R 中快速高效地将多列中的 Year、Julien 和 Time 转换为 POSIXct
Convert Year, Julien, & Time to POSIXct from multiple columns quickly and efficiently in base R
我有很多大型数据帧(超过 500,000 行),它们的日期时间信息分散存储在多个列中。日期不是 MM/DD/YYYY 格式,而是第一列为年份,第二列为儒略日(即一年中的第几天),第三列为时间。数据结构如下:
# Simulated sample data: 10,000 rows. The date-time is split across three
# integer columns (year, day of year, time), followed by two numeric
# measurement columns drawn from uniform distributions.
df <- data.frame(
  YEAR = sample(2000:2020, size = 10000, replace = TRUE),
  JULIEN = sample(1:365, size = 10000, replace = TRUE),
  Time = sample(0:59, size = 10000, replace = TRUE),
  dataVar1 = runif(10000, min = 1.0, max = 10.0),
  dataVar2 = runif(10000, min = 20.0, max = 100.0)
)
到目前为止,我是用下面的方法处理的:
# Build one "YYYY-MM-DD HHMM" string per row of df, then replace the YEAR,
# JULIEN and Time columns with a single POSIXct column named timeR.
#
# NOTE: as.Date(), paste() and formatC() all accept whole vectors, so calling
# them once per row (as this loop does) remains the dominant cost here.

# Preallocate the result instead of growing it with c() on every iteration,
# which re-copies the whole vector each time.
timeR <- character(nrow(df))
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:0 would iterate over c(1, 0) and fail on the missing rows.
for (i in seq_len(nrow(df))) {
  # Counting days from 31 December of the previous year maps day-of-year 1
  # to 1 January of YEAR. Time is zero-padded to four digits so it can be
  # parsed as %H%M below.
  currentTime <- paste(
    as.Date(df$JULIEN[i], origin = paste0(df$YEAR[i] - 1, "-12-31")),
    formatC(df$Time[i], width = 4, format = "d", flag = "0")
  )
  timeR[i] <- currentTime
}
# drop = FALSE keeps the remaining columns as a data frame even if only one
# column is left after removing the three date-time columns.
df <- cbind(timeR, df[, !names(df) %in% c("YEAR", "JULIEN", "Time"), drop = FALSE])
df$timeR <- as.POSIXct(df$timeR, format = "%Y-%m-%d %H%M", tz = "EST")
# Remove only the temporaries that exist (currentTime is never created when
# df has zero rows), so rm() does not warn.
rm(list = intersect(c("timeR", "i", "currentTime"), ls()))
但这需要大量时间。有什么办法可以让它运行得更快吗?谢谢。
paste 和 as.Date 都是向量化的,可以直接作用于整列,无需循环:
# Vectorized conversion: every call below operates on whole columns of df.
# v1 holds one Date per row, counted from 31 December of the previous year
# so that day-of-year 1 becomes 1 January of YEAR.
v1 <- with(df, as.Date(JULIEN, origin = paste0(YEAR - 1, "-12-31")))
# currentTime is a character vector: the date, a space, then Time
# zero-padded to four digits.
currentTime <- paste(
  v1,
  with(df, formatC(Time, flag = "0", format = "d", width = 4))
)
基准测试:
# Benchmark 1: time the original row-by-row loop.
# Each iteration converts a single row and appends the resulting string to
# timeR with c(), so timeR is re-created on every pass.
# Presumably run against the 10,000-row sample df defined earlier - confirm.
system.time({
timeR<-vector()
for (i in 1:dim(df)[1]){
currentTime<-paste(as.Date(df$JULIEN[i], origin=paste(df$YEAR[i]-1,"-12-31", sep = "")),formatC(df$Time[i], width = 4, format = "d", flag = "0"))
timeR<-c(timeR,currentTime)
}})
# Recorded system.time() output for the loop version:
# user system elapsed
# 1.300 0.061 1.366
# Benchmark 2: time the vectorized version, which calls as.Date(), paste()
# and formatC() once on whole columns instead of once per row.
system.time({
v1 <- as.Date(df$JULIEN, origin = paste0(df$YEAR-1,"-12-31"))
currentTime <- paste(v1, formatC(df$Time, width = 4, format = "d", flag = "0"))
})
# Recorded system.time() output for the vectorized version:
# user system elapsed
# 0.076 0.004 0.080
我有很多大型数据帧(超过 500,000 行),它们的日期时间信息分散存储在多个列中。日期不是 MM/DD/YYYY 格式,而是第一列为年份,第二列为儒略日(即一年中的第几天),第三列为时间。数据结构如下:
# Simulated sample data: 10,000 rows. The date-time is split across three
# integer columns (year, day of year, time), followed by two numeric
# measurement columns drawn from uniform distributions.
df <- data.frame(
  YEAR = sample(2000:2020, size = 10000, replace = TRUE),
  JULIEN = sample(1:365, size = 10000, replace = TRUE),
  Time = sample(0:59, size = 10000, replace = TRUE),
  dataVar1 = runif(10000, min = 1.0, max = 10.0),
  dataVar2 = runif(10000, min = 20.0, max = 100.0)
)
到目前为止,我是用下面的方法处理的:
# Build one "YYYY-MM-DD HHMM" string per row of df, then replace the YEAR,
# JULIEN and Time columns with a single POSIXct column named timeR.
#
# NOTE: as.Date(), paste() and formatC() all accept whole vectors, so calling
# them once per row (as this loop does) remains the dominant cost here.

# Preallocate the result instead of growing it with c() on every iteration,
# which re-copies the whole vector each time.
timeR <- character(nrow(df))
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:0 would iterate over c(1, 0) and fail on the missing rows.
for (i in seq_len(nrow(df))) {
  # Counting days from 31 December of the previous year maps day-of-year 1
  # to 1 January of YEAR. Time is zero-padded to four digits so it can be
  # parsed as %H%M below.
  currentTime <- paste(
    as.Date(df$JULIEN[i], origin = paste0(df$YEAR[i] - 1, "-12-31")),
    formatC(df$Time[i], width = 4, format = "d", flag = "0")
  )
  timeR[i] <- currentTime
}
# drop = FALSE keeps the remaining columns as a data frame even if only one
# column is left after removing the three date-time columns.
df <- cbind(timeR, df[, !names(df) %in% c("YEAR", "JULIEN", "Time"), drop = FALSE])
df$timeR <- as.POSIXct(df$timeR, format = "%Y-%m-%d %H%M", tz = "EST")
# Remove only the temporaries that exist (currentTime is never created when
# df has zero rows), so rm() does not warn.
rm(list = intersect(c("timeR", "i", "currentTime"), ls()))
但这需要大量时间。有什么办法可以让它运行得更快吗?谢谢。
paste 和 as.Date 都是向量化的,可以直接作用于整列,无需循环:
# Vectorized conversion: every call below operates on whole columns of df.
# v1 holds one Date per row, counted from 31 December of the previous year
# so that day-of-year 1 becomes 1 January of YEAR.
v1 <- with(df, as.Date(JULIEN, origin = paste0(YEAR - 1, "-12-31")))
# currentTime is a character vector: the date, a space, then Time
# zero-padded to four digits.
currentTime <- paste(
  v1,
  with(df, formatC(Time, flag = "0", format = "d", width = 4))
)
基准测试:
# Benchmark 1: time the original row-by-row loop.
# Each iteration converts a single row and appends the resulting string to
# timeR with c(), so timeR is re-created on every pass.
# Presumably run against the 10,000-row sample df defined earlier - confirm.
system.time({
timeR<-vector()
for (i in 1:dim(df)[1]){
currentTime<-paste(as.Date(df$JULIEN[i], origin=paste(df$YEAR[i]-1,"-12-31", sep = "")),formatC(df$Time[i], width = 4, format = "d", flag = "0"))
timeR<-c(timeR,currentTime)
}})
# Recorded system.time() output for the loop version:
# user system elapsed
# 1.300 0.061 1.366
# Benchmark 2: time the vectorized version, which calls as.Date(), paste()
# and formatC() once on whole columns instead of once per row.
system.time({
v1 <- as.Date(df$JULIEN, origin = paste0(df$YEAR-1,"-12-31"))
currentTime <- paste(v1, formatC(df$Time, width = 4, format = "d", flag = "0"))
})
# Recorded system.time() output for the vectorized version:
# user system elapsed
# 0.076 0.004 0.080