R 每周平均
R weekly averaging
我有一组 34 年的网格化海面温度每日值(12418 个每日文件 x 4248 个点),打算据此计算每周值。按照这个 post 的做法,我几乎成功了,但是日期和星期的数量之间存在不一致。我找不到问题所在,我想确认用于计算每周平均值的日期是正确的。
我使用我的这段 R 脚本读取每日数据并构建一个大数据框,其中包含列中单个点的所有每日值(12418 rows/days x 4248 columns/temperature)
# Paths
# Directory that holds the daily SST CSV files read below.
ruta_datos_diarios <- "/home/meteo/PROJECTES/VERSUS/DATA/SST/CSV/"
ruta_files <- "/home/meteo/PROJECTES/VERSUS/SCRIPTS/CLUSTER/FILES/"
ruta_eixida <- "/home/meteo/PROJECTES/VERSUS/OUTPUT/DATA/SEMANAL/"
# List of daily files
files <- list.files(path = ruta_datos_diarios, pattern = "SST-diaria-MED")
# Preallocate: one row per daily file, 4248 SST values per row
output <- matrix(ncol = 4248, nrow = length(files))
# One date string per daily file
fechas <- matrix(ncol = 1, nrow = length(files))
# seq_along() keeps the loop from running when no file matches the pattern
for (i in seq_along(files)) {
  # Read the daily file and keep only the rows without missing values
  datos <- read.csv(
    paste0(ruta_datos_diarios, files[i]),
    header = TRUE,
    na.strings = "NA"
  )
  datos <- datos[complete.cases(datos), ]
  # Extract the date from the file name (characters 16-19, 20-21 and 22-23)
  yyyy <- substr(files[i], 16, 19)
  mm <- substr(files[i], 20, 21)
  dd <- substr(files[i], 22, 23)
  # Store the date string in `fechas`, the matrix preallocated above
  fechas[i, ] <- paste0(yyyy, "-", mm, "-", dd)
  output[i, ] <- datos$sst
}
datos.df <- as.data.frame(output)
# Build the date table: calendar day plus its week number and year.
# NOTE(review): week() and year() are not base R; presumably lubridate is
# loaded elsewhere - confirm.
fechas <- as.data.frame(fechas)
fechas[["V1"]] <- as.Date(fechas[["V1"]])
fechas[["Week"]] <- week(fechas[["V1"]])
fechas[["Year"]] <- year(fechas[["V1"]])
# Day of the week as a number (0 = Sunday, ..., 6 = Saturday)
fechas[["Week_Day"]] <- as.numeric(format(fechas[["V1"]], format = "%w"))
# Week-ending date: the first Saturday on or after each day
fechas[["End_of_Week"]] <- fechas[["V1"]] + (6 - fechas[["Week_Day"]])
# Apply the final column names first so the weekly table inherits them
names(fechas) <- c("Day", "Week", "Year", "Week_Day", "End_of_Week")
# One row per week: keep the first day seen for each week-ending date
fechas.semana <- fechas[!duplicated(fechas[["End_of_Week"]]), ]
这就是我读取数据和日期的方式。为了保持一个简短的例子,我在这个文件 temp-sst.csv 中保存了数据框的一个子集(1000 个观察值,共 10 个变量,包括 "Day"、"Week"、"Year"、"Week_Day","End_of_Week").
# Load the sample subset and put the date columns in front of its columns
sst.dat <- read.csv("temp-dat.csv", header = TRUE)
sst.dat <- cbind(fechas, sst.dat)
# Rebuild the date table from the Day column of the joined data
fechas <- as.data.frame(sst.dat$Day)
names(fechas) <- "Day"
fechas[["Day"]] <- as.Date(fechas[["Day"]])
# NOTE(review): week() and year() are not base R; presumably lubridate is
# loaded elsewhere - confirm.
fechas[["Week"]] <- week(fechas[["Day"]])
fechas[["Year"]] <- year(fechas[["Day"]])
# Day of the week as a number (0 = Sunday, ..., 6 = Saturday)
fechas[["Week_Day"]] <- as.numeric(format(fechas[["Day"]], format = "%w"))
# Week-ending date: the first Saturday on or after each day
fechas[["End_of_Week"]] <- fechas[["Day"]] + (6 - fechas[["Week_Day"]])
# The columns are created as Day, Week, Year, Week_Day, End_of_Week, so no
# further renaming is needed for either table.
# One row per week: keep the first day seen for each week-ending date
fechas.semana <- fechas[!duplicated(fechas[["End_of_Week"]]), ]
# Weekly aggregation function from the referred post.
# Averages one column of `x` within each End_of_Week / Year combination.
#   x      : data frame containing End_of_Week and Year columns
#   column : index (or name) of the column of `x` to average
# Returns a data frame with columns End_of_Week, Year and SSTmean, one row
# per End_of_Week / Year combination present in the data.
media.semanal <- function(x, column) {
  promedios <- aggregate(
    x[, column] ~ End_of_Week + Year,
    data = x,
    FUN = mean,
    na.rm = TRUE
  )
  names(promedios) <- c("End_of_Week", "Year", "SSTmean")
  promedios
}
# Matrix to be populated by the weekly function: one row per week in
# fechas.semana, one column per non-date column of sst.dat
# (the first 5 columns of sst.dat are the date columns).
SST.mat <- matrix(
  nrow = nrow(fechas.semana),
  ncol = length(sst.dat) - 5
)
# Start at column 6 to skip the date columns
for (j in 6:length(sst.dat)) {
  b <- media.semanal(sst.dat, j)
  SST.mat[, j - 5] <- b[["SSTmean"]]
}
但是问题来了。循环中的 "b" 数据帧有 145 行,而 SST.mat 和 fechas.semana 只有 144 行。我还没有找到分歧所在。
任何帮助将不胜感激,我被困在这里。
谢谢
b$End_of_Week 中有一个值是重复的。
首先我注意到集合成员没有区别:
# Week-ending dates that appear in b but not in fechas.semana
setdiff(as.character(b$End_of_Week),as.character(fechas.semana$End_of_Week))
character(0)
然后我意识到这一定是因为重复,并这样确认:
# Count the End_of_Week values of b that occur more than once (TRUE)
# versus exactly once (FALSE)
table(table(as.character(b$End_of_Week))>1)
FALSE  TRUE
  143     1
查看该 table 可知重复项是 1983-01-01。
看来根本原因是您按 End_of_Week + Year 进行汇总,其中 Year 是不必要的,因为 End_of_Week 中已经包含年份(以 1983-01-01 结束的那一周同时包含 1982 年和 1983 年的日期,因此按 Year 分组会把它拆成两行);如果您仅按 End_of_Week 进行汇总,就会得到 144 行而不是 145 行。
# Weekly aggregation function from the referred post, grouping by the
# week-ending date only.
#   x      : data frame containing an End_of_Week column
#   column : index (or name) of the column of `x` to average
# Returns a data frame with columns End_of_Week and SSTmean, one row per
# distinct End_of_Week value.
media.semanal <- function(x, column) {
  promedios <- aggregate(
    x[, column] ~ End_of_Week,
    data = x,
    FUN = mean,
    na.rm = TRUE
  )
  names(promedios) <- c("End_of_Week", "SSTmean")
  promedios
}
# Matrix to be populated by the weekly function: one row per week in
# fechas.semana, one column per non-date column of sst.dat
# (the first 5 columns of sst.dat are the date columns).
SST.mat <- matrix(
  nrow = nrow(fechas.semana),
  ncol = length(sst.dat) - 5
)
# Start at column 6 to skip the date columns
for (j in 6:length(sst.dat)) {
  b <- media.semanal(sst.dat, j)
  SST.mat[, j - 5] <- b[["SSTmean"]]
}
# Dimensions of the last weekly table produced by the loop
dim(b)
144 2
我有一组 34 年的网格化海面温度每日值(12418 个每日文件 x 4248 个点),打算据此计算每周值。按照这个 post 的做法,我几乎成功了,但是日期和星期的数量之间存在不一致。我找不到问题所在,我想确认用于计算每周平均值的日期是正确的。
我使用我的这段 R 脚本读取每日数据并构建一个大数据框,其中包含列中单个点的所有每日值(12418 rows/days x 4248 columns/temperature)
# Paths
# Directory that holds the daily SST CSV files read below.
ruta_datos_diarios <- "/home/meteo/PROJECTES/VERSUS/DATA/SST/CSV/"
ruta_files <- "/home/meteo/PROJECTES/VERSUS/SCRIPTS/CLUSTER/FILES/"
ruta_eixida <- "/home/meteo/PROJECTES/VERSUS/OUTPUT/DATA/SEMANAL/"
# List of daily files
files <- list.files(path = ruta_datos_diarios, pattern = "SST-diaria-MED")
# Preallocate: one row per daily file, 4248 SST values per row
output <- matrix(ncol = 4248, nrow = length(files))
# One date string per daily file
fechas <- matrix(ncol = 1, nrow = length(files))
# seq_along() keeps the loop from running when no file matches the pattern
for (i in seq_along(files)) {
  # Read the daily file and keep only the rows without missing values
  datos <- read.csv(
    paste0(ruta_datos_diarios, files[i]),
    header = TRUE,
    na.strings = "NA"
  )
  datos <- datos[complete.cases(datos), ]
  # Extract the date from the file name (characters 16-19, 20-21 and 22-23)
  yyyy <- substr(files[i], 16, 19)
  mm <- substr(files[i], 20, 21)
  dd <- substr(files[i], 22, 23)
  # Store the date string in `fechas`, the matrix preallocated above
  fechas[i, ] <- paste0(yyyy, "-", mm, "-", dd)
  output[i, ] <- datos$sst
}
datos.df <- as.data.frame(output)
# Build the date table: calendar day plus its week number and year.
# NOTE(review): week() and year() are not base R; presumably lubridate is
# loaded elsewhere - confirm.
fechas <- as.data.frame(fechas)
fechas[["V1"]] <- as.Date(fechas[["V1"]])
fechas[["Week"]] <- week(fechas[["V1"]])
fechas[["Year"]] <- year(fechas[["V1"]])
# Day of the week as a number (0 = Sunday, ..., 6 = Saturday)
fechas[["Week_Day"]] <- as.numeric(format(fechas[["V1"]], format = "%w"))
# Week-ending date: the first Saturday on or after each day
fechas[["End_of_Week"]] <- fechas[["V1"]] + (6 - fechas[["Week_Day"]])
# Apply the final column names first so the weekly table inherits them
names(fechas) <- c("Day", "Week", "Year", "Week_Day", "End_of_Week")
# One row per week: keep the first day seen for each week-ending date
fechas.semana <- fechas[!duplicated(fechas[["End_of_Week"]]), ]
这就是我读取数据和日期的方式。为了保持一个简短的例子,我在这个文件 temp-sst.csv 中保存了数据框的一个子集(1000 个观察值,共 10 个变量,包括 "Day"、"Week"、"Year"、"Week_Day","End_of_Week").
# Load the sample subset and put the date columns in front of its columns
sst.dat <- read.csv("temp-dat.csv", header = TRUE)
sst.dat <- cbind(fechas, sst.dat)
# Rebuild the date table from the Day column of the joined data
fechas <- as.data.frame(sst.dat$Day)
names(fechas) <- "Day"
fechas[["Day"]] <- as.Date(fechas[["Day"]])
# NOTE(review): week() and year() are not base R; presumably lubridate is
# loaded elsewhere - confirm.
fechas[["Week"]] <- week(fechas[["Day"]])
fechas[["Year"]] <- year(fechas[["Day"]])
# Day of the week as a number (0 = Sunday, ..., 6 = Saturday)
fechas[["Week_Day"]] <- as.numeric(format(fechas[["Day"]], format = "%w"))
# Week-ending date: the first Saturday on or after each day
fechas[["End_of_Week"]] <- fechas[["Day"]] + (6 - fechas[["Week_Day"]])
# The columns are created as Day, Week, Year, Week_Day, End_of_Week, so no
# further renaming is needed for either table.
# One row per week: keep the first day seen for each week-ending date
fechas.semana <- fechas[!duplicated(fechas[["End_of_Week"]]), ]
# Weekly aggregation function from the referred post.
# Averages one column of `x` within each End_of_Week / Year combination.
#   x      : data frame containing End_of_Week and Year columns
#   column : index (or name) of the column of `x` to average
# Returns a data frame with columns End_of_Week, Year and SSTmean, one row
# per End_of_Week / Year combination present in the data.
media.semanal <- function(x, column) {
  promedios <- aggregate(
    x[, column] ~ End_of_Week + Year,
    data = x,
    FUN = mean,
    na.rm = TRUE
  )
  names(promedios) <- c("End_of_Week", "Year", "SSTmean")
  promedios
}
# Matrix to be populated by the weekly function: one row per week in
# fechas.semana, one column per non-date column of sst.dat
# (the first 5 columns of sst.dat are the date columns).
SST.mat <- matrix(
  nrow = nrow(fechas.semana),
  ncol = length(sst.dat) - 5
)
# Start at column 6 to skip the date columns
for (j in 6:length(sst.dat)) {
  b <- media.semanal(sst.dat, j)
  SST.mat[, j - 5] <- b[["SSTmean"]]
}
但是问题来了。循环中的 "b" 数据帧有 145 行,而 SST.mat 和 fechas.semana 只有 144 行。我还没有找到分歧所在。
任何帮助将不胜感激,我被困在这里。 谢谢
b$End_of_Week 中有一个值是重复的。
首先我注意到集合成员没有区别:
# Week-ending dates that appear in b but not in fechas.semana
setdiff(as.character(b$End_of_Week),as.character(fechas.semana$End_of_Week))
character(0)
然后我意识到这一定是因为重复,并这样确认:
# Count the End_of_Week values of b that occur more than once (TRUE)
# versus exactly once (FALSE)
table(table(as.character(b$End_of_Week))>1)
FALSE  TRUE
  143     1
查看该 table 可知重复项是 1983-01-01。
看来根本原因是您按 End_of_Week + Year 进行汇总,其中 Year 是不必要的,因为 End_of_Week 中已经包含年份(以 1983-01-01 结束的那一周同时包含 1982 年和 1983 年的日期,因此按 Year 分组会把它拆成两行);如果您仅按 End_of_Week 进行汇总,就会得到 144 行而不是 145 行。
# Weekly aggregation function from the referred post, grouping by the
# week-ending date only.
#   x      : data frame containing an End_of_Week column
#   column : index (or name) of the column of `x` to average
# Returns a data frame with columns End_of_Week and SSTmean, one row per
# distinct End_of_Week value.
media.semanal <- function(x, column) {
  promedios <- aggregate(
    x[, column] ~ End_of_Week,
    data = x,
    FUN = mean,
    na.rm = TRUE
  )
  names(promedios) <- c("End_of_Week", "SSTmean")
  promedios
}
# Matrix to be populated by the weekly function: one row per week in
# fechas.semana, one column per non-date column of sst.dat
# (the first 5 columns of sst.dat are the date columns).
SST.mat <- matrix(
  nrow = nrow(fechas.semana),
  ncol = length(sst.dat) - 5
)
# Start at column 6 to skip the date columns
for (j in 6:length(sst.dat)) {
  b <- media.semanal(sst.dat, j)
  SST.mat[, j - 5] <- b[["SSTmean"]]
}
# Dimensions of the last weekly table produced by the loop
dim(b)
144 2