用于从 R 中的时间线手动采样的 for 循环的省时替代方法
Time efficient alternative to for-loops for manual sampling from timelines in R
所以我在全年的x个时间点对一组湖泊进行了采样。我还在水中部署了记录器等,我想在访问 x days/hours 之前的时间点使用这些记录器的每日平均值。有时我也只是抓取访问时间点的样本。
这是我的解决方案,它工作得很好,但由于我对一些模型假设进行了大量试验并进行了敏感性分析,因此运行速度慢得令人不满意。
我似乎已经用循环解决了我的大部分 R 问题,而且我经常遇到更高效的脚本,看到我的代码的一些更有效的替代方案会非常有趣。
下面的代码只是生成了一些虚拟数据..
library(dplyr)
library(lubridate)
do.pct.sat <- function(x,y,z){
t <- x
do <- y
p <- z
atm <- (p*100)/101325
do.sat <- atm*exp(-139.34411+157570.1/(t+273.15)-66423080/(t+273.15)^2+12438000000/(t+273.15)^3-862194900000/(t+273.15)^4)
do.pct.sat <- (do/do.sat)*100
return(do.pct.sat)
}#function for calculating the % oxygen saturation
#here's some dummy date resembling real data
date.initial <- as.POSIXct("2022-06-01")#deployment date
date.end <- as.POSIXct("2022-10-01")#date of retrieval
id <- c("a","b","c")#lake id
lake <- list()#make dataset list for each lake
s <- list()#list of dataframes for the samples from the lake logger timelines
#loop below generates dummy data. this is not part of the real script that I want to improve.
for(i in 1:3){
datetime <- seq(from = date.initial,to = date.end,by=10*60)#10 minute intervals from deploy to retrieve
l <- length(datetime)#vector length of datetime
#set dummy data
do <- rnorm(l,mean = 10,sd=3)#o2 conc.
pressure <- rnorm(l,mean = 980,sd=50)#baro pressure
temp <- rnorm(l,mean=15,sd=5)#water temp
k.z <- rnorm(l,mean=0.35,sd=0.1)#gas exchange koeff / mixed layer depth
dosat.pct <- do.pct.sat(temp,do,pressure)#oxygen sat in %
iso <- as.data.frame(cbind(datetime,do,dosat.pct,temp,pressure,k.z))#bind dummy dataframe to resemble real data
iso$datetime <- as.POSIXct(iso$datetime,origin = "1970-01-01")
lake[[i]] <- iso#save the data frame to the lake logger list
samples <- as.POSIXct(sample((date.initial+5*24*60*60):date.end, 7, replace=FALSE),origin = "1970-01-01")#randomize 7 timepoints
s[[i]] <- as.data.frame(samples)#save it in empty data frame
s[[i]]$lake <- id[i]
}
names(lake) <- id
samples <- bind_rows(s)
samples$samples <- round_date(samples$samples,unit="10 minutes")#rounds my random samples to closest 10 minute
下面是我想要实现的函数(同一个库)。我认为它运行缓慢,因为我一次约会一个,然后再约会下一个;
sample.lakes <- function(average=3){
dts <- list()#empty list
for(i in 1:length(lake)){
print(id[i])
data = lake[[i]]
y <- samples[grepl(id[i],samples$lake),]
dates <- y$samples
#empty vectors to fill with values sampled in loop
avg.kz <- vector()
sd.kz <- vector()
do.mgl <- vector()
dosat.pct <- vector()
temp.c <- vector()
for (k in 1:length(dates)){
print(k)
#below I filter the logger data to contain timepoint of sampling minus number of days I want the average from 'averages'.
prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))
#fill the empty vectors with value I desire, mean and sd k.z and point sample of the other variables.
avg.kz[k] = mean(prior.days$k.z)
sd.kz[k] = sd(prior.days$k.z)
temp.c[k] <- data[grepl(dates[k],data$datetime),]$temp
do.mgl[k] <- data[grepl(dates[k],data$datetime),]$do
dosat.pct[k] <- data[grepl(dates[k],data$datetime),]$dosat.pct
}
sd.kz[is.na(sd.kz)] <- 0
#add them to data frame y
y$dosat.pct <- dosat.pct
y$do.mgl <- do.mgl
y$temp.c <- temp.c
y$avg.kz <- avg.kz
y$sd.kz <- sd.kz
dts[[i]] <- y#add to single-row dataframe
}
iso <- bind_rows(dts)#make a complete dataframe with samples.
return(iso)
}
iso <- sample.lakes(average=4)#do not set average to > 5 in this example script
如果有任何建议,我将不胜感激!
我猜这部分使用 grepl
:
data[grepl(dates[k],data$datetime),]
内部 for
循环很慢。
您不能改为尝试查看日期时间是否与 ==
相同吗?
此外,您只需要 data
子集一次。
试试这个作为替代方法:
for (k in 1:length(dates)){
print(k)
prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))
avg.kz[k] = mean(prior.days$k.z)
sd.kz[k] = sd(prior.days$k.z)
sub_data <- data[data$datetime == dates[k], ]
temp.c[k] <- sub_data$temp
do.mgl[k] <- sub_data$do
dosat.pct[k] <- sub_data$dosat.pct
}
所以我在全年的x个时间点对一组湖泊进行了采样。我还在水中部署了记录器等,我想在访问 x days/hours 之前的时间点使用这些记录器的每日平均值。有时我也只是抓取访问时间点的样本。
这是我的解决方案,它工作得很好,但由于我对一些模型假设进行了大量试验并进行了敏感性分析,因此运行速度慢得令人不满意。
我似乎已经用循环解决了我的大部分 R 问题,而且我经常遇到更高效的脚本,看到我的代码的一些更有效的替代方案会非常有趣。
下面的代码只是生成了一些虚拟数据..
library(dplyr)
library(lubridate)
do.pct.sat <- function(x,y,z){
t <- x
do <- y
p <- z
atm <- (p*100)/101325
do.sat <- atm*exp(-139.34411+157570.1/(t+273.15)-66423080/(t+273.15)^2+12438000000/(t+273.15)^3-862194900000/(t+273.15)^4)
do.pct.sat <- (do/do.sat)*100
return(do.pct.sat)
}#function for calculating the % oxygen saturation
#here's some dummy date resembling real data
date.initial <- as.POSIXct("2022-06-01")#deployment date
date.end <- as.POSIXct("2022-10-01")#date of retrieval
id <- c("a","b","c")#lake id
lake <- list()#make dataset list for each lake
s <- list()#list of dataframes for the samples from the lake logger timelines
#loop below generates dummy data. this is not part of the real script that I want to improve.
for(i in 1:3){
datetime <- seq(from = date.initial,to = date.end,by=10*60)#10 minute intervals from deploy to retrieve
l <- length(datetime)#vector length of datetime
#set dummy data
do <- rnorm(l,mean = 10,sd=3)#o2 conc.
pressure <- rnorm(l,mean = 980,sd=50)#baro pressure
temp <- rnorm(l,mean=15,sd=5)#water temp
k.z <- rnorm(l,mean=0.35,sd=0.1)#gas exchange koeff / mixed layer depth
dosat.pct <- do.pct.sat(temp,do,pressure)#oxygen sat in %
iso <- as.data.frame(cbind(datetime,do,dosat.pct,temp,pressure,k.z))#bind dummy dataframe to resemble real data
iso$datetime <- as.POSIXct(iso$datetime,origin = "1970-01-01")
lake[[i]] <- iso#save the data frame to the lake logger list
samples <- as.POSIXct(sample((date.initial+5*24*60*60):date.end, 7, replace=FALSE),origin = "1970-01-01")#randomize 7 timepoints
s[[i]] <- as.data.frame(samples)#save it in empty data frame
s[[i]]$lake <- id[i]
}
names(lake) <- id
samples <- bind_rows(s)
samples$samples <- round_date(samples$samples,unit="10 minutes")#rounds my random samples to closest 10 minute
下面是我想要实现的函数(同一个库)。我认为它运行缓慢,因为我一次约会一个,然后再约会下一个;
sample.lakes <- function(average=3){
dts <- list()#empty list
for(i in 1:length(lake)){
print(id[i])
data = lake[[i]]
y <- samples[grepl(id[i],samples$lake),]
dates <- y$samples
#empty vectors to fill with values sampled in loop
avg.kz <- vector()
sd.kz <- vector()
do.mgl <- vector()
dosat.pct <- vector()
temp.c <- vector()
for (k in 1:length(dates)){
print(k)
#below I filter the logger data to contain timepoint of sampling minus number of days I want the average from 'averages'.
prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))
#fill the empty vectors with value I desire, mean and sd k.z and point sample of the other variables.
avg.kz[k] = mean(prior.days$k.z)
sd.kz[k] = sd(prior.days$k.z)
temp.c[k] <- data[grepl(dates[k],data$datetime),]$temp
do.mgl[k] <- data[grepl(dates[k],data$datetime),]$do
dosat.pct[k] <- data[grepl(dates[k],data$datetime),]$dosat.pct
}
sd.kz[is.na(sd.kz)] <- 0
#add them to data frame y
y$dosat.pct <- dosat.pct
y$do.mgl <- do.mgl
y$temp.c <- temp.c
y$avg.kz <- avg.kz
y$sd.kz <- sd.kz
dts[[i]] <- y#add to single-row dataframe
}
iso <- bind_rows(dts)#make a complete dataframe with samples.
return(iso)
}
iso <- sample.lakes(average=4)#do not set average to > 5 in this example script
如果有任何建议,我将不胜感激!
我猜这部分使用 grepl
:
data[grepl(dates[k],data$datetime),]
内部 for
循环很慢。
您不能改为尝试查看日期时间是否与 ==
相同吗?
此外,您只需要 data
子集一次。
试试这个作为替代方法:
for (k in 1:length(dates)){
print(k)
prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))
avg.kz[k] = mean(prior.days$k.z)
sd.kz[k] = sd(prior.days$k.z)
sub_data <- data[data$datetime == dates[k], ]
temp.c[k] <- sub_data$temp
do.mgl[k] <- sub_data$do
dosat.pct[k] <- sub_data$dosat.pct
}