避免对 cumsum 使用 for 循环
Avoiding the use of for loop for cumsum
首先生成一些示例数据:
# Example data: one uniform random value in [0, 10] per day for 2000 and 2001.
set.seed(1)
doy <- rep(seq_len(365), times = 2)
year <- rep(c(2000L, 2001L), each = 365)
value <- runif(n = 2 * 365, min = 0, max = 10)

# Starting days of year to accumulate from, and the cumulative-sum threshold.
doy.range <- c(40, 50, 60, 80)
thres <- 200

# cbind() goes through a numeric matrix, so every column ends up as double.
df <- data.frame(cbind(doy, year, value))
我想做的是:
对于 df$year == 2000,从 doy.range == 40 开始累加 df$value,并找出 df$value 的累加和 >= thres 时对应的 df$doy。
下面是我用一段较长的 for 循环实现这一目标的代码:
# Result matrix with one row per (year, starting doy) combination:
#   column 1 = year
#   column 2 = starting doy
#   column 3 = first doy at which the cumulative sum of value reaches thres
# Row counts and repetition factors are derived from the inputs so the layout
# stays correct for any number of years or starting days.
years <- unique(df$year)
mat <- matrix(NA_real_, nrow = length(doy.range) * length(years), ncol = 3)
mat[, 1] <- rep(years, each = length(doy.range))
mat[, 2] <- rep(doy.range, times = length(years))

for (i in years) {
  dat <- df[df$year == i, ]
  for (j in doy.range) {
    # Accumulate value from the starting doy onwards.
    dat1 <- dat[dat$doy >= j, ]
    dat1$cum.sum <- cumsum(dat1$value)
    # First doy whose running total is >= thres; NA when no row qualifies.
    day.thres <- dat1[dat1$cum.sum >= thres, "doy"][1]
    mat[mat[, 2] == j & mat[, 1] == i, 3] <- day.thres
  }
}
这个循环在矩阵的第三列中给出了 value 的累加和达到或超过 thres 时对应的 doy。
不过,我真的很想避免循环。有什么方法可以用更少的代码做到这一点吗?
如果我没理解错,你可以使用 dplyr。假设阈值为 200:
library(dplyr)

# For each year, accumulate value from doy 40 onwards and keep the first day
# on which the running total reaches 200. The result stays grouped by year.
df %>%
  group_by(year) %>%
  filter(doy >= 40) %>%
  mutate(CumSum = cumsum(value)) %>%
  filter(CumSum >= 200) %>%
  # slice(1) keeps the first crossing in row order; top_n(-1, CumSum) would
  # instead keep the smallest qualifying CumSum and every row tied with it.
  slice(1)
产生
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 78 2000 3.899895 201.4864
2 75 2001 9.205178 204.3171
我想这里用到的 dplyr 动词应该都不言自明;如有不清楚的地方,请告诉我。
对于不同的起始 doy,可以创建一个函数并配合 lapply 使用:
# Return, for each year, the first row at which the cumulative sum of value
# (accumulated from doy.range onwards) reaches thres.
#   doy.range: a single starting day of year
#   thres:     cumulative-sum threshold (defaults to 200)
# Reads the data frame df from the enclosing environment; the result stays
# grouped by year.
f <- function(doy.range, thres = 200) {
  df %>%
    group_by(year) %>%
    filter(doy >= doy.range) %>%
    mutate(CumSum = cumsum(value)) %>%
    filter(CumSum >= thres) %>%
    # First crossing in row order, matching the `[1]` of the loop version.
    slice(1)
}

# One result tibble per starting doy.
lapply(doy.range, f)
[[1]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 78 2000 3.899895 201.4864
2 75 2001 9.205178 204.3171
[[2]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 89 2000 2.454885 200.2998
2 91 2001 6.578281 200.6544
[[3]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 98 2000 4.100841 200.5048
2 102 2001 7.158333 200.3770
[[4]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 120 2000 6.401010 204.9951
2 120 2001 5.884192 200.8252
思路是创建一个函数,根据给定的起始 doy 和阈值提取相关信息;然后把该函数应用到起始 doy 与阈值的各种组合上,得到一个包含全部相关信息的数据集:
# Example data: two years of daily values drawn uniformly from [0, 10].
set.seed(1)
doy <- rep(seq_len(365), times = 2)
year <- rep(c(2000L, 2001L), each = 365)
value <- runif(n = 2 * 365, min = 0, max = 10)
df <- data.frame(doy = doy, year = year, value = value)
library(dplyr)
library(purrr)
# Find, per year, the first day on which the running total of value
# (accumulated from day dr onwards) reaches the threshold t.
#   dr: starting day of year; rows with doy < dr are ignored
#   t:  cumulative-sum threshold
# Returns one row per year that reaches t, with columns
# doy_input, thres_input, year, doy, CumSumValue.
# Reads the data frame df from the enclosing environment.
f <- function(dr, t) {
  from_start <- filter(df, doy >= dr)

  first_hit <- from_start %>%
    group_by(year) %>%
    mutate(CumSumValue = cumsum(value)) %>%
    filter(CumSumValue >= t) %>%
    slice(1) %>%
    ungroup()

  # Record the inputs next to the result and fix the column order;
  # value is dropped because it is not selected.
  first_hit %>%
    mutate(doy_input = dr, thres_input = t) %>%
    select(doy_input, thres_input, year, doy, CumSumValue)
}
# Starting days of year and the threshold to evaluate.
doy.range <- c(40, 50, 60, 80)
thres <- 200

# Call f once per starting doy (the single thres is reused for each call)
# and row-bind the per-call results into one data frame.
map2_df(.x = doy.range, .y = thres, .f = f)
# # A tibble: 8 x 5
# doy_input thres_input year doy CumSumValue
# <dbl> <dbl> <int> <int> <dbl>
# 1 40 200 2000 78 201.4864
# 2 40 200 2001 75 204.3171
# 3 50 200 2000 89 200.2998
# 4 50 200 2001 91 200.6544
# 5 60 200 2000 98 200.5048
# 6 60 200 2001 102 200.3770
# 7 80 200 2000 120 204.9951
# 8 80 200 2001 120 200.8252
首先生成一些示例数据:
# Example data: one uniform random value in [0, 10] per day for 2000 and 2001.
set.seed(1)
doy <- rep(seq_len(365), times = 2)
year <- rep(c(2000L, 2001L), each = 365)
value <- runif(n = 2 * 365, min = 0, max = 10)

# Starting days of year to accumulate from, and the cumulative-sum threshold.
doy.range <- c(40, 50, 60, 80)
thres <- 200

# cbind() goes through a numeric matrix, so every column ends up as double.
df <- data.frame(cbind(doy, year, value))
我想做的是:
对于 df$year == 2000,从 doy.range == 40 开始累加 df$value,并找出 df$value 的累加和 >= thres 时对应的 df$doy。
下面是我用一段较长的 for 循环实现这一目标的代码:
# Result matrix with one row per (year, starting doy) combination:
#   column 1 = year
#   column 2 = starting doy
#   column 3 = first doy at which the cumulative sum of value reaches thres
# Row counts and repetition factors are derived from the inputs so the layout
# stays correct for any number of years or starting days.
years <- unique(df$year)
mat <- matrix(NA_real_, nrow = length(doy.range) * length(years), ncol = 3)
mat[, 1] <- rep(years, each = length(doy.range))
mat[, 2] <- rep(doy.range, times = length(years))

for (i in years) {
  dat <- df[df$year == i, ]
  for (j in doy.range) {
    # Accumulate value from the starting doy onwards.
    dat1 <- dat[dat$doy >= j, ]
    dat1$cum.sum <- cumsum(dat1$value)
    # First doy whose running total is >= thres; NA when no row qualifies.
    day.thres <- dat1[dat1$cum.sum >= thres, "doy"][1]
    mat[mat[, 2] == j & mat[, 1] == i, 3] <- day.thres
  }
}
这个循环在矩阵的第三列中给出了 value 的累加和达到或超过 thres 时对应的 doy。
不过,我真的很想避免循环。有什么方法可以用更少的代码做到这一点吗?
如果我没理解错,你可以使用 dplyr。假设阈值为 200:
library(dplyr)

# For each year, accumulate value from doy 40 onwards and keep the first day
# on which the running total reaches 200. The result stays grouped by year.
df %>%
  group_by(year) %>%
  filter(doy >= 40) %>%
  mutate(CumSum = cumsum(value)) %>%
  filter(CumSum >= 200) %>%
  # slice(1) keeps the first crossing in row order; top_n(-1, CumSum) would
  # instead keep the smallest qualifying CumSum and every row tied with it.
  slice(1)
产生
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 78 2000 3.899895 201.4864
2 75 2001 9.205178 204.3171
我想这里用到的 dplyr 动词应该都不言自明;如有不清楚的地方,请告诉我。
对于不同的起始 doy,可以创建一个函数并配合 lapply 使用:
# Return, for each year, the first row at which the cumulative sum of value
# (accumulated from doy.range onwards) reaches thres.
#   doy.range: a single starting day of year
#   thres:     cumulative-sum threshold (defaults to 200)
# Reads the data frame df from the enclosing environment; the result stays
# grouped by year.
f <- function(doy.range, thres = 200) {
  df %>%
    group_by(year) %>%
    filter(doy >= doy.range) %>%
    mutate(CumSum = cumsum(value)) %>%
    filter(CumSum >= thres) %>%
    # First crossing in row order, matching the `[1]` of the loop version.
    slice(1)
}

# One result tibble per starting doy.
lapply(doy.range, f)
[[1]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 78 2000 3.899895 201.4864
2 75 2001 9.205178 204.3171
[[2]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 89 2000 2.454885 200.2998
2 91 2001 6.578281 200.6544
[[3]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 98 2000 4.100841 200.5048
2 102 2001 7.158333 200.3770
[[4]]
# A tibble: 2 x 4
# Groups: year [2]
doy year value CumSum
<dbl> <dbl> <dbl> <dbl>
1 120 2000 6.401010 204.9951
2 120 2001 5.884192 200.8252
思路是创建一个函数,根据给定的起始 doy 和阈值提取相关信息;然后把该函数应用到起始 doy 与阈值的各种组合上,得到一个包含全部相关信息的数据集:
# Example data: two years of daily values drawn uniformly from [0, 10].
set.seed(1)
doy <- rep(seq_len(365), times = 2)
year <- rep(c(2000L, 2001L), each = 365)
value <- runif(n = 2 * 365, min = 0, max = 10)
df <- data.frame(doy = doy, year = year, value = value)
library(dplyr)
library(purrr)
# Find, per year, the first day on which the running total of value
# (accumulated from day dr onwards) reaches the threshold t.
#   dr: starting day of year; rows with doy < dr are ignored
#   t:  cumulative-sum threshold
# Returns one row per year that reaches t, with columns
# doy_input, thres_input, year, doy, CumSumValue.
# Reads the data frame df from the enclosing environment.
f <- function(dr, t) {
  from_start <- filter(df, doy >= dr)

  first_hit <- from_start %>%
    group_by(year) %>%
    mutate(CumSumValue = cumsum(value)) %>%
    filter(CumSumValue >= t) %>%
    slice(1) %>%
    ungroup()

  # Record the inputs next to the result and fix the column order;
  # value is dropped because it is not selected.
  first_hit %>%
    mutate(doy_input = dr, thres_input = t) %>%
    select(doy_input, thres_input, year, doy, CumSumValue)
}
# Starting days of year and the threshold to evaluate.
doy.range <- c(40, 50, 60, 80)
thres <- 200

# Call f once per starting doy (the single thres is reused for each call)
# and row-bind the per-call results into one data frame.
map2_df(.x = doy.range, .y = thres, .f = f)
# # A tibble: 8 x 5
# doy_input thres_input year doy CumSumValue
# <dbl> <dbl> <int> <int> <dbl>
# 1 40 200 2000 78 201.4864
# 2 40 200 2001 75 204.3171
# 3 50 200 2000 89 200.2998
# 4 50 200 2001 91 200.6544
# 5 60 200 2000 98 200.5048
# 6 60 200 2001 102 200.3770
# 7 80 200 2000 120 204.9951
# 8 80 200 2001 120 200.8252