R 中有条件的 Cumsum
Cumsum with conditions in R
# Reproducible example data: 3 locations x 3 years x 5 weeks (45 rows), with
# cumsum.val drawn uniformly between -2 and 4.
set.seed(123)
df <- data.frame(
  loc.id = rep(1:3, each = 15),
  year = rep(1981:1983, each = 5, times = 3),
  week = rep(20:24, times = 9),
  cumsum.val = runif(n = 45, min = -2, max = 4)
)
数据包含 3 个位置(locations)× 3 个年份(years)× 5 个星期(weeks),以及一个名为 cumsum.val 的值。
对于每个位置和年份,我想找到 cumsum.val > 1 所在的星期。
然后,如果出现连续两周 cumsum.val > 1,则选择其中的第一周。下面是一个例子:
# Worked example: the rows of location 1 in 1981, plus a logical flag marking
# the weeks where cumsum.val is above 1.
test <- df[with(df, loc.id == 1 & year == 1981), ]
test[["cumsum.test"]] <- test[["cumsum.val"]] > 1
head(test)
loc.id year week cumsum.val cumsum.test
1 1 1981 20 -0.2745349 FALSE
2 1 1981 21 2.7298308 TRUE
3 1 1981 22 0.4538615 FALSE
4 1 1981 23 3.2981044 TRUE
5 1 1981 24 3.6428037 TRUE
现在要选择连续两次出现 TRUE 的第一周,在上面的例子中是第 23 周(因为第 23 周和第 24 周都是 TRUE)。
如何对整个 df 实现此功能?也可能存在 cumsum.val > 1 没有连续出现两周的情况。
在这种情况下,只需选择第一个 cumsum.val > 1 的星期。
# Reproducible example data: 3 locations x 3 years x 5 weeks (45 rows), with
# cumsum.val drawn uniformly between -2 and 4.
set.seed(123)
df <- data.frame(
  loc.id = rep(1:3, each = 3 * 5),
  year = rep(rep(1981:1983, each = 5), times = 3),
  week = rep(rep(20:24, times = 3), times = 3),
  cumsum.val = runif(n = 5 * 3 * 3, min = -2, max = 4)
)

# View() opens a GUI data viewer, so only call it in interactive sessions.
if (interactive()) {
  View(df)
}

# Return the week to select for ONE location/year group.
#
# week      : week numbers of the group.
# value     : values aligned with `week` (here cumsum.val).
# threshold : a week qualifies when value > threshold (default 1).
#
# Returns the first week of the earliest pair of consecutive qualifying weeks;
# if no such pair exists, the first qualifying week; if no week qualifies, NA.
first_exceed_week <- function(week, value, threshold = 1) {
  # Work in week order so that "consecutive" does not depend on row order.
  ord <- order(week)
  week <- week[ord]
  value <- value[ord]
  above <- !is.na(value) & value > threshold

  # A row starts a pair when it and the next row both qualify and the next row
  # is exactly one week later; the last row can never start a pair.
  next_above <- c(above[-1L], FALSE)
  next_adjacent <- c(diff(week) == 1, FALSE)
  candidates <- which(above & next_above & next_adjacent)

  # Fallback: no consecutive pair, so take the first qualifying week.
  if (length(candidates) == 0L) {
    candidates <- which(above)
  }

  # Indexing with NA yields an NA of the same type as `week`.
  if (length(candidates) == 0L) week[NA_integer_] else week[candidates[1L]]
}

# One result row per (loc.id, year) group. The run of weeks is evaluated inside
# each group, so it never spans a year or location boundary.
data <- do.call(rbind, lapply(
  split(df, list(df$loc.id, df$year), drop = TRUE),
  function(g) {
    data.frame(
      week1 = first_exceed_week(g$week, g$cumsum.val),
      idd = g$loc.id[1L],
      year = g$year[1L]
    )
  }
))

# Drop groups where no week exceeds the threshold, then tidy the row order.
data <- data[!is.na(data$week1), ]
data <- data[order(data$idd, data$year), ]
rownames(data) <- NULL
一个基于 dplyr 的解决方案可以解决问题。请注意,cumsum.test 已被计算为 numeric,
因此 lag 和 lead 的 default 值可以使用 0/1 以外的值。
# Within each loc.id/year group, keep the first row of every run of at least
# two consecutive weeks with cumsum.val > 1. The -1 defaults for lead()/lag()
# never equal a 0/1 flag, so a group's last row cannot start a run and its
# first row always counts as a run start.
# Note: groups without such a run contribute no rows, and a group with several
# runs contributes one row per run. The result stays grouped.
df %>%
  mutate(cumsum.test = as.numeric(cumsum.val > 1)) %>%
  group_by(loc.id, year) %>%
  filter(
    cumsum.test == 1,
    lead(cumsum.test, default = -1L) == 1,
    lag(cumsum.test, default = -1L) != 1
  )
# # Groups: loc.id, year [6]
# loc.id year week cumsum.val cumsum.test
# <int> <int> <int> <dbl> <dbl>
# 1 1 1981 23 3.30 1.00
# 2 1 1982 21 1.17 1.00
# 3 1 1983 22 2.07 1.00
# 4 2 1982 20 3.34 1.00
# 5 2 1983 20 2.25 1.00
# 6 3 1981 20 3.78 1.00
一种 data.table 方法:
library(data.table) # load package; unlike require(), this errors if missing
setDT(df) # convert to data.table by reference
df[, cumsum.test := cumsum.val > 1] # flag weeks where cumsum.val > 1

# For each loc.id/year group, pick the row that starts the first pair of
# consecutive flagged weeks; if the group has no such pair, fall back to its
# first flagged week. Groups with no flagged week return no row.
# Assumes rows are ordered by week within each group -- TODO confirm for
# other inputs.
df[
  df[, {
    above <- !is.na(cumsum.test) & cumsum.test
    # shift(type = "lead") looks one row ahead inside the group only, so a
    # pair never spans a group boundary.
    paired <- which(above & shift(above, type = "lead", fill = FALSE))
    # .I holds the row numbers in df; indexing with NA gives NA when the group
    # has no flagged week at all.
    .(row_id = .I[c(paired, which(above))[1L]])
  }, by = .(loc.id, year)][!is.na(row_id), row_id]
]
# Reproducible example data: 3 locations x 3 years x 5 weeks (45 rows), with
# cumsum.val drawn uniformly between -2 and 4.
set.seed(123)
df <- data.frame(
  loc.id = rep(1:3, each = 15),
  year = rep(1981:1983, each = 5, times = 3),
  week = rep(20:24, times = 9),
  cumsum.val = runif(n = 45, min = -2, max = 4)
)
数据包含 3 个位置(locations)× 3 个年份(years)× 5 个星期(weeks),以及一个名为 cumsum.val 的值。
对于每个位置和年份,我想找到 cumsum.val > 1 所在的星期。
然后,如果出现连续两周 cumsum.val > 1,则选择其中的第一周。下面是一个例子:
# Worked example: the rows of location 1 in 1981, plus a logical flag marking
# the weeks where cumsum.val is above 1.
test <- df[with(df, loc.id == 1 & year == 1981), ]
test[["cumsum.test"]] <- test[["cumsum.val"]] > 1
head(test)
loc.id year week cumsum.val cumsum.test
1 1 1981 20 -0.2745349 FALSE
2 1 1981 21 2.7298308 TRUE
3 1 1981 22 0.4538615 FALSE
4 1 1981 23 3.2981044 TRUE
5 1 1981 24 3.6428037 TRUE
现在要选择连续两次出现 TRUE 的第一周,在上面的例子中是第 23 周(因为第 23 周和第 24 周都是 TRUE)。
如何对整个 df 实现此功能?也可能存在 cumsum.val > 1 没有连续出现两周的情况。
在这种情况下,只需选择第一个 cumsum.val > 1 的星期。
# Reproducible example data: 3 locations x 3 years x 5 weeks (45 rows), with
# cumsum.val drawn uniformly between -2 and 4.
set.seed(123)
df <- data.frame(
  loc.id = rep(1:3, each = 3 * 5),
  year = rep(rep(1981:1983, each = 5), times = 3),
  week = rep(rep(20:24, times = 3), times = 3),
  cumsum.val = runif(n = 5 * 3 * 3, min = -2, max = 4)
)

# View() opens a GUI data viewer, so only call it in interactive sessions.
if (interactive()) {
  View(df)
}

# Return the week to select for ONE location/year group.
#
# week      : week numbers of the group.
# value     : values aligned with `week` (here cumsum.val).
# threshold : a week qualifies when value > threshold (default 1).
#
# Returns the first week of the earliest pair of consecutive qualifying weeks;
# if no such pair exists, the first qualifying week; if no week qualifies, NA.
first_exceed_week <- function(week, value, threshold = 1) {
  # Work in week order so that "consecutive" does not depend on row order.
  ord <- order(week)
  week <- week[ord]
  value <- value[ord]
  above <- !is.na(value) & value > threshold

  # A row starts a pair when it and the next row both qualify and the next row
  # is exactly one week later; the last row can never start a pair.
  next_above <- c(above[-1L], FALSE)
  next_adjacent <- c(diff(week) == 1, FALSE)
  candidates <- which(above & next_above & next_adjacent)

  # Fallback: no consecutive pair, so take the first qualifying week.
  if (length(candidates) == 0L) {
    candidates <- which(above)
  }

  # Indexing with NA yields an NA of the same type as `week`.
  if (length(candidates) == 0L) week[NA_integer_] else week[candidates[1L]]
}

# One result row per (loc.id, year) group. The run of weeks is evaluated inside
# each group, so it never spans a year or location boundary.
data <- do.call(rbind, lapply(
  split(df, list(df$loc.id, df$year), drop = TRUE),
  function(g) {
    data.frame(
      week1 = first_exceed_week(g$week, g$cumsum.val),
      idd = g$loc.id[1L],
      year = g$year[1L]
    )
  }
))

# Drop groups where no week exceeds the threshold, then tidy the row order.
data <- data[!is.na(data$week1), ]
data <- data[order(data$idd, data$year), ]
rownames(data) <- NULL
一个基于 dplyr 的解决方案可以解决问题。请注意,cumsum.test 已被计算为 numeric,
因此 lag 和 lead 的 default 值可以使用 0/1 以外的值。
# Within each loc.id/year group, keep the first row of every run of at least
# two consecutive weeks with cumsum.val > 1. The -1 defaults for lead()/lag()
# never equal a 0/1 flag, so a group's last row cannot start a run and its
# first row always counts as a run start.
# Note: groups without such a run contribute no rows, and a group with several
# runs contributes one row per run. The result stays grouped.
df %>%
  mutate(cumsum.test = as.numeric(cumsum.val > 1)) %>%
  group_by(loc.id, year) %>%
  filter(
    cumsum.test == 1,
    lead(cumsum.test, default = -1L) == 1,
    lag(cumsum.test, default = -1L) != 1
  )
# # Groups: loc.id, year [6]
# loc.id year week cumsum.val cumsum.test
# <int> <int> <int> <dbl> <dbl>
# 1 1 1981 23 3.30 1.00
# 2 1 1982 21 1.17 1.00
# 3 1 1983 22 2.07 1.00
# 4 2 1982 20 3.34 1.00
# 5 2 1983 20 2.25 1.00
# 6 3 1981 20 3.78 1.00
一种 data.table 方法:
library(data.table) # load package; unlike require(), this errors if missing
setDT(df) # convert to data.table by reference
df[, cumsum.test := cumsum.val > 1] # flag weeks where cumsum.val > 1

# For each loc.id/year group, pick the row that starts the first pair of
# consecutive flagged weeks; if the group has no such pair, fall back to its
# first flagged week. Groups with no flagged week return no row.
# Assumes rows are ordered by week within each group -- TODO confirm for
# other inputs.
df[
  df[, {
    above <- !is.na(cumsum.test) & cumsum.test
    # shift(type = "lead") looks one row ahead inside the group only, so a
    # pair never spans a group boundary.
    paired <- which(above & shift(above, type = "lead", fill = FALSE))
    # .I holds the row numbers in df; indexing with NA gives NA when the group
    # has no flagged week at all.
    .(row_id = .I[c(paired, which(above))[1L]])
  }, by = .(loc.id, year)][!is.na(row_id), row_id]
]