使用 R (dplyr) 在过滤器中实现 "at least" 条件
Implementing "at least" condition in a filter using R (dplyr)
这个问题与我之前的帖子有关:
这是数据:
# Example data: a data.frame with a single numeric column V1 and 73 rows.
# The question treats the row index as the time step.
# NOTE(review): looks like dput() output -- keep the literals exactly as is.
dat <- structure(list(V1 = c(-3.85326, -2.88262, -4.1405, -3.95193,
-6.68925, -2.04202, -2.47597, -4.91161, -2.5946, -2.82873, 2.68839,
-4.1287, -4.50296, -0.143476, -1.12174, -0.756168, -1.67556,
-1.92704, -1.89279, -2.37569, -5.71746, -2.7247, -4.12986, -2.29769,
-1.52835, -2.63623, -2.31461, 2.32796, 4.14354, 4.47055, -0.557311,
-0.425266, -2.37455, -5.97684, -5.22391, 0.374004, -0.986549,
2.36419, 0.218283, 2.66014, -3.44225, 3.46593, 1.3309, 0.679601,
5.42195, 10.6555, 8.34144, 1.64939, -1.64558, -0.754001, -4.77503,
-6.66197, -4.07188, -1.72996, -1.15338, -8.05588, -6.58208, 1.32375,
-3.69241, -5.23582, -4.33509, -7.43028, -3.57103, -10.4991, -8.68752,
-8.98304, -8.96825, -7.99087, -8.25109, -6.48483, -6.09004, -7.05249,
-4.78267)), class = "data.frame", row.names = c(NA, -73L))
我想要的
我想找到满足以下(修改后的)条件的第一个时间步:
[1] V1 > 0 at the time step
[2] In the succeeding FOUR time steps (including the timestep in [1]), V1 > 0 in AT LEAST THREE timesteps
[3] Accumulated value of the next FOUR timesteps (including the timestep in [1]) should be greater than 1.
目前的脚本如下:
library(dplyr)
# Asker's current attempt: number the rows as `time`, then keep the rows
# where V1 and the next two values are all > 0 and the sum of the four
# values that follow the current row (lead 1..4) is > 1.
# NOTE(review): this demands three consecutive positives, and the sum
# leaves out the current row, so it does not match conditions [2] and [3]
# as stated in the question.
newx <- dat %>% as_tibble() %>%
mutate(time = 1: n()) %>%
filter(V1 > 0, dplyr::lead(V1, 1) > 0, dplyr::lead(V1, 2) > 0,
(dplyr::lead(V1, 1) + dplyr::lead(V1, 2) + dplyr::lead(V1, 3) +
dplyr::lead(V1, 4)) > 1)
输出
> newx
# A tibble: 7 x 2
V1 time
<dbl> <int>
1 2.33 28
2 2.36 38
3 3.47 42
4 1.33 43
5 0.680 44
6 5.42 45
7 10.7 46
问题
我不知道如何正确实现第二个条件。它应该检查四个时间步中是否至少有三个满足 V1 > 0,这三个时间步是否连续并不重要。
预期输出
正确答案应该是 28。
我会很感激任何帮助。
如果我没理解错,你想要的是第一个满足条件的行,可以使用 zoo::rollsum:
library(zoo)
library(dplyr)
# Return the first row that satisfies all three conditions, using
# left-aligned rolling windows of width 4 (current row plus the next three).
# Windows that run past the end of the data are NA (fill = NA), and
# filter() drops rows whose condition is NA.
# rownames_to_column() is exported by tibble, not dplyr, so it is
# namespace-qualified to work with only zoo and dplyr attached.
dat %>%
  tibble::rownames_to_column() %>%
  filter(
    V1 > 0 &                                                 # condition [1]
      rollsum(V1 > 0, 4, fill = NA, align = "left") >= 3 &   # condition [2]
      rollsum(V1, 4, fill = NA, align = "left") > 1          # condition [3]
  ) %>%
  slice(1)
rowname V1
1 28 2.32796
使用 stats::filter 进行滚动求和:
# Index of the first time step that meets all three conditions (base R only).
# stats::filter(x, rep(1, 4), sides = 1) sums each value with the three
# values before it; reversing the input and then the result turns that into
# a forward-looking window (current value plus the next three).
# The rolling sum must be compared with 1 explicitly: a bare numeric inside
# `&` is coerced to logical and would only test "sum is non-zero".
which(
  (dat$V1 > 0) &
    (rev(stats::filter(rev(dat$V1 > 0), rep(1, 4), sides = 1)) >= 3) &
    (rev(stats::filter(rev(dat$V1), rep(1, 4), sides = 1)) > 1)
)[1]
#[1] 28
或者,如果必须整合到 dplyr 的管道中:
# Same rolling-window test as the base-R version, wrapped in dplyr::slice()
# so the matching row itself is returned.
# rev(stats::filter(rev(x), rep(1, 4), sides = 1)) is the sum of the current
# value and the next three. The sum is compared with 1 explicitly; without
# the comparison it would be coerced to logical and only test for non-zero.
dat %>%
  slice(
    which(
      (V1 > 0) &
        (rev(stats::filter(rev(V1 > 0), rep(1, 4), sides = 1)) >= 3) &
        (rev(stats::filter(rev(V1), rep(1, 4), sides = 1)) > 1)
    )[1]
  )
## A tibble: 1 x 1
# V1
# <dbl>
#1 2.33
另一种代码稍多一些的写法:
library(dplyr)
# First row meeting all three conditions, computed from cumulative sums:
# a 4-row window total (current row plus the next three) equals the
# cumulative sum three rows ahead minus the cumulative sum one row back.
# lag() is given default = 0 so that a window starting at row 1 is computed
# instead of being NA (nothing precedes row 1, so the prior total is 0).
dat2 <- dat %>%
  tibble::rowid_to_column() %>%
  mutate(
    gtz = (V1 > 0) * 1,                 # 1 where V1 > 0, else 0
    gtz_cuml = cumsum(gtz),
    # number of positive values in the 4-row window
    gtz_next_three = lead(gtz_cuml, 3) - lag(gtz_cuml, default = 0),
    cuml_V1 = cumsum(V1),
    # sum of V1 over the 4-row window
    V1_next_three = lead(cuml_V1, 3) - lag(cuml_V1, default = 0)
  ) %>%
  filter(
    gtz > 0,               # condition [1]
    gtz_next_three >= 3,   # condition [2]
    V1_next_three > 1      # condition [3]
  ) %>%
  slice(1)
#> dat2
# rowid V1 gtz gtz_cuml gtz_next_three cuml_V1 V1_next_three
#1 28 2.32796 1 2 3 -71.22716 9.959473
这个问题与我之前的帖子有关:
这是数据:
# Example data: a data.frame with a single numeric column V1 and 73 rows.
# The question treats the row index as the time step.
# NOTE(review): looks like dput() output -- keep the literals exactly as is.
dat <- structure(list(V1 = c(-3.85326, -2.88262, -4.1405, -3.95193,
-6.68925, -2.04202, -2.47597, -4.91161, -2.5946, -2.82873, 2.68839,
-4.1287, -4.50296, -0.143476, -1.12174, -0.756168, -1.67556,
-1.92704, -1.89279, -2.37569, -5.71746, -2.7247, -4.12986, -2.29769,
-1.52835, -2.63623, -2.31461, 2.32796, 4.14354, 4.47055, -0.557311,
-0.425266, -2.37455, -5.97684, -5.22391, 0.374004, -0.986549,
2.36419, 0.218283, 2.66014, -3.44225, 3.46593, 1.3309, 0.679601,
5.42195, 10.6555, 8.34144, 1.64939, -1.64558, -0.754001, -4.77503,
-6.66197, -4.07188, -1.72996, -1.15338, -8.05588, -6.58208, 1.32375,
-3.69241, -5.23582, -4.33509, -7.43028, -3.57103, -10.4991, -8.68752,
-8.98304, -8.96825, -7.99087, -8.25109, -6.48483, -6.09004, -7.05249,
-4.78267)), class = "data.frame", row.names = c(NA, -73L))
我想要的
我想找到满足以下(修改后的)条件的第一个时间步:
[1] V1 > 0 at the time step
[2] In the succeeding FOUR time steps (including the timestep in [1]), V1 > 0 in AT LEAST THREE timesteps
[3] Accumulated value of the next FOUR timesteps (including the timestep in [1]) should be greater than 1.
目前的脚本如下:
library(dplyr)
# Asker's current attempt: number the rows as `time`, then keep the rows
# where V1 and the next two values are all > 0 and the sum of the four
# values that follow the current row (lead 1..4) is > 1.
# NOTE(review): this demands three consecutive positives, and the sum
# leaves out the current row, so it does not match conditions [2] and [3]
# as stated in the question.
newx <- dat %>% as_tibble() %>%
mutate(time = 1: n()) %>%
filter(V1 > 0, dplyr::lead(V1, 1) > 0, dplyr::lead(V1, 2) > 0,
(dplyr::lead(V1, 1) + dplyr::lead(V1, 2) + dplyr::lead(V1, 3) +
dplyr::lead(V1, 4)) > 1)
输出
> newx
# A tibble: 7 x 2
V1 time
<dbl> <int>
1 2.33 28
2 2.36 38
3 3.47 42
4 1.33 43
5 0.680 44
6 5.42 45
7 10.7 46
问题
我不知道如何正确实现第二个条件。它应该检查四个时间步中是否至少有三个满足 V1 > 0,这三个时间步是否连续并不重要。
预期输出
正确答案应该是 28。
我会很感激任何帮助。
如果我没理解错,你想要的是第一个满足条件的行,可以使用 zoo::rollsum:
library(zoo)
library(dplyr)
# Return the first row that satisfies all three conditions, using
# left-aligned rolling windows of width 4 (current row plus the next three).
# Windows that run past the end of the data are NA (fill = NA), and
# filter() drops rows whose condition is NA.
# rownames_to_column() is exported by tibble, not dplyr, so it is
# namespace-qualified to work with only zoo and dplyr attached.
dat %>%
  tibble::rownames_to_column() %>%
  filter(
    V1 > 0 &                                                 # condition [1]
      rollsum(V1 > 0, 4, fill = NA, align = "left") >= 3 &   # condition [2]
      rollsum(V1, 4, fill = NA, align = "left") > 1          # condition [3]
  ) %>%
  slice(1)
rowname V1
1 28 2.32796
使用 stats::filter 进行滚动求和:
# Index of the first time step that meets all three conditions (base R only).
# stats::filter(x, rep(1, 4), sides = 1) sums each value with the three
# values before it; reversing the input and then the result turns that into
# a forward-looking window (current value plus the next three).
# The rolling sum must be compared with 1 explicitly: a bare numeric inside
# `&` is coerced to logical and would only test "sum is non-zero".
which(
  (dat$V1 > 0) &
    (rev(stats::filter(rev(dat$V1 > 0), rep(1, 4), sides = 1)) >= 3) &
    (rev(stats::filter(rev(dat$V1), rep(1, 4), sides = 1)) > 1)
)[1]
#[1] 28
或者,如果必须整合到 dplyr 的管道中:
# Same rolling-window test as the base-R version, wrapped in dplyr::slice()
# so the matching row itself is returned.
# rev(stats::filter(rev(x), rep(1, 4), sides = 1)) is the sum of the current
# value and the next three. The sum is compared with 1 explicitly; without
# the comparison it would be coerced to logical and only test for non-zero.
dat %>%
  slice(
    which(
      (V1 > 0) &
        (rev(stats::filter(rev(V1 > 0), rep(1, 4), sides = 1)) >= 3) &
        (rev(stats::filter(rev(V1), rep(1, 4), sides = 1)) > 1)
    )[1]
  )
## A tibble: 1 x 1
# V1
# <dbl>
#1 2.33
另一种代码稍多一些的写法:
library(dplyr)
# First row meeting all three conditions, computed from cumulative sums:
# a 4-row window total (current row plus the next three) equals the
# cumulative sum three rows ahead minus the cumulative sum one row back.
# lag() is given default = 0 so that a window starting at row 1 is computed
# instead of being NA (nothing precedes row 1, so the prior total is 0).
dat2 <- dat %>%
  tibble::rowid_to_column() %>%
  mutate(
    gtz = (V1 > 0) * 1,                 # 1 where V1 > 0, else 0
    gtz_cuml = cumsum(gtz),
    # number of positive values in the 4-row window
    gtz_next_three = lead(gtz_cuml, 3) - lag(gtz_cuml, default = 0),
    cuml_V1 = cumsum(V1),
    # sum of V1 over the 4-row window
    V1_next_three = lead(cuml_V1, 3) - lag(cuml_V1, default = 0)
  ) %>%
  filter(
    gtz > 0,               # condition [1]
    gtz_next_three >= 3,   # condition [2]
    V1_next_three > 1      # condition [3]
  ) %>%
  slice(1)
#> dat2
# rowid V1 gtz gtz_cuml gtz_next_three cuml_V1 V1_next_three
#1 28 2.32796 1 2 3 -71.22716 9.959473