使用 accumulate 识别 R 中的连续重复项
Identify consecutive duplicates in R using accumulate
让我分享一个我正在尝试做的事情的例子,因为标题可能不像我希望的那样清晰。
data <- tibble(week=1:10,name=c(rep("Joe",10)),value=c(.9,.89,.99,.98,.87,.89,.93,.92,.98,.9),
wanted = c("Yes","Skip","No","No","Yes","Skip","Yes","Skip","No","Yes"))
data <- data %>% mutate(my_attempt = case_when( week-lag(week)==1 &
value < .95 &
lag(value) < .95 &
lag(value,2) >= .95 &
!is.na(lag(value,2))~ "Skip",
week-lag(week)==1 &
value < .95 &
lag(value) < .95 &
is.na(lag(value,2))~ "Skip",
value < .95 ~"Yes",
TRUE ~ "No"))
# week name value wanted my_attempt
# <int> <chr> <dbl> <chr> <chr>
# 1 Joe 0.9 Yes Yes
# 2 Joe 0.89 Skip Skip
# 3 Joe 0.99 No No
# 4 Joe 0.98 No No
# 5 Joe 0.87 Yes Yes
# 6 Joe 0.89 Skip Skip
# 7 Joe 0.93 Yes Yes
# 8 Joe 0.92 Skip Yes
# 9 Joe 0.98 No No
# 10 Joe 0.9 Yes Yes
我正在尝试获取 my_attempt 列以生成所需列的结果。我想在值小于某个阈值时识别行,但不能有两个连续的“是”值。我对它的尝试一直有效,直到它连续看到 4 个或更多低值。在我的真实数据中,可能会缺少几周,但这可以视为“否”。例如,如果缺少第 6 周,第 7 周仍然可以设置为“是”(我认为在我的案例中第一行会处理这个问题)。有没有办法在 R 中做到这一点?它不必与 dplyr 一致,但如果它可以在 tidyverse 中实现,那就太好了。
我想你可以在这里使用 purrr:accumulate()
library(purrr)
library(dplyr)
data%>%mutate(my_attempt = ifelse(week-lag(week, default = 0)==1 &
value < .95,
'Yes', 'No')%>%
accumulate(~ifelse(.x==.y & .y=='Yes', 'Skip', .y)))
# A tibble: 10 x 5
week name value wanted my_attempt
<int> <chr> <dbl> <chr> <chr>
1 1 Joe 0.9 Yes Yes
2 2 Joe 0.89 Skip Skip
3 3 Joe 0.99 No No
4 4 Joe 0.98 No No
5 5 Joe 0.87 Yes Yes
6 6 Joe 0.89 Skip Skip
7 7 Joe 0.93 Yes Yes
8 8 Joe 0.92 Skip Skip
9 9 Joe 0.98 No No
10 10 Joe 0.9 Yes Yes
这是一个简单的 dplyr
解决方案:
library(dplyr)
data %>%
mutate(grp = cummax(week - lag(week, default = 0))) %>%
group_by(name, grp) %>%
mutate(my_attempt = ifelse(value < 0.95 & lag(value, default = 1) < 0.95, "Skip",
ifelse(value < 0.95 & lag(value, default = 1) >= 0.95,
"Yes", "No")))
# A tibble: 9 x 6
# Groups: name, grp [2]
week name value wanted grp my_attempt
<int> <chr> <dbl> <chr> <dbl> <chr>
1 1 Joe 0.9 Yes 1 Yes
2 2 Joe 0.89 Skip 1 Skip
3 3 Joe 0.99 No 1 No
4 4 Joe 0.98 No 1 No
5 5 Joe 0.87 Yes 1 Yes
6 7 Joe 0.93 Yes 2 Yes
7 8 Joe 0.92 Skip 2 Skip
8 9 Joe 0.98 No 2 No
9 10 Joe 0.9 Yes 2 Yes
下面是如何使用 base::Reduce
对缺少周值的数据集执行此操作。我首先创建了一个分组 grp
基于周值之间的差异,然后 split
基于分组变量的数据集。之后我将我们的函数应用于每个块并将结果与 rbind
:
绑定
do.call(rbind, lapply(split(data, cummax(abs(data$week - c(0, data$week[-nrow(data)]))), data$name),
\(x){
x$my_attept <- Reduce(function(a, b) {
if(x$value[b] < 0.95 & a != "Yes") {
"Yes"
} else if(x$value[b] < 0.95 & a == "Yes") {
"Skip"
} else {
"No"
}
}, 2:nrow(x), init = ifelse(x$value[1] < 0.95, "Yes", "No"), accumulate = TRUE)
x
}))
# A tibble: 9 x 5
week name value wanted my_attept
* <int> <chr> <dbl> <chr> <chr>
1 1 Joe 0.9 Yes Yes
2 2 Joe 0.89 Skip Skip
3 3 Joe 0.99 No No
4 4 Joe 0.98 No No
5 5 Joe 0.87 Yes Yes
6 7 Joe 0.93 Yes Yes
7 8 Joe 0.92 Skip Skip
8 9 Joe 0.98 No No
9 10 Joe 0.9 Yes Yes
如果您的数据中缺少周数,例如此处修改后的数据集,您可以使用以下解决方案。我们首先根据周的连续值对周进行分组,然后将我们的解决方案应用于每一组:
data %>%
mutate(grp = cummax(week - lag(week, default = 0))) %>%
group_by(name, grp) %>%
mutate(my_attept = accumulate(value[-1], .init = ifelse(value[1] < 0.95, "Yes", "No"),
~ if(.y < 0.95 & .x != "Yes") {
"Yes"
} else if(.y < 0.95 & .x == "Yes") {
"Skip"
} else {
"No"
}))
# A tibble: 9 x 6
# Groups: grp [2]
week name value wanted grp my_attept
<int> <chr> <dbl> <chr> <dbl> <chr>
1 1 Joe 0.9 Yes 1 Yes
2 2 Joe 0.89 Skip 1 Skip
3 3 Joe 0.99 No 1 No
4 4 Joe 0.98 No 1 No
5 5 Joe 0.87 Yes 1 Yes
6 7 Joe 0.93 Yes 2 Yes
7 8 Joe 0.92 Skip 2 Skip
8 9 Joe 0.98 No 2 No
9 10 Joe 0.9 Yes 2 Yes
数据
structure(list(week = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L),
name = c("Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe",
"Joe", "Joe"), value = c(0.9, 0.89, 0.99, 0.98, 0.87, 0.93,
0.92, 0.98, 0.9), wanted = c("Yes", "Skip", "No", "No", "Yes",
"Yes", "Skip", "No", "Yes")), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
我会用像 slider
这样的滚动计算库来完成它,其中丢失的数据可以很好地索引。向您展示修改后的数据
library(tidyverse)
data <- tibble(week=c(1:5, 7:10),name=c(rep("Joe",9)),value=c(.9,.89,.99,.98,.87,.93,.92,.98,.9),
wanted = c("Yes","Skip","No","No","Yes","Yes","Skip","No","Yes"))
data
#> # A tibble: 9 x 4
#> week name value wanted
#> <int> <chr> <dbl> <chr>
#> 1 1 Joe 0.9 Yes
#> 2 2 Joe 0.89 Skip
#> 3 3 Joe 0.99 No
#> 4 4 Joe 0.98 No
#> 5 5 Joe 0.87 Yes
#> 6 7 Joe 0.93 Yes
#> 7 8 Joe 0.92 Skip
#> 8 9 Joe 0.98 No
#> 9 10 Joe 0.9 Yes
library(slider)
data %>% group_by(name) %>%
mutate(wanted2 = case_when(value < 0.95 & slide_index_lgl(.x = value,
.i = week,
.f = ~ any(.x < 0.95),
.before = 1,
.after = -1) ~ 'skip',
value < 0.95 ~ 'yes',
TRUE ~ 'no'))
#> # A tibble: 9 x 5
#> # Groups: name [1]
#> week name value wanted wanted2
#> <int> <chr> <dbl> <chr> <chr>
#> 1 1 Joe 0.9 Yes yes
#> 2 2 Joe 0.89 Skip skip
#> 3 3 Joe 0.99 No no
#> 4 4 Joe 0.98 No no
#> 5 5 Joe 0.87 Yes yes
#> 6 7 Joe 0.93 Yes yes
#> 7 8 Joe 0.92 Skip skip
#> 8 9 Joe 0.98 No no
#> 9 10 Joe 0.9 Yes yes
甚至可以在不使用 slider
的情况下完成,即仅在 dplyr
中
library(dplyr)
data %>% group_by(name) %>%
mutate(wanted2 = case_when(value < 0.95 & lag(value, default = 1) < 0.95 & week - 1 == lag(week, default = 0) ~ 'Skip',
value < 0.95 ~ 'Yes',
TRUE ~ 'No'))
#> # A tibble: 9 x 5
#> # Groups: name [1]
#> week name value wanted wanted2
#> <int> <chr> <dbl> <chr> <chr>
#> 1 1 Joe 0.9 Yes Yes
#> 2 2 Joe 0.89 Skip Skip
#> 3 3 Joe 0.99 No No
#> 4 4 Joe 0.98 No No
#> 5 5 Joe 0.87 Yes Yes
#> 6 7 Joe 0.93 Yes Yes
#> 7 8 Joe 0.92 Skip Skip
#> 8 9 Joe 0.98 No No
#> 9 10 Joe 0.9 Yes Yes
由 reprex package (v2.0.0)
于 2021-07-25 创建
让我分享一个我正在尝试做的事情的例子,因为标题可能不像我希望的那样清晰。
data <- tibble(week=1:10,name=c(rep("Joe",10)),value=c(.9,.89,.99,.98,.87,.89,.93,.92,.98,.9),
wanted = c("Yes","Skip","No","No","Yes","Skip","Yes","Skip","No","Yes"))
data <- data %>% mutate(my_attempt = case_when( week-lag(week)==1 &
value < .95 &
lag(value) < .95 &
lag(value,2) >= .95 &
!is.na(lag(value,2))~ "Skip",
week-lag(week)==1 &
value < .95 &
lag(value) < .95 &
is.na(lag(value,2))~ "Skip",
value < .95 ~"Yes",
TRUE ~ "No"))
# week name value wanted my_attempt
# <int> <chr> <dbl> <chr> <chr>
# 1 Joe 0.9 Yes Yes
# 2 Joe 0.89 Skip Skip
# 3 Joe 0.99 No No
# 4 Joe 0.98 No No
# 5 Joe 0.87 Yes Yes
# 6 Joe 0.89 Skip Skip
# 7 Joe 0.93 Yes Yes
# 8 Joe 0.92 Skip Yes
# 9 Joe 0.98 No No
# 10 Joe 0.9 Yes Yes
我正在尝试获取 my_attempt 列以生成所需列的结果。我想在值小于某个阈值时识别行,但不能有两个连续的“是”值。我对它的尝试一直有效,直到它连续看到 4 个或更多低值。在我的真实数据中,可能会缺少几周,但这可以视为“否”。例如,如果缺少第 6 周,第 7 周仍然可以设置为“是”(我认为在我的案例中第一行会处理这个问题)。有没有办法在 R 中做到这一点?它不必与 dplyr 一致,但如果它可以在 tidyverse 中实现,那就太好了。
我想你可以在这里使用 purrr:accumulate()
library(purrr)
library(dplyr)
data%>%mutate(my_attempt = ifelse(week-lag(week, default = 0)==1 &
value < .95,
'Yes', 'No')%>%
accumulate(~ifelse(.x==.y & .y=='Yes', 'Skip', .y)))
# A tibble: 10 x 5
week name value wanted my_attempt
<int> <chr> <dbl> <chr> <chr>
1 1 Joe 0.9 Yes Yes
2 2 Joe 0.89 Skip Skip
3 3 Joe 0.99 No No
4 4 Joe 0.98 No No
5 5 Joe 0.87 Yes Yes
6 6 Joe 0.89 Skip Skip
7 7 Joe 0.93 Yes Yes
8 8 Joe 0.92 Skip Skip
9 9 Joe 0.98 No No
10 10 Joe 0.9 Yes Yes
这是一个简单的 dplyr
解决方案:
library(dplyr)
data %>%
mutate(grp = cummax(week - lag(week, default = 0))) %>%
group_by(name, grp) %>%
mutate(my_attempt = ifelse(value < 0.95 & lag(value, default = 1) < 0.95, "Skip",
ifelse(value < 0.95 & lag(value, default = 1) >= 0.95,
"Yes", "No")))
# A tibble: 9 x 6
# Groups: name, grp [2]
week name value wanted grp my_attempt
<int> <chr> <dbl> <chr> <dbl> <chr>
1 1 Joe 0.9 Yes 1 Yes
2 2 Joe 0.89 Skip 1 Skip
3 3 Joe 0.99 No 1 No
4 4 Joe 0.98 No 1 No
5 5 Joe 0.87 Yes 1 Yes
6 7 Joe 0.93 Yes 2 Yes
7 8 Joe 0.92 Skip 2 Skip
8 9 Joe 0.98 No 2 No
9 10 Joe 0.9 Yes 2 Yes
下面是如何使用 base::Reduce
对缺少周值的数据集执行此操作。我首先创建了一个分组 grp
基于周值之间的差异,然后 split
基于分组变量的数据集。之后我将我们的函数应用于每个块并将结果与 rbind
:
do.call(rbind, lapply(split(data, cummax(abs(data$week - c(0, data$week[-nrow(data)]))), data$name),
\(x){
x$my_attept <- Reduce(function(a, b) {
if(x$value[b] < 0.95 & a != "Yes") {
"Yes"
} else if(x$value[b] < 0.95 & a == "Yes") {
"Skip"
} else {
"No"
}
}, 2:nrow(x), init = ifelse(x$value[1] < 0.95, "Yes", "No"), accumulate = TRUE)
x
}))
# A tibble: 9 x 5
week name value wanted my_attept
* <int> <chr> <dbl> <chr> <chr>
1 1 Joe 0.9 Yes Yes
2 2 Joe 0.89 Skip Skip
3 3 Joe 0.99 No No
4 4 Joe 0.98 No No
5 5 Joe 0.87 Yes Yes
6 7 Joe 0.93 Yes Yes
7 8 Joe 0.92 Skip Skip
8 9 Joe 0.98 No No
9 10 Joe 0.9 Yes Yes
如果您的数据中缺少周数,例如此处修改后的数据集,您可以使用以下解决方案。我们首先根据周的连续值对周进行分组,然后将我们的解决方案应用于每一组:
data %>%
mutate(grp = cummax(week - lag(week, default = 0))) %>%
group_by(name, grp) %>%
mutate(my_attept = accumulate(value[-1], .init = ifelse(value[1] < 0.95, "Yes", "No"),
~ if(.y < 0.95 & .x != "Yes") {
"Yes"
} else if(.y < 0.95 & .x == "Yes") {
"Skip"
} else {
"No"
}))
# A tibble: 9 x 6
# Groups: grp [2]
week name value wanted grp my_attept
<int> <chr> <dbl> <chr> <dbl> <chr>
1 1 Joe 0.9 Yes 1 Yes
2 2 Joe 0.89 Skip 1 Skip
3 3 Joe 0.99 No 1 No
4 4 Joe 0.98 No 1 No
5 5 Joe 0.87 Yes 1 Yes
6 7 Joe 0.93 Yes 2 Yes
7 8 Joe 0.92 Skip 2 Skip
8 9 Joe 0.98 No 2 No
9 10 Joe 0.9 Yes 2 Yes
数据
structure(list(week = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L),
name = c("Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe",
"Joe", "Joe"), value = c(0.9, 0.89, 0.99, 0.98, 0.87, 0.93,
0.92, 0.98, 0.9), wanted = c("Yes", "Skip", "No", "No", "Yes",
"Yes", "Skip", "No", "Yes")), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
我会用像 slider
这样的滚动计算库来完成它,其中丢失的数据可以很好地索引。向您展示修改后的数据
library(tidyverse)
data <- tibble(week=c(1:5, 7:10),name=c(rep("Joe",9)),value=c(.9,.89,.99,.98,.87,.93,.92,.98,.9),
wanted = c("Yes","Skip","No","No","Yes","Yes","Skip","No","Yes"))
data
#> # A tibble: 9 x 4
#> week name value wanted
#> <int> <chr> <dbl> <chr>
#> 1 1 Joe 0.9 Yes
#> 2 2 Joe 0.89 Skip
#> 3 3 Joe 0.99 No
#> 4 4 Joe 0.98 No
#> 5 5 Joe 0.87 Yes
#> 6 7 Joe 0.93 Yes
#> 7 8 Joe 0.92 Skip
#> 8 9 Joe 0.98 No
#> 9 10 Joe 0.9 Yes
library(slider)
data %>% group_by(name) %>%
mutate(wanted2 = case_when(value < 0.95 & slide_index_lgl(.x = value,
.i = week,
.f = ~ any(.x < 0.95),
.before = 1,
.after = -1) ~ 'skip',
value < 0.95 ~ 'yes',
TRUE ~ 'no'))
#> # A tibble: 9 x 5
#> # Groups: name [1]
#> week name value wanted wanted2
#> <int> <chr> <dbl> <chr> <chr>
#> 1 1 Joe 0.9 Yes yes
#> 2 2 Joe 0.89 Skip skip
#> 3 3 Joe 0.99 No no
#> 4 4 Joe 0.98 No no
#> 5 5 Joe 0.87 Yes yes
#> 6 7 Joe 0.93 Yes yes
#> 7 8 Joe 0.92 Skip skip
#> 8 9 Joe 0.98 No no
#> 9 10 Joe 0.9 Yes yes
甚至可以在不使用 slider
的情况下完成,即仅在 dplyr
中
library(dplyr)
data %>% group_by(name) %>%
mutate(wanted2 = case_when(value < 0.95 & lag(value, default = 1) < 0.95 & week - 1 == lag(week, default = 0) ~ 'Skip',
value < 0.95 ~ 'Yes',
TRUE ~ 'No'))
#> # A tibble: 9 x 5
#> # Groups: name [1]
#> week name value wanted wanted2
#> <int> <chr> <dbl> <chr> <chr>
#> 1 1 Joe 0.9 Yes Yes
#> 2 2 Joe 0.89 Skip Skip
#> 3 3 Joe 0.99 No No
#> 4 4 Joe 0.98 No No
#> 5 5 Joe 0.87 Yes Yes
#> 6 7 Joe 0.93 Yes Yes
#> 7 8 Joe 0.92 Skip Skip
#> 8 9 Joe 0.98 No No
#> 9 10 Joe 0.9 Yes Yes
由 reprex package (v2.0.0)
于 2021-07-25 创建