dplyr::lead 或 data.table::shift 引用变量值而不是标量

dplyr::lead or data.table::shift refer to variable value rather than scalar

给定:

library(tidyverse)
df <- data.frame(id = c(1, 1, 1, 1, 1,
                        rep(2, 5), rep(3, 3)),
                 dates = as.Date(c("2015-01-01",
                                   "2015-01-02",
                                   "2015-01-02",
                                   "2015-01-03",
                                   "2015-01-04",
                                   "2015-02-22",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-25",
                                   "2015-03-13",
                                   "2015-03-14",
                                   "2015-03-15")),
                 indicator = c(0, 1, 0, 0, 0,
                               0, 1, 0, 0, 0,
                               0, 1, 0),
                 final_date = as.Date(rep(NA, 13))) %>% 
  group_by(id, dates) %>% 
  mutate(repeat_days = n())
df
#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 NA                   1
#  2     1 2015-01-02         1 NA                   2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         0 NA                   1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 NA                   3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 NA                   1
# 13     3 2015-03-15         0 NA                   1

基于条件 (indicator == 1),我想通过变量 (repeat_days) 中的值 lead dates 而不是提供缩放器值,所以我想要的输出看起来像:

#df_final
#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 NA                   1
#  2     1 2015-01-02         1 2015-01-03           2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         0 NA                   1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 2015-02-25           3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 2015-03-15           1
# 13     3 2015-03-15         0 NA                   1

如果我们想 lead 一个标量,例如1,这个有效:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, n = 1), TRUE ~ final_date)) 

但是当我提供一个变量时,它不会按预期工作,因为它不是标量:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, repeat_days), TRUE ~ final_date)) 
# Error: Problem with `mutate()` column `final_date`.
# i `final_date = case_when(...)`.
# x `n` must be a nonnegative integer scalar, not an integer vector of length 5.
# i The error occurred in group 1: id = 1.

这也不起作用,因为它指的是 repeat_days 在所有这些情况下都是 1 的组的第一次出现:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, repeat_days[1]), TRUE ~ final_date))

有没有办法直接引用 repeat_days 的行级别值而不创建额外的变量?

谢谢


编辑 感谢@Maël 很好的回答:

df %>% 
  group_by(id) %>% 
  mutate(final_date = case_when(is.na(final_date) & indicator == 1 ~ 
                                  lead(dates, repeat_days[indicator == 1]), 
                                TRUE ~ final_date))

我应该明确表示我也可以让每个组重复 indicator == 1 所以它也需要在这个数据集上工作:

df <- data.frame(id = c(1, 1, 1, 1, 1,
                        rep(2, 5), rep(3, 3), 4, 4),
                 dates = as.Date(c("2015-01-01",
                                   "2015-01-02",
                                   "2015-01-02",
                                   "2015-01-03",
                                   "2015-01-04",
                                   "2015-02-22",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-23",
                                   "2015-02-25",
                                   "2015-03-13",
                                   "2015-03-14",
                                   "2015-03-15",
                                   "2015-04-15",
                                   "2015-04-16")),
                 indicator = c(0, 1, 0, 1, 0,
                               0, 1, 0, 0, 0,
                               0, 1, 0, 0, 1),
                 final_date = as.Date(c("2015-01-01", rep(NA, 14)))) %>% 
  group_by(id, dates) %>% 
  mutate(repeat_days = n()) %>% 
  ungroup()
df
#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 2015-01-01           1
#  2     1 2015-01-02         1 NA                   2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         1 NA                   1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 NA                   3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 NA                   1
# 13     3 2015-03-15         0 NA                   1
# 14     4 2015-04-15         0 NA                   1
# 15     4 2015-04-16         1 NA                   1

注意 id == 4,没有提前日期,所以在这种情况下我希望它默认为他们当前的行。此外,第一行现在已经有一个 final_date 值,因此需要使用 case_when 或类似的东西。

期望的输出:

#       id dates      indicator final_date repeat_days
#    <dbl> <date>         <dbl> <date>           <int>
#  1     1 2015-01-01         0 2015-01-01           1
#  2     1 2015-01-02         1 2015-01-03           2
#  3     1 2015-01-02         0 NA                   2
#  4     1 2015-01-03         1 2015-01-04           1
#  5     1 2015-01-04         0 NA                   1
#  6     2 2015-02-22         0 NA                   1
#  7     2 2015-02-23         1 2015-02-25           3
#  8     2 2015-02-23         0 NA                   3
#  9     2 2015-02-23         0 NA                   3
# 10     2 2015-02-25         0 NA                   1
# 11     3 2015-03-13         0 NA                   1
# 12     3 2015-03-14         1 2015-03-15           1
# 13     3 2015-03-15         0 NA                   1
# 14     4 2015-04-15         0 NA                   1
# 15     4 2015-04-16         1 2015-04-16           1

相关链接, and 但我无法在这个有条件的特殊情况下实现类似的东西。也很高兴看到 data.table (shift?) 解决方案。

我想出了一个解决方案,它使用来自基础 R 的 sapply()

library(dplyr)

df %>% 
  ungroup() %>% 
  mutate(final_date = as.Date(sapply(1:nrow(df), function(x) 
    ifelse(is.na(df$final_date[x]), 
           ifelse(df$indicator[x] == 1, 
                  ifelse(is.na(df$id[x] == df$id[x + df$repeat_days[x]]),
                         format(as.Date(df$dates[x], origin = "2020-01-01")),
                         ifelse(df$id[x] == df$id[x + df$repeat_days[x]],
                                format(as.Date(df$dates[x + df$repeat_days[x]], origin = "2020-01-01")), 
                                NA)), 
                  NA), 
           as.character(df$final_date[x])))))

# A tibble: 15 × 5
      id dates      indicator final_date repeat_days
   <dbl> <date>         <dbl> <date>           <int>
 1     1 2015-01-01         0 2015-01-01           1
 2     1 2015-01-02         1 2015-01-03           2
 3     1 2015-01-02         0 NA                   2
 4     1 2015-01-03         1 2015-01-04           1
 5     1 2015-01-04         0 NA                   1
 6     2 2015-02-22         0 NA                   1
 7     2 2015-02-23         1 2015-02-25           3
 8     2 2015-02-23         0 NA                   3
 9     2 2015-02-23         0 NA                   3
10     2 2015-02-25         0 NA                   1
11     3 2015-03-13         0 NA                   1
12     3 2015-03-14         1 2015-03-15           1
13     3 2015-03-15         0 NA                   1
14     4 2015-04-15         0 NA                   1
15     4 2015-04-16         1 2015-04-16           1

编写一个接受 n 向量的 lead 函数可能会更简单。下面我称这个函数为lead2。其余代码保持不变。

更新:您进一步说明,如果indicator = 1但没有提前日期,则final_date应填写当前日期。这可以用 dplyr::coalesce 来实现,它会找到向量中的第一个 non-null 元素。它类似于 SQL COALESCE 运算符。

library("tidyverse")

df <- data.frame(
  id = c(
    1, 1, 1, 1, 1,
    rep(2, 5), rep(3, 3), 4, 4
  ),
  dates = as.Date(c(
    "2015-01-01",
    "2015-01-02",
    "2015-01-02",
    "2015-01-03",
    "2015-01-04",
    "2015-02-22",
    "2015-02-23",
    "2015-02-23",
    "2015-02-23",
    "2015-02-25",
    "2015-03-13",
    "2015-03-14",
    "2015-03-15",
    "2015-04-15",
    "2015-04-16"
  )),
  indicator = c(
    0, 1, 0, 1, 0,
    0, 1, 0, 0, 0,
    0, 1, 0, 0, 1
  ),
  final_date = as.Date(c("2015-01-01", rep(NA, 14)))
) %>%
  group_by(id, dates) %>%
  mutate(repeat_days = n()) %>%
  ungroup()

lead2 <- function(x, ns) {
  # x: vector of values
  # ns: vector of leads

  # Compute the target position for each element
  is <- seq_along(x) + ns
  x[is]
}

xs <- c("a", "b", "c", "d", "e", "f")
ns <- c(1, 1, 2, 3, 1, 2)
lead2(xs, ns)
#> [1] "b" "c" "e" NA  "f" NA

df %>%
  group_by(id) %>%
  mutate(
    final_date = if_else(
      is.na(final_date) & indicator == 1,
        coalesce(lead2(dates, repeat_days), dates),
        final_date
    )
  )
#> # A tibble: 15 × 5
#> # Groups:   id [4]
#>       id dates      indicator final_date repeat_days
#>    <dbl> <date>         <dbl> <date>           <int>
#>  1     1 2015-01-01         0 2015-01-01           1
#>  2     1 2015-01-02         1 2015-01-03           2
#>  3     1 2015-01-02         0 NA                   2
#>  4     1 2015-01-03         1 2015-01-04           1
#>  5     1 2015-01-04         0 NA                   1
#>  6     2 2015-02-22         0 NA                   1
#>  7     2 2015-02-23         1 2015-02-25           3
#>  8     2 2015-02-23         0 NA                   3
#>  9     2 2015-02-23         0 NA                   3
#> 10     2 2015-02-25         0 NA                   1
#> 11     3 2015-03-13         0 NA                   1
#> 12     3 2015-03-14         1 2015-03-15           1
#> 13     3 2015-03-15         0 NA                   1
#> 14     4 2015-04-15         0 NA                   1
#> 15     4 2015-04-16         1 2015-04-16           1

reprex package (v2.0.1)

于 2022 年 3 月 14 日创建