为面板中的组创建以事件和事件之前的零开始的顺序计数器

Create sequential counter starting with event and zeros before event for groups in panel

对于面板数据集 (GSOEP),我需要创建一个时间计数器,在事件发生后为我提供 delta t,该事件对于每个人在特定年份的虚拟编码为 1。例如。有一个人在随机年份范围内的观察结果,例如 1990-2006,有一个单独的变量表示年份中的某个事件为 1,例如1996. 计数器需要在下一年开始,应该以下一个人 (id) 结束,并且需要在该人的事件发生之前为零。

目前的数据是这样的:

df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)

   id year event
1   1 1998     0
2   1 1999     0
3   1 2000     1
4   1 2001     0
5   1 2002     0
6   1 2003     0
7   2 1998     0
8   2 1999     0
9   2 2000     0
10  2 2001     0
11  2 2002     1
12  2 2003     0
13  3 1998     0
14  3 1999     1
15  3 2000     0
16  3 2001     0
17  3 2002     0
18  3 2003     0

需要的是这个:

df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0),delta=c(0,0,0,1,2,3,0,0,0,0,0,1,0,0,1,2,3,4), stringsAsFactors=FALSE)

   id year event delta
1   1 1998     0     0
2   1 1999     0     0
3   1 2000     1     0
4   1 2001     0     1
5   1 2002     0     2
6   1 2003     0     3
7   2 1998     0     0
8   2 1999     0     0
9   2 2000     0     0
10  2 2001     0     0
11  2 2002     1     0
12  2 2003     0     1
13  3 1998     0     0
14  3 1999     1     0
15  3 2000     0     1
16  3 2001     0     2
17  3 2002     0     3
18  3 2003     0     4

我怎样才能做到这一点?我得到的最接近的是这里:

但我不知道如何修改它,让它只在事件发生一次后开始,并在事件前置零。还有一些个人没有事件,计数器需要给出零。每个人的年数(观察)是不同的,因此一些 id 的范围是 1984-1999,而其他 id 的范围是 1995-2015。

你会帮我很大的忙,我想提前感谢你的时间和努力。

此致,

朱利叶斯

可能不是最优雅的版本,但如果您的数据集不是太大,以下几行可能是一个开始。

library(data.table)
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)
DT <- as.data.table(df)

get_delta <- function(x) {
  if (all(x == 0)) {
    return(x)
  } else {
    event_position <- which(x == 1)
    x[event_position] <- 0
    if (event_position == length(x)) {
     return(x) 
    } else {
     x[(event_position+1):length(x)] <- seq(length(x)-event_position)
     return(x)
    }
  }
}


DT[, delta:= get_delta(event), by = c("id")]
DT
# id year event delta
# 1:  1 1998     0     0
# 2:  1 1999     0     0
# 3:  1 2000     1     0
# 4:  1 2001     0     1
# 5:  1 2002     0     2
# 6:  1 2003     0     3
# 7:  2 1998     0     0
# 8:  2 1999     0     0
# 9:  2 2000     0     0
# 10:  2 2001     0     0
# 11:  2 2002     1     0
# 12:  2 2003     0     1
# 13:  3 1998     0     0
# 14:  3 1999     1     0
# 15:  3 2000     0     1
# 16:  3 2001     0     2
# 17:  3 2002     0     3
# 18:  3 2003     0     4

n_rows <- 1e6
DT_large <- data.table(id= as.character(rep(c(1:n_rows), each=6))
                       ,year=rep(1998:2003, n_rows), 
                       event = as.vector(sapply(1:n_rows, function(x) {
                         x <- rep(0, 6)
                         x[sample(6, 1)] <- 1  
                         x
                       }))
                       ,stringsAsFactors=FALSE)

system.time(DT_large[, delta:= get_delta(event), by = c("id")])
# User      System     elapsed 
# 9.30        0.02        9.35

#some benchmarking...
library(tidyverse)
library(data.table)
library(microbenchmark)

df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)

CPak_approach <- function() {
  df %>%
    group_by(id) %>%
    mutate(delta = ifelse(cumsum(cummax(event)) > 0, cumsum(cummax(event)) - 1, 0)) %>%
    ungroup()  
}

manuelbickel_approach <- function(x) {
  DT <- as.data.table(df)
  get_delta <- function(x) {
    if (all(x == 0)) {
      return(x)
    } else {
      event_position <- which(x == 1)
      x[event_position] <- 0
      if (event_position == length(x)) {
        return(x) 
      } else {
        x[(event_position+1):length(x)] <- seq(length(x)-event_position)
        return(x)
      }
    }
  }
  DT[, delta:= get_delta(event), by = c("id")]
}


microbenchmark(
  (dplyr_approach()),
  (manuelbickel_approach())
)

# Unit: microseconds
#       expr                      min        lq     mean   median       uq       max neval
# (dplyr_approach())         3731.146 3872.6625 4098.923 3985.363 4194.183  6441.475   100
# (manuelbickel_approach())   803.705  829.5605 1148.891 1014.105 1049.829 13993.372   100

您可以使用 group_by(id)cumsum(cummax(event)) 来接近 - 从 event==1 开始生成 1...N。我将其包装在 ifelse(...) 中,以便从 > 0.

的值中减去 1
library(tidyverse)
df %>%
  group_by(id) %>%
  mutate(delta = ifelse(cumsum(cummax(event)) > 0, cumsum(cummax(event)) - 1, 0)) %>%
  ungroup()

# A tibble: 18 x 4
   # id     year event delta
   # <chr> <int> <dbl> <dbl>
 # 1 1      1998    0.    0.
 # 2 1      1999    0.    0.
 # 3 1      2000    1.    0.
 # 4 1      2001    0.    1.
 # 5 1      2002    0.    2.
 # 6 1      2003    0.    3.
 # 7 2      1998    0.    0.
 # 8 2      1999    0.    0.
 # 9 2      2000    0.    0.
# 10 2      2001    0.    0.
# 11 2      2002    1.    0.
# 12 2      2003    0.    1.
# 13 3      1998    0.    0.
# 14 3      1999    1.    0.
# 15 3      2000    0.    1.
# 16 3      2001    0.    2.
# 17 3      2002    0.    3.
# 18 3      2003    0.    4.