为面板中的组创建以事件和事件之前的零开始的顺序计数器
Create sequential counter starting with event and zeros before event for groups in panel
对于面板数据集 (GSOEP),我需要创建一个时间计数器,在事件发生后为我提供 delta t,该事件对于每个人在特定年份的虚拟编码为 1。例如。有一个人在随机年份范围内的观察结果,例如 1990-2006,有一个单独的变量表示年份中的某个事件为 1,例如1996. 计数器需要在下一年开始,应该以下一个人 (id) 结束,并且需要在该人的事件发生之前为零。
目前的数据是这样的:
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)
id year event
1 1 1998 0
2 1 1999 0
3 1 2000 1
4 1 2001 0
5 1 2002 0
6 1 2003 0
7 2 1998 0
8 2 1999 0
9 2 2000 0
10 2 2001 0
11 2 2002 1
12 2 2003 0
13 3 1998 0
14 3 1999 1
15 3 2000 0
16 3 2001 0
17 3 2002 0
18 3 2003 0
需要的是这个:
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0),delta=c(0,0,0,1,2,3,0,0,0,0,0,1,0,0,1,2,3,4), stringsAsFactors=FALSE)
id year event delta
1 1 1998 0 0
2 1 1999 0 0
3 1 2000 1 0
4 1 2001 0 1
5 1 2002 0 2
6 1 2003 0 3
7 2 1998 0 0
8 2 1999 0 0
9 2 2000 0 0
10 2 2001 0 0
11 2 2002 1 0
12 2 2003 0 1
13 3 1998 0 0
14 3 1999 1 0
15 3 2000 0 1
16 3 2001 0 2
17 3 2002 0 3
18 3 2003 0 4
我怎样才能做到这一点?我得到的最接近的是这里:
但我不知道如何修改它,让它只在事件发生一次后开始,并在事件前置零。还有一些个人没有事件,计数器需要给出零。每个人的年数(观察)是不同的,因此一些 id 的范围是 1984-1999,而其他 id 的范围是 1995-2015。
你会帮我很大的忙,我想提前感谢你的时间和努力。
此致,
朱利叶斯
可能不是最优雅的版本,但如果您的数据集不是太大,以下几行可能是一个开始。
library(data.table)
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)
DT <- as.data.table(df)
get_delta <- function(x) {
if (all(x == 0)) {
return(x)
} else {
event_position <- which(x == 1)
x[event_position] <- 0
if (event_position == length(x)) {
return(x)
} else {
x[(event_position+1):length(x)] <- seq(length(x)-event_position)
return(x)
}
}
}
DT[, delta:= get_delta(event), by = c("id")]
DT
# id year event delta
# 1: 1 1998 0 0
# 2: 1 1999 0 0
# 3: 1 2000 1 0
# 4: 1 2001 0 1
# 5: 1 2002 0 2
# 6: 1 2003 0 3
# 7: 2 1998 0 0
# 8: 2 1999 0 0
# 9: 2 2000 0 0
# 10: 2 2001 0 0
# 11: 2 2002 1 0
# 12: 2 2003 0 1
# 13: 3 1998 0 0
# 14: 3 1999 1 0
# 15: 3 2000 0 1
# 16: 3 2001 0 2
# 17: 3 2002 0 3
# 18: 3 2003 0 4
n_rows <- 1e6
DT_large <- data.table(id= as.character(rep(c(1:n_rows), each=6))
,year=rep(1998:2003, n_rows),
event = as.vector(sapply(1:n_rows, function(x) {
x <- rep(0, 6)
x[sample(6, 1)] <- 1
x
}))
,stringsAsFactors=FALSE)
system.time(DT_large[, delta:= get_delta(event), by = c("id")])
# User System elapsed
# 9.30 0.02 9.35
#some benchmarking...
library(tidyverse)
library(data.table)
library(microbenchmark)
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)
CPak_approach <- function() {
df %>%
group_by(id) %>%
mutate(delta = ifelse(cumsum(cummax(event)) > 0, cumsum(cummax(event)) - 1, 0)) %>%
ungroup()
}
manuelbickel_approach <- function(x) {
DT <- as.data.table(df)
get_delta <- function(x) {
if (all(x == 0)) {
return(x)
} else {
event_position <- which(x == 1)
x[event_position] <- 0
if (event_position == length(x)) {
return(x)
} else {
x[(event_position+1):length(x)] <- seq(length(x)-event_position)
return(x)
}
}
}
DT[, delta:= get_delta(event), by = c("id")]
}
microbenchmark(
(dplyr_approach()),
(manuelbickel_approach())
)
# Unit: microseconds
# expr min lq mean median uq max neval
# (dplyr_approach()) 3731.146 3872.6625 4098.923 3985.363 4194.183 6441.475 100
# (manuelbickel_approach()) 803.705 829.5605 1148.891 1014.105 1049.829 13993.372 100
您可以使用 group_by(id)
和 cumsum(cummax(event))
来接近 - 从 event==1
开始生成 1...N
。我将其包装在 ifelse(...)
中,以便从 > 0
.
的值中减去 1
library(tidyverse)
df %>%
group_by(id) %>%
mutate(delta = ifelse(cumsum(cummax(event)) > 0, cumsum(cummax(event)) - 1, 0)) %>%
ungroup()
# A tibble: 18 x 4
# id year event delta
# <chr> <int> <dbl> <dbl>
# 1 1 1998 0. 0.
# 2 1 1999 0. 0.
# 3 1 2000 1. 0.
# 4 1 2001 0. 1.
# 5 1 2002 0. 2.
# 6 1 2003 0. 3.
# 7 2 1998 0. 0.
# 8 2 1999 0. 0.
# 9 2 2000 0. 0.
# 10 2 2001 0. 0.
# 11 2 2002 1. 0.
# 12 2 2003 0. 1.
# 13 3 1998 0. 0.
# 14 3 1999 1. 0.
# 15 3 2000 0. 1.
# 16 3 2001 0. 2.
# 17 3 2002 0. 3.
# 18 3 2003 0. 4.
对于面板数据集 (GSOEP),我需要创建一个时间计数器,在事件发生后为我提供 delta t,该事件对于每个人在特定年份的虚拟编码为 1。例如。有一个人在随机年份范围内的观察结果,例如 1990-2006,有一个单独的变量表示年份中的某个事件为 1,例如1996. 计数器需要在下一年开始,应该以下一个人 (id) 结束,并且需要在该人的事件发生之前为零。
目前的数据是这样的:
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)
id year event
1 1 1998 0
2 1 1999 0
3 1 2000 1
4 1 2001 0
5 1 2002 0
6 1 2003 0
7 2 1998 0
8 2 1999 0
9 2 2000 0
10 2 2001 0
11 2 2002 1
12 2 2003 0
13 3 1998 0
14 3 1999 1
15 3 2000 0
16 3 2001 0
17 3 2002 0
18 3 2003 0
需要的是这个:
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0),delta=c(0,0,0,1,2,3,0,0,0,0,0,1,0,0,1,2,3,4), stringsAsFactors=FALSE)
id year event delta
1 1 1998 0 0
2 1 1999 0 0
3 1 2000 1 0
4 1 2001 0 1
5 1 2002 0 2
6 1 2003 0 3
7 2 1998 0 0
8 2 1999 0 0
9 2 2000 0 0
10 2 2001 0 0
11 2 2002 1 0
12 2 2003 0 1
13 3 1998 0 0
14 3 1999 1 0
15 3 2000 0 1
16 3 2001 0 2
17 3 2002 0 3
18 3 2003 0 4
我怎样才能做到这一点?我得到的最接近的是这里:
但我不知道如何修改它,让它只在事件发生一次后开始,并在事件前置零。还有一些个人没有事件,计数器需要给出零。每个人的年数(观察)是不同的,因此一些 id 的范围是 1984-1999,而其他 id 的范围是 1995-2015。
你会帮我很大的忙,我想提前感谢你的时间和努力。
此致,
朱利叶斯
可能不是最优雅的版本,但如果您的数据集不是太大,以下几行可能是一个开始。
library(data.table)
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)
DT <- as.data.table(df)
get_delta <- function(x) {
if (all(x == 0)) {
return(x)
} else {
event_position <- which(x == 1)
x[event_position] <- 0
if (event_position == length(x)) {
return(x)
} else {
x[(event_position+1):length(x)] <- seq(length(x)-event_position)
return(x)
}
}
}
DT[, delta:= get_delta(event), by = c("id")]
DT
# id year event delta
# 1: 1 1998 0 0
# 2: 1 1999 0 0
# 3: 1 2000 1 0
# 4: 1 2001 0 1
# 5: 1 2002 0 2
# 6: 1 2003 0 3
# 7: 2 1998 0 0
# 8: 2 1999 0 0
# 9: 2 2000 0 0
# 10: 2 2001 0 0
# 11: 2 2002 1 0
# 12: 2 2003 0 1
# 13: 3 1998 0 0
# 14: 3 1999 1 0
# 15: 3 2000 0 1
# 16: 3 2001 0 2
# 17: 3 2002 0 3
# 18: 3 2003 0 4
n_rows <- 1e6
DT_large <- data.table(id= as.character(rep(c(1:n_rows), each=6))
,year=rep(1998:2003, n_rows),
event = as.vector(sapply(1:n_rows, function(x) {
x <- rep(0, 6)
x[sample(6, 1)] <- 1
x
}))
,stringsAsFactors=FALSE)
system.time(DT_large[, delta:= get_delta(event), by = c("id")])
# User System elapsed
# 9.30 0.02 9.35
#some benchmarking...
library(tidyverse)
library(data.table)
library(microbenchmark)
df <- data.frame(id= rep(c("1","2","3"), each=6), year=rep(1998:2003, times=3), event=c(0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0), stringsAsFactors=FALSE)
CPak_approach <- function() {
df %>%
group_by(id) %>%
mutate(delta = ifelse(cumsum(cummax(event)) > 0, cumsum(cummax(event)) - 1, 0)) %>%
ungroup()
}
manuelbickel_approach <- function(x) {
DT <- as.data.table(df)
get_delta <- function(x) {
if (all(x == 0)) {
return(x)
} else {
event_position <- which(x == 1)
x[event_position] <- 0
if (event_position == length(x)) {
return(x)
} else {
x[(event_position+1):length(x)] <- seq(length(x)-event_position)
return(x)
}
}
}
DT[, delta:= get_delta(event), by = c("id")]
}
microbenchmark(
(dplyr_approach()),
(manuelbickel_approach())
)
# Unit: microseconds
# expr min lq mean median uq max neval
# (dplyr_approach()) 3731.146 3872.6625 4098.923 3985.363 4194.183 6441.475 100
# (manuelbickel_approach()) 803.705 829.5605 1148.891 1014.105 1049.829 13993.372 100
您可以使用 group_by(id)
和 cumsum(cummax(event))
来接近 - 从 event==1
开始生成 1...N
。我将其包装在 ifelse(...)
中,以便从 > 0
.
library(tidyverse)
df %>%
group_by(id) %>%
mutate(delta = ifelse(cumsum(cummax(event)) > 0, cumsum(cummax(event)) - 1, 0)) %>%
ungroup()
# A tibble: 18 x 4
# id year event delta
# <chr> <int> <dbl> <dbl>
# 1 1 1998 0. 0.
# 2 1 1999 0. 0.
# 3 1 2000 1. 0.
# 4 1 2001 0. 1.
# 5 1 2002 0. 2.
# 6 1 2003 0. 3.
# 7 2 1998 0. 0.
# 8 2 1999 0. 0.
# 9 2 2000 0. 0.
# 10 2 2001 0. 0.
# 11 2 2002 1. 0.
# 12 2 2003 0. 1.
# 13 3 1998 0. 0.
# 14 3 1999 1. 0.
# 15 3 2000 0. 1.
# 16 3 2001 0. 2.
# 17 3 2002 0. 3.
# 18 3 2003 0. 4.