在 R 中创建与数据时序相对应的序列指示符

Creating sequence indicators corresponding to data timing in R

我正在处理如下所示的数据:

   ID Year Variable_of_Interest
1   a 2000                    0
2   a 2001                    0
3   a 2002                    0
4   a 2003                    0
5   a 2004                    0
6   a 2005                    1
7   a 2006                    1
8   a 2007                    1
9   a 2008                    1
10  a 2009                    1
11  b 2000                    0
12  b 2001                    0
13  b 2002                    0
14  b 2003                    1
15  b 2004                    1
16  b 2005                    1
17  b 2006                    0
18  b 2007                    1
19  b 2008                    1
20  b 2009                    1
21  c 2000                    0
22  c 2001                    0
23  c 2002                    0
24  c 2003                    0
25  c 2004                    0
26  c 2005                    0
27  c 2006                    1
28  c 2007                    1
29  c 2008                    1
30  c 2009                    0
31  d 2000                    0
32  d 2001                    0
33  d 2002                    1
34  d 2003                    1
35  d 2004                    0
36  d 2005                    1
37  d 2006                    1
38  d 2007                    1
39  d 2008                    1
40  d 2009                    1

分析单位由 ID 列标识，每个 ID 在数据中每年出现一次。

Variable_of_Interest 列表示各 ID 的状态：某些年份取值为 0，其他年份取值为 1。

我想创建一个额外的列，其中包含一个数字序列，用来记录 Variable_of_Interest 发生变化（定义为从 0 变为 1）之前和之后的年数：变化当年记为 0，之前的年份依次记为 -1、-2、……，之后的年份依次记为 1、2、……。

代码必须能够处理同一 ID 内多次发生的变化（定义为从 0 变为 1），例如 ID b 在 2002–2003 年和 2006–2007 年各发生了一次变化。

对于之后没有再变回 1 的 0 值，可以赋值为 NA，例如 ID "c" 在 2009 年的 0。

这样,数据看起来像:

   ID Year Variable_of_Interest Solution
1   a 2000                    0       -5
2   a 2001                    0       -4
3   a 2002                    0       -3
4   a 2003                    0       -2
5   a 2004                    0       -1
6   a 2005                    1        0
7   a 2006                    1        1
8   a 2007                    1        2
9   a 2008                    1        3
10  a 2009                    1        4
11  b 2000                    0       -3
12  b 2001                    0       -2
13  b 2002                    0       -1
14  b 2003                    1        0
15  b 2004                    1        1
16  b 2005                    1        2
17  b 2006                    0       -1
18  b 2007                    1        0
19  b 2008                    1        1
20  b 2009                    1        2
21  c 2000                    0       -6
22  c 2001                    0       -5
23  c 2002                    0       -4
24  c 2003                    0       -3
25  c 2004                    0       -2
26  c 2005                    0       -1
27  c 2006                    1        0
28  c 2007                    1        1
29  c 2008                    1        2
30  c 2009                    0       NA
31  d 2000                    0       -2
32  d 2001                    0       -1
33  d 2002                    1        0
34  d 2003                    1        1
35  d 2004                    0       -1
36  d 2005                    1        0
37  d 2006                    1        1
38  d 2007                    1        2
39  d 2008                    1        3
40  d 2009                    1        4

用于重现上述数据的代码如下:

# Build the example panel: four IDs (a-d), each observed yearly from 2000
# to 2009.
ID <- rep(c("a", "b", "c", "d"), each = 10)
Year <- rep(seq(2000, 2009, 1), 4)
# 0/1 indicator listed ID by ID; each line below holds one ID's ten years.
Variable_of_Interest <- c(
  rep(0, 5), rep(1, 5),
  rep(0, 3), rep(1, 3), rep(0, 1), rep(1, 3),
  rep(0, 6), rep(1, 3), rep(0, 1),
  rep(0, 2), rep(1, 2), rep(0, 1), rep(1, 5)
)

# data.frame() silently recycles shorter columns when lengths divide evenly,
# so check explicitly that the three vectors line up.
stopifnot(
  length(ID) == length(Year),
  length(Year) == length(Variable_of_Interest)
)

# Store the data frame under the name `df`; an unassigned data.frame() call
# is only printed and then discarded, leaving `df` bound to stats::df().
df <- data.frame(ID, Year, Variable_of_Interest)
df

非常感谢您的帮助!

这里有一个dplyr选项:

library(dplyr)

df %>%
  # Number each run of identical consecutive values. The counter is evaluated
  # before the grouping takes effect, so it is paired with ID to identify a
  # run within one ID.
  group_by(
    ID,
    idx = cumsum(
      Variable_of_Interest !=
        lag(Variable_of_Interest, default = first(Variable_of_Interest))
    )
  ) %>%
  # Within a run: zeros count up to -1 (..., -2, -1); ones count up from 0.
  mutate(
    Solution = case_when(
      Variable_of_Interest == 0 ~ rev(-1:(-n())),
      Variable_of_Interest == 1 ~ 0:(n() - 1)
    )
  ) %>%
  group_by(ID) %>%
  # A run of zeros that ends an ID's series is never followed by a 1, so
  # there is no change to count towards: mark it NA.
  mutate(
    Solution = replace(
      Solution, idx == max(idx) & Variable_of_Interest == 0, NA
    )
  ) %>%
  # Drop the grouping so downstream verbs do not silently operate per ID.
  ungroup() %>%
  select(-idx)

输出（注意：下表仍显示辅助列 idx；执行上面最后一步 select(-idx) 后该列会被删除）:

   ID Year Variable_of_Interest idx Solution
1   a 2000                    0   0       -5
2   a 2001                    0   0       -4
3   a 2002                    0   0       -3
4   a 2003                    0   0       -2
5   a 2004                    0   0       -1
6   a 2005                    1   1        0
7   a 2006                    1   1        1
8   a 2007                    1   1        2
9   a 2008                    1   1        3
10  a 2009                    1   1        4
11  b 2000                    0   2       -3
12  b 2001                    0   2       -2
13  b 2002                    0   2       -1
14  b 2003                    1   3        0
15  b 2004                    1   3        1
16  b 2005                    1   3        2
17  b 2006                    0   4       -1
18  b 2007                    1   5        0
19  b 2008                    1   5        1
20  b 2009                    1   5        2
21  c 2000                    0   6       -6
22  c 2001                    0   6       -5
23  c 2002                    0   6       -4
24  c 2003                    0   6       -3
25  c 2004                    0   6       -2
26  c 2005                    0   6       -1
27  c 2006                    1   7        0
28  c 2007                    1   7        1
29  c 2008                    1   7        2
30  c 2009                    0   8       NA
31  d 2000                    0   8       -2
32  d 2001                    0   8       -1
33  d 2002                    1   9        0
34  d 2003                    1   9        1
35  d 2004                    0  10       -1
36  d 2005                    1  11        0
37  d 2006                    1  11        1
38  d 2007                    1  11        2
39  d 2008                    1  11        3
40  d 2009                    1  11        4

使用 data.table 的另一个选项:

# Requires data.table: setDT(), rleid() and := all come from that package.

# Turn DF into a data.table in place, then label each run of equal voi values.
setDT(DF)
DF[, ri := rleid(voi)]

# Per (ID, run): runs of 1 count 0, 1, 2, ...; runs of 0 count ..., -2, -1.
DF[, soln := if (voi[1L] == 1L) seq_len(.N) - 1L else -(.N:1L), by = .(ID, ri)]

# Per ID: when the series ends in 0, overwrite that final run with NA.
DF[, soln := {
  trailing_zero <- voi[.N] == 0L & ri == ri[.N]
  replace(soln, trailing_zero, NA_integer_)
}, by = ID]

数据:

# Example data: IDs a-d, ten yearly observations (2000-2009) for each.
ID <- rep(c("a", "b", "c", "d"), each = 10)
length(ID)
Year <- rep(seq(from = 2000, to = 2009, by = 1), times = 4)
# Run-length description of the 0/1 indicator; one line per ID.
Variable_of_Interest <- rep(
  c(0, 1,
    0, 1, 0, 1,
    0, 1, 0,
    0, 1, 0, 1),
  times = c(5, 5,
            3, 3, 1, 3,
            6, 3, 1,
            2, 2, 1, 5)
)
DF <- data.frame(ID = ID, Year = Year, voi = Variable_of_Interest)