在 R 中创建条件虚拟变量列
Creating a conditional dummy variable column in R
我正在处理一个跨国面板数据集,我的一个变量 (cc_dummy
) 取值 1 和 0(也有 NA 指示的缺失值)。我想创建一个新列,如果 cc_dummy
每个国家/地区连续三年取值 1,我只希望每三年 window 的第一年取值 1,而其他年份取值 0。下面给出了我的数据快照。任何帮助,将不胜感激。如果可能的话,我想使用 dplyr 创建列。
structure(list(country = c("Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil"), year = c(1975, 1976, 1977, 1978, 1979, 1980,
1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
2014, 2015, 2016, 2017, 2018, 2019, 2020, 1975, 1976, 1977, 1978,
1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021),
cc_dummy = c(NA, NA, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, NA, NA, 0, 0, 1, 0,
1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0)), row.names = c(NA, -93L), class = c("tbl_df", "tbl",
"data.frame"))
下面是使用 data.table::rleid
和 dplyr
语法的方法:
library(dplyr)
df %>%
group_by(country, m = data.table::rleid(cc_dummy)) %>%
mutate(newcol = ifelse(n() >= 3 & cumsum(cc_dummy) == 1, 1, 0))
# A tibble: 93 x 5
# Groups: country, m [32]
country year cc_dummy m newcol
<chr> <dbl> <dbl> <int> <dbl>
1 Argentina 1975 NA 1 0
2 Argentina 1976 NA 1 0
3 Argentina 1977 0 2 0
4 Argentina 1978 0 2 0
5 Argentina 1979 0 2 0
6 Argentina 1980 0 2 0
7 Argentina 1981 1 3 0
8 Argentina 1982 1 3 0
9 Argentina 1983 0 4 0
10 Argentina 1984 1 5 0
11 Argentina 1985 0 6 0
12 Argentina 1986 0 6 0
13 Argentina 1987 1 7 1
14 Argentina 1988 1 7 0
15 Argentina 1989 1 7 0
16 Argentina 1990 0 8 0
17 Argentina 1991 0 8 0
18 Argentina 1992 0 8 0
19 Argentina 1993 0 8 0
20 Argentina 1994 0 8 0
21 Argentina 1995 0 8 0
22 Argentina 1996 0 8 0
23 Argentina 1997 0 8 0
24 Argentina 1998 0 8 0
25 Argentina 1999 0 8 0
26 Argentina 2000 0 8 0
27 Argentina 2001 0 8 0
28 Argentina 2002 1 9 0
我正在处理一个跨国面板数据集,我的一个变量 (cc_dummy
) 取值 1 和 0(也有 NA 指示的缺失值)。我想创建一个新列,如果 cc_dummy
每个国家/地区连续三年取值 1,我只希望每三年 window 的第一年取值 1,而其他年份取值 0。下面给出了我的数据快照。任何帮助,将不胜感激。如果可能的话,我想使用 dplyr 创建列。
structure(list(country = c("Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",
"Brazil", "Brazil"), year = c(1975, 1976, 1977, 1978, 1979, 1980,
1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
2014, 2015, 2016, 2017, 2018, 2019, 2020, 1975, 1976, 1977, 1978,
1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021),
cc_dummy = c(NA, NA, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, NA, NA, 0, 0, 1, 0,
1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0)), row.names = c(NA, -93L), class = c("tbl_df", "tbl",
"data.frame"))
下面是使用 data.table::rleid
和 dplyr
语法的方法:
library(dplyr)
df %>%
group_by(country, m = data.table::rleid(cc_dummy)) %>%
mutate(newcol = ifelse(n() >= 3 & cumsum(cc_dummy) == 1, 1, 0))
# A tibble: 93 x 5
# Groups: country, m [32]
country year cc_dummy m newcol
<chr> <dbl> <dbl> <int> <dbl>
1 Argentina 1975 NA 1 0
2 Argentina 1976 NA 1 0
3 Argentina 1977 0 2 0
4 Argentina 1978 0 2 0
5 Argentina 1979 0 2 0
6 Argentina 1980 0 2 0
7 Argentina 1981 1 3 0
8 Argentina 1982 1 3 0
9 Argentina 1983 0 4 0
10 Argentina 1984 1 5 0
11 Argentina 1985 0 6 0
12 Argentina 1986 0 6 0
13 Argentina 1987 1 7 1
14 Argentina 1988 1 7 0
15 Argentina 1989 1 7 0
16 Argentina 1990 0 8 0
17 Argentina 1991 0 8 0
18 Argentina 1992 0 8 0
19 Argentina 1993 0 8 0
20 Argentina 1994 0 8 0
21 Argentina 1995 0 8 0
22 Argentina 1996 0 8 0
23 Argentina 1997 0 8 0
24 Argentina 1998 0 8 0
25 Argentina 1999 0 8 0
26 Argentina 2000 0 8 0
27 Argentina 2001 0 8 0
28 Argentina 2002 1 9 0