用 R 识别剧集

Identifying episodes with R

多年来,我一直在使用一个包含不同国家/地区信息的不平衡面板。 我拥有的变量之一是 dist,它代表 HP 过滤后的 GDP 的对数偏差。如果 dist>1,则 dummy=1.

structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 
10L, 10L, 10L, 10L), .Label = c("Bangladesh", "Barbados", "Benin", 
"Burundi", "Cameroon", "Chile", "Cyprus", "Ecuador", "Equatorial Guinea", 
"Gabon", "Ghana", "Guatemala", "Guinea", "Guyana", "Haiti", "India", 
"Jamaica", "Jordan", "Lebanon", "Liberia", "Madagascar", "Mali", 
"Mexico", "Morocco", "Mozambique", "Nepal", "Nicaragua", "Niger", 
"Oman", "Pakistan", "Panama", "Papua New Guinea", "Peru", "Rwanda", 
"Senegal", "Seychelles", "Sierra Leone", "Singapore", "Sri Lanka", 
"Sudan", "Togo", "Trinidad and Tobago", "Tunisia", "Turkey", 
"Uganda", "Zambia", "Zimbabwe"), class = c("pseries", "factor"
)), date = structure(c(12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 40L, 
41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 
54L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("1965", "1966", "1967", 
"1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", 
"1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", 
"1984", "1985", "1986", "1987", "1988", "1989", "1990", "1991", 
"1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", 
"2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", 
"2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
"2016", "2017", "2018"), class = c("pseries", "factor")), dist = c(-2.44153863355447, 
-0.565497304455015, 1.4222058591902, 1.18583114700364, 0.936095936859405, 
-0.40582369098349, 2.08296049774769, -1.59377116576285, 0.316969269882462, 
-0.562808458278515, -0.438155688763343, 0.824980940313303, -0.60293744113302, 
0.397182279339227, 1.34777053122572, 0.00539983501313633, 0.0528824118570846, 
-0.838507946421168, 0.220969370384196, -1.06945294612116, 0.157324834050942, 
0.244080457191975, 0.215931770884978, 0.226176571912671, 0.197663973781873, 
0.183399695771442, 0.334605869075331, 0.0683789169881243, -0.0432432418415798, 
-0.191834311017179, -0.368292801016647, -0.572222256415503, -0.838601784768107, 
-1.15213567684887, -1.49844386483361, 2.39265920977342, 1.90573467650352, 
1.3362714819111, 0.158391029649629, 1.12699318386612, 2.38996532864524
), dummy = c(0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 
1, 0, 1, 1)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 400L, 
401L, 402L, 403L, 404L, 405L, 406L, 407L, 408L, 409L, 410L, 411L, 
412L, 413L, 414L, 415L, 416L, 417L, 418L, 419L, 420L), class = "data.frame")

我想根据 dist 和 dummy 的值按以下方式识别剧集。 情节被定义为 dist>0.5 的值的时间垃圾邮件,条件是 dist>=1 在某些时候。 在上面的例子中,孟加拉国的事件从 1978 年到 1980 年持续了 3 年,而危地马拉的事件又持续了 2006 年到 2008 年的 3 年 任何帮助将不胜感激!

我认为这符合您的要求。我使用 dplyr 进行数据操作,还使用 ​​data.table 包中的实用函数 rleid

x %>%
  group_by(country) %>%
  mutate(
    potential_episode_grp = (dist > 0.5) * data.table::rleid(dist > 0.5)
  ) %>%
  group_by(country, potential_episode_grp) %>%
  mutate(episode = as.integer(potential_episode_grp > 0 & any(dist > 1)))
# # A tibble: 11 × 6
# # Groups:   country, potential_episode_grp [4]
#    country     date   dist dummy episode potential_episode_grp
#    <chr>      <int>  <dbl> <int>   <int>                 <int>
#  1 Bangladesh  1976 -2.44      0       0                     0
#  2 Bangladesh  1977 -0.565     0       0                     0
#  3 Bangladesh  1978  1.42      1       1                     2
#  4 Bangladesh  1979  1.19      1       1                     2
#  5 Bangladesh  1980  0.936     0       1                     2
#  6 Guatemala   2004 -0.390     0       0                     0
#  7 Guatemala   2005 -0.267     0       0                     0
#  8 Guatemala   2006  1.46      1       1                     2
#  9 Guatemala   2007  0.763     0       1                     2
# 10 Guatemala   2008  1.11      1       1                     2
# 11 Guatemala   2009  0.289     0       0                     0

使用此数据:

x = read.table(text = ' country       date  dist       dummy episode
1  Bangladesh 1976 -2.4415386     0   0
2  Bangladesh 1977 -0.5654973     0   0
3  Bangladesh 1978  1.4222059     1   1
4  Bangladesh 1979  1.1858311     1   1
5  Bangladesh 1980  0.9360959     0   1
560  Guatemala 2004 -0.3899023    0   0
561  Guatemala 2005 -0.2672316    0   0
562  Guatemala 2006  1.4632532    1   1
563  Guatemala 2007  0.7633748    0   1
564  Guatemala 2008  1.1143216    1   1
565  Guatemala 2009  0.2891909    0   0', header =T)