创建 运行 长度的 ID,同时允许 运行s 中一定长度的间隙

Create run-length ID while allowing for gaps of certain length in runs

(我最初发布了一个问题 here,但它没有完全涵盖我的问题)

我有一个包含 'date' 列和降水量(rainfall)的数据框:

  date precip
1    1    0.0
2    2    0.0
3    3   12.4
4    4   10.2
5    5    0.0
6    6   13.6

我想为每个连续的降雨期创建一个带有计数器 (ID) 的“事件”列。降雨事件可以定义为连续 运行s,降水量大于例如0.

如果我们不允许零雨的任何短间隙,'event' 将如下所示,非 0 时段有计数器,NA 时段有计数器没有下雨。

  date precip event
1    1    0.0    NA
2    2    0.0    NA
3    3   12.4     1
4    4   10.2     1
5    5    0.0    NA
6    6   13.6     2

此外,我希望能够允许更短的无雨时间,例如大小 n = 1 天,在非 0.

的每个 运行 内

例如,在上面的数据框中,如果我们允许在连续的降雨期间有 1 天有 0 次降雨,例如第 5 天,那么第 3 天到第 6 天可以定义为一次降雨事件:

  date precip event
1    1    0.0    NA
2    2    0.0    NA
3    3   12.4     1
4    4   10.2     1
5    5    0.0     1 # <- gap of 1 day with no rain: OK
6    6   13.6     1

稍微大一点的玩具数据集:

structure(list(date = 1:31, precip = c(0, 0, 12.3999996185303, 
10.1999998092651, 0, 13.6000003814697, 16.6000003814697, 21.5, 
7.59999990463257, 0, 0, 0, 0.699999988079071, 0, 0, 0, 5.40000009536743, 
0, 1, 35.4000015258789, 11.5, 16.7000007629395, 13.5, 13.1000003814697, 
11.8000001907349, 1.70000004768372, 0, 15.1000003814697, 12.8999996185303, 
3.70000004768372, 24.2999992370605)), row.names = c(NA, -31L), class = "data.frame")

现在我真的卡住了。我尝试了一些奇怪的事情,比如下面的(只是一个开始),但我想我不会自己弄清楚,如果有任何帮助我会非常感激

# this is far from being any helpful, but just to show the direction I was heading...
# the threshold could be 0 to mirror the example above...

rainfall_event = function(df,
                          daily_thresh = .2,
                          n = 1) {
  for (i in 1:nrow(df)) {
    zero_index = 1
    
    if (df[i,]$precip < daily_thresh) {
      # every time you encounter a value below the threshold count the 0s
      zero_counter = 0
      
      while (df[i,]$precip < daily_thresh) {

        zero_counter = zero_counter + 1
        
        if (i != nrow(df)) {
          i = i + 1
          zero_index = zero_index + 1
        } else{
          break
        }
      }
      
      if (zero_counter > n) {
        df[zero_index:zero_index + zero_counter,][["event"]] = NA
      }
      
    } else{
      event_counter = 1
      
      while (df[i, ]$precip > daily_thresh) {

        df[["event"]] = event_counter
        if (i != nrow(rainfall_one_slide)) {
          i = i + 1
        } else{
          break
        }
      }
      
    }
  }
  
}

data.tablerleid

结合使用
library(data.table)

f1 <- function(dat, n) {

 tmp <- as.data.table(dat)[,
      grp := rleid(precip != 0)][precip != 0, 
       event := .GRP,
        grp][, event_fill := nafill(nafill(event, 'locf'),
      'nocb')]
 tmp[, event := fifelse(.N <= n & precip == 0,
     fcoalesce(event, event_fill), event), grp][, 
         c("grp", "event_fill") := NULL][]

 }

-测试

f1(df1, 0)
     date precip event
 1:    1    0.0    NA
 2:    2    0.0    NA
 3:    3   12.4     1
 4:    4   10.2     1
 5:    5    0.0    NA
 6:    6   13.6     2
 7:    7   16.6     2
 8:    8   21.5     2
 9:    9    7.6     2
10:   10    0.0    NA
11:   11    0.0    NA
12:   12    0.0    NA
13:   13    0.7     3
14:   14    0.0    NA
15:   15    0.0    NA
16:   16    0.0    NA
17:   17    5.4     4
18:   18    0.0    NA
19:   19    1.0     5
20:   20   35.4     5
21:   21   11.5     5
22:   22   16.7     5
23:   23   13.5     5
24:   24   13.1     5
25:   25   11.8     5
26:   26    1.7     5
27:   27    0.0    NA
28:   28   15.1     6
29:   29   12.9     6
30:   30    3.7     6
31:   31   24.3     6

n = 1

f1(df1, 1)
    date precip event
 1:    1    0.0    NA
 2:    2    0.0    NA
 3:    3   12.4     1
 4:    4   10.2     1
 5:    5    0.0     1
 6:    6   13.6     2
 7:    7   16.6     2
 8:    8   21.5     2
 9:    9    7.6     2
10:   10    0.0    NA
11:   11    0.0    NA
12:   12    0.0    NA
13:   13    0.7     3
14:   14    0.0    NA
15:   15    0.0    NA
16:   16    0.0    NA
17:   17    5.4     4
18:   18    0.0     4
19:   19    1.0     5
20:   20   35.4     5
21:   21   11.5     5
22:   22   16.7     5
23:   23   13.5     5
24:   24   13.1     5
25:   25   11.8     5
26:   26    1.7     5
27:   27    0.0     5
28:   28   15.1     6
29:   29   12.9     6
30:   30    3.7     6
31:   31   24.3     6

一个rle备选方案:

# limit of n days with precip = 0 to be allowed in runs of non-zero
n = 1

# rle of precip == 0
r = rle(d$precip == 0)

# replace the values of precip = 0 & length > limit with NA
r$values[r$values & r$lengths > n] = NA

# reconstruct the vector from the updated runs
ir = inverse.rle(r)

# rle of "is NA"
r2 = rle(is.na(ir))

# replace length of NA runs with 0
r2$lengths[r2$values] = 0

# replace values of non-NA runs with a sequence
r2$values[!r2$values] = seq_along(r2$values[!r2$values])

# create event column
d[!is.na(ir), "event"] = inverse.rle(r2)

   date precip event
1     1    0.0    NA
2     2    0.0    NA
3     3   12.4     1
4     4   10.2     1
5     5    0.0     1
6     6   13.6     1
7     7   16.6     1
8     8   21.5     1
9     9    7.6     1
10   10    0.0    NA
11   11    0.0    NA
12   12    0.0    NA
13   13    0.7     2
14   14    0.0    NA
15   15    0.0    NA
16   16    0.0    NA
17   17    5.4     3
18   18    0.0     3
19   19    1.0     3
20   20   35.4     3
21   21   11.5     3
22   22   16.7     3
23   23   13.5     3
24   24   13.1     3
25   25   11.8     3
26   26    1.7     3
27   27    0.0     3
28   28   15.1     3
29   29   12.9     3
30   30    3.7     3
31   31   24.3     3

因此,可能永远不会有人对它感兴趣,但我想我也有一个解决方案:)

f2 = function(d,
              n = 1,
              daily_thresh = .2) {

  # start int the first row
  i = 1

  # start with rainfall event 1
  event_counter = 0
  
  # set the value initially to 0
  d[["event"]] = 0

  # while still in the dataframe
  while (i <= nrow(d)) {

    # get the current precip value
    precip = d[i,]$precip

    # if its below the threshold --> DRY period starts
    if (precip < daily_thresh) {

      # count unknown number of following dry days of this dry episode
      dry_days = 0

      ### DRY LOOP
      # start from the day with rainfall under the threshold
      for (j in i:nrow(d)) {

        # count the consecutive dry days
        if (d[j,]$precip < daily_thresh) {
          dry_days = dry_days + 1


        } else{

          # hit a rainy day --> Get out the dry loop, just decide to which event it belongs
          # if the preceeding dry days are smaller than n --> same as last event

          if (dry_days <= n) {

            # set all the days without rainfall but within n to rainfall
            # if its the first event put it to 1
            if(event_counter == 0) event_counter = 1
            d[(j-1):(j-dry_days),][["event"]] = event_counter
            # set the rainy day to the same event
            d[j,][["event"]] = event_counter
            break # get back to wet peiod

          } else{

            # if the gap was too big --> its a new event
            # set all the days without rainfall and within n to no rainfall
            d[(j-1):(j-dry_days),][["event"]] = NA
            # set the rainy day to a new rainfall event
            event_counter = event_counter + 1
            d[j,][["event"]] = event_counter
            break # get back to wet period
          }
        }
      }

      # set i to where we stopped in the dry loop
      i = j + 1

    } else{

      # if we initially hit a rainy day, just count on
      d[i,][["event"]] = event_counter
      i = i + 1

    }
  }
  return(d)
}