(更新)根据两列将索引列添加到data.frame

(Update) Add index column to data.frame based on two columns

示例data.frame:

df = read.table(text = 'colA colB
                2 7
                2 7
                2 7
                2 7
                1 7
                1 7
                1 7
                89 5
                89 5
                89 5
                88 5
                88 5
                70 5
                70 5
                70 5
                69 5
                69 5
                44 4
                44 4
                44 4
                43 4
                42 4
                42 4
                41 4
                41 4
                120 1
                100 1', header = TRUE)

我需要添加一个基于 colAcolB 的索引列,其中 colB 显示要分组的确切行数,但它可以重复。 colB 根据 colAcolA -1.

对行进行分组

预期输出:

 colA colB index_col
    2 7       1
    2 7       1
    2 7       1
    2 7       1
    1 7       1
    1 7       1
    1 7       1
    89 5      2
    89 5      2
    89 5      2
    88 5      2
    88 5      2
    70 5      3
    70 5      3
    70 5      3
    69 5      3
    69 5      3
    44 4      4
    44 4      4 
    44 4      4
    43 4      4
    42 4      5
    42 4      5
    41 4      5
    41 4      5
   120 1      6
   100 1      7

更新

我如何调整适用于上述 df 的代码以达到相同的目的,但要查看根据 colAcolA -1colB 分组的值colA -2?即(而不是考虑 3 天的 2 天)

new_df = read.table(text = 'colA colB
        3 10
        3 10
        3 10      
        2 10
        2 10
        2 10
        2 10
        1 10
        1 10
        1 10
        90 7
        90 7
        89 7
        89 7
        89 7
        88 7
        88 7
        71 7
        71 7
        70 7
        70 7
        70 7
        69 7
        69 7
        44 5
        44 5
        44 5
        43 5
        42 5
        41 5
        41 5
        41 5
        40 5
        40 5
       120 1
       100 1', header = TRUE)

预期输出:

     colA colB index_col
        3 10      1
        3 10      1
        3 10      1
        2 10      1
        2 10      1
        2 10      1
        2 10      1
        1 10      1
        1 10      1
        1 10      1
        90 7      2
        90 7      2
        89 7      2
        89 7      2
        89 7      2
        88 7      2
        88 7      2
        71 7      3
        71 7      3
        70 7      3
        70 7      3
        70 7      3
        69 7      3
        69 7      3
        44 5      4
        44 5      4
        44 5      4
        43 5      4
        42 5      4
        41 5      5
        41 5      5
        41 5      5
        40 5      5
        40 5      5
       120 1      6
       100 1      7

谢谢

我们可以使用rleid

library(data.table)
index_col <-setDT(df)[, if(colB[1L] < .N) ((seq_len(.N)-1) %/% colB[1L])+1  
       else as.numeric(colB), rleid(colB)][, rleid(V1)]
df[, index_col := index_col]
df
#    colA colB index_col
# 1:    2    7         1
# 2:    2    7         1
# 3:    2    7         1
# 4:    2    7         1
# 5:    1    7         1
# 6:    1    7         1
# 7:    1    7         1
# 8:   70    5         2
# 9:   70    5         2
#10:   70    5         2
#11:   69    5         2
#12:   69    5         2
#13:   89    5         3
#14:   89    5         3
#15:   89    5         3
#16:   88    5         3
#17:   88    5         3
#18:  120    1         4
#19:  100    1         5

或者单线

setDT(df)[, index_col := df[, ((seq_len(.N)-1) %/% colB[1L])+1, rleid(colB)][, as.integer(interaction(.SD, drop = TRUE, lex.order = TRUE))]]

更新

基于 OP post

中的新更新
setDT(new_df)[, index_col :=  cumsum(c(TRUE, abs(diff(colA))> 1))
          ][, colB := .N , index_col]
new_df
#    colA colB index_col
# 1:    3   10         1
# 2:    3   10         1
# 3:    3   10         1
# 4:    2   10         1
# 5:    2   10         1
# 6:    2   10         1
# 7:    2   10         1
# 8:    1   10         1
# 9:    1   10         1
#10:    1   10         1
#11:   71    7         2
#12:   71    7         2
#13:   70    7         2
#14:   70    7         2
#15:   70    7         2
#16:   69    7         2
#17:   69    7         2
#18:   90    7         3
#19:   90    7         3
#20:   89    7         3
#21:   89    7         3
#22:   89    7         3
#23:   88    7         3
#24:   88    7         3
#25:   44    2         4
#26:   43    2         4
#27:  120    1         5
#28:  100    1         6

基于 R 的一种方法:

df$idxcol <- cumsum(c(1,abs(diff(df$colA)) > 1) + c(0,diff(df$colB) != 0) > 0)

给出:

> df
   colA colB idxcol
1     2    7      1
2     2    7      1
3     2    7      1
4     2    7      1
5     1    7      1
6     1    7      1
7     1    7      1
8    70    5      2
9    70    5      2
10   70    5      2
11   69    5      2
12   69    5      2
13   89    5      3
14   89    5      3
15   89    5      3
16   88    5      3
17   88    5      3
18  120    1      4
19  100    1      5

在更新的示例数据上,您需要将方法调整为:

n <- 1
idx1 <- cumsum(c(1, diff(df$colA) < -n) + c(0, diff(df$colB) != 0) > 0)
idx2 <- ave(df$colA, cumsum(c(1, diff(df$colA) < -n)), FUN = function(x) c(0, cumsum(diff(x)) < -n ))
idx2[idx2==1 & c(0,diff(idx2))==0] <- 0

df$idxcol <- idx1 + cumsum(idx2)

给出:

> df
   colA colB idxcol
1     2    7      1
2     2    7      1
3     2    7      1
4     2    7      1
5     1    7      1
6     1    7      1
7     1    7      1
8    89    5      2
9    89    5      2
10   89    5      2
11   88    5      2
12   88    5      2
13   70    5      3
14   70    5      3
15   70    5      3
16   69    5      3
17   69    5      3
18   44    4      4
19   44    4      4
20   44    4      4
21   43    4      4
22   42    4      5
23   42    4      5
24   41    4      5
25   41    4      5
26  120    1      6
27  100    1      7

对于 new_df,只需将 n 更改为 2,您也将获得所需的输出。