R根据另一个数据集的间隔填充新列(查找)

R fill new column based on interval from another dataset (lookup)

假设我有这个数据集:

# Example data: three groups ("a", "b", "c") with six test ids each.
df1 <- data.frame(
  groupID = rep(c("a", "b", "c"), each = 6),
  testid = c(
    111, 222, 333, 444, 555, 666,
    777, 888, 999, 1010, 1111, 1212,
    1313, 1414, 1515, 1616, 1717, 1818
  )
)

df1
   groupID testid
1        a    111
2        a    222
3        a    333
4        a    444
5        a    555
6        a    666
7        b    777
8        b    888
9        b    999
10       b   1010
11       b   1111
12       b   1212
13       c   1313
14       c   1414
15       c   1515
16       c   1616
17       c   1717
18       c   1818

我有第二个数据集:

# Lookup data: the test ids in each group that carry a known `bd` value.
df2 <- data.frame(
  groupID = rep(c("a", "b", "c"), times = c(4, 3, 3)),
  testid = c(222, 333, 555, 666, 777, 999, 1010, 1313, 1616, 1818),
  bd = c(1, 1, 2, 2, 0, 1, 1, 1, 1, 2)
)
df2
   groupID testid bd
1        a    222  1
2        a    333  1
3        a    555  2
4        a    666  2
5        b    777  0
6        b    999  1
7        b   1010  1
8        c   1313  1
9        c   1616  1
10       c   1818  2

我想利用第二个数据集中的区间来填充第一个数据集中的新变量:在每个组内,对 `bd` 中出现两次的值,自动填充其两次出现之间(含两端)的所有行,其余位置填 `NA`。

期望的输出:

   groupID testid new_bd
1        a    111     NA
2        a    222      1
3        a    333      1
4        a    444     NA
5        a    555      2
6        a    666      2
7        b    777      0
8        b    888     NA
9        b    999      1
10       b   1010      1
11       b   1111     NA
12       b   1212     NA
13       c   1313      1
14       c   1414      1
15       c   1515      1
16       c   1616      1
17       c   1717     NA
18       c   1818      2

理想情况下想要 dplyr/tidyr 解决方案,但对任何方法都持开放态度。

类似的问题(但其中的解法会填充所有值):R: Filling timeseries values but only within last 12 months

我会先把 df2 整理成每个区间的起始(start)和结束(end)范围,然后就可以用循环或其他任何方式来处理。

# Collapse df2 to one row per (groupID, bd) pair holding the lowest and
# highest testid seen for that pair, i.e. the interval that receives that bd.
grps <- df2 %>%
  group_by(groupID, bd) %>%
  summarize(
    start = min(testid),
    end = max(testid)
  )

grps
groupID    bd start   end
 <fct>   <dbl> <dbl> <dbl>
1 a           1   222   333
2 a           2   555   666
3 b           0   777   777
4 b           1   999  1010
5 c           1  1313  1616
6 c           2  1818  1818

# Fill df1$bd from the [start, end] intervals in grps.
# Rows that fall in no interval of their own group keep NA.
df1$bd <- NA
for (i in seq_len(nrow(grps))) {
  # An interval only applies to rows of the same groupID; without this check an
  # interval from one group could overwrite rows of another group whose testid
  # happens to fall in the same range. as.character() keeps the comparison
  # valid when groupID is stored as a factor.
  in_interval <- as.character(df1$groupID) == as.character(grps$groupID[i]) &
    df1$testid >= grps$start[i] &
    df1$testid <= grps$end[i]
  # which() drops NA comparisons so they never select a row.
  df1$bd[which(in_interval)] <- grps$bd[i]
}

df1
     groupID testid bd
1        a    111 NA
2        a    222  1
3        a    333  1
4        a    444 NA
5        a    555  2
6        a    666  2
7        b    777  0
8        b    888 NA
9        b    999  1
10       b   1010  1
11       b   1111 NA
12       b   1212 NA
13       c   1313  1
14       c   1414  1
15       c   1515  1
16       c   1616  1
17       c   1717 NA
18       c   1818  2

也许我忽略了更简单的方法,但下面是我用 dplyr 想出的做法:先对 `df1` 和 `df2` 做 `left_join`,并用 `fill` 向下填充 `bd` 列;然后按 `groupID` 和 `bd` 做 `group_by`,求出每个组中非 NA 值的第一个和最后一个索引,再把位置小于最小索引或大于最大索引的值替换为 `NA`。

library(dplyr)

# Look up bd for every df1 row, then extend each bd value over the rows that
# lie between its first and last exact match within the same groupID.
left_join(df1, df2, by = c("groupID", "testid")) %>%
  # bd1 keeps the un-filled lookup result so exact matches stay identifiable.
  mutate(bd1 = bd) %>%
  # Fill downwards within each groupID only: an ungrouped fill would carry the
  # last bd of one group into the leading unmatched rows of the next group,
  # and those rows would never be reset to NA below.
  group_by(groupID) %>%
  tidyr::fill(bd) %>%
  group_by(groupID, bd) %>%
  mutate(
    # Position of the first / last exact match inside this (groupID, bd) run;
    # runs that are entirely NA span the whole group so nothing is replaced.
    minRow = if (all(is.na(bd))) 1 else first(which(!is.na(bd1))),
    maxRow = if (all(is.na(bd))) n() else last(which(!is.na(bd1))),
    # Filled-in rows outside [minRow, maxRow] are not inside an interval.
    new_bd = replace(
      bd,
      is.na(bd1) & (row_number() < minRow | row_number() > maxRow),
      NA
    )
  ) %>%
  ungroup() %>%
  select(names(df1), new_bd)


#   groupID testid new_bd
#   <fct>    <dbl>  <dbl>
# 1 a          111     NA
# 2 a          222      1
# 3 a          333      1
# 4  a          444     NA
# 5 a          555      2
# 6 a          666      2
# 7 b          777      0
# 8 b          888     NA
# 9 b          999      1
#10 b         1010      1
#11 b         1111     NA
#12 b         1212     NA
#13 c         1313      1
#14 c         1414      1
#15 c         1515      1
#16 c         1616      1
#17 c         1717     NA
#18 c         1818      2

下面是一个适用于上面测试数据示例的解决方案,但它无法在我的大型数据集上运行——我遇到了 `Error: cannot allocate vector of size 45.5 Gb`。我认为这与此处(here)所述的问题有关:"The same size explosion can happen if you have lots of the same level in both with otherwise different rows"。在我的实际数据集中,我处理的是日期变量,我认为这不会影响问题,但也可能会。我不确定使用 fuzzyjoin 有没有可行的变通办法,因为它在数据的子集上是可以运行的。

    library(tidyverse)
    library(fuzzyjoin)
    library(tidylog)

    # Build one [start, end] interval per (groupID, bd) pair found in df2.
    grps <- df2 %>%
      group_by(groupID, bd) %>%
      summarize(start = min(testid), end = max(testid))
    grps

    # Keep every df1 row and attach the interval (if any) of the same groupID
    # whose range contains its testid.
    df1 %>%
      fuzzy_left_join(
        grps,
        by = c(groupID = "groupID", testid = "start", testid = "end"),
        match_fun = list(`==`, `>=`, `<=`)
      ) %>%
      select(groupID = groupID.x, testid, bd, start, end)
    select: dropped 2 variables (groupID.x, groupID.y)

       groupID testid bd start  end
    1        a    111 NA    NA   NA
    2        a    222  1   222  333
    3        a    333  1   222  333
    4        a    444 NA    NA   NA
    5        a    555  2   555  666
    6        a    666  2   555  666
    7        b    777  0   777  777
    8        b    888 NA    NA   NA
    9        b    999  1   999 1010
    10       b   1010  1   999 1010
    11       b   1111 NA    NA   NA
    12       b   1212 NA    NA   NA
    13       c   1313  1  1313 1616
    14       c   1414  1  1313 1616
    15       c   1515  1  1313 1616
    16       c   1616  1  1313 1616
    17       c   1717 NA    NA   NA
    18       c   1818  2  1818 1818

data.table解法:

library(data.table) 
> # Convert both tables to data.table by reference, then run a non-equi join
> # that keeps every df1 row and attaches the grps interval containing testid.
> setDT(grps)
> setDT(df1)
> new <- grps[df1,
+             .(groupID, testid, x.start, x.end, x.bd),
+             on = .(groupID, start <= testid, end >= testid)]
> new
    groupID testid x.start x.end x.bd
 1:       a    111      NA    NA   NA
 2:       a    222     222   333    1
 3:       a    333     222   333    1
 4:       a    444      NA    NA   NA
 5:       a    555     555   666    2
 6:       a    666     555   666    2
 7:       b    777     777   777    0
 8:       b    888      NA    NA   NA
 9:       b    999     999  1010    1
10:       b   1010     999  1010    1
11:       b   1111      NA    NA   NA
12:       b   1212      NA    NA   NA
13:       c   1313    1313  1616    1
14:       c   1414    1313  1616    1
15:       c   1515    1313  1616    1
16:       c   1616    1313  1616    1
17:       c   1717      NA    NA   NA
18:       c   1818    1818  1818    2

我认为这可以用 fuzzyjoin 中的 internal_join(可能是指 `interval_join`)来完成,但我不确定:https://github.com/dgrtwo/fuzzyjoin/issues/50