如何匹配列中的重复值、检查条件并在 R 中输出最终结果?

How can I match duplicated values in a column, examine the condition and output a final result in R?

我有一本这样的字典table:

ID Position Region
a 1-50 D1a
a 80-100 D2a
a 250-300 D3a
b 50-100 D1b
b 150-180 D2b
c 1-20 D1c
c 50-80 D2c
c 100-200 D3c
c 250-300 D4c

还有这样的目标 table:

ID Position
a 28
a 85
a 320
b 55
b 100
c 18
c 45
c 180
c 270

逻辑是检查targettable中的数值-Position是否在字典table中的position范围内,输出给定ID的region值

我首先想到 R 包 hash 应该可以。但是后来我发现哈希键必须是唯一的,而在我的情况下不是。

如何先匹配每个ID,然后使用if between()或其他函数将我的目标映射到各自的区域?

这是您的预期结果吗?

library(data.table)

dt1 <- fread("
ID  Position    Region
a   1-50    D1a
a   80-100  D2a
a   250-300 D3a
b   50-100  D1b
b   150-180 D2b
c   1-20    D1c
c   50-80   D2c
c   100-200 D3c
c   250-300 D4c
")

dt2 <- fread("
ID  Position
a   28
a   85
a   320
b   55
b   100
c   18
c   45
c   180
c   270
")

#dt1[,c("Position_left","Position_right") := tstrsplit(Position,"-")]
#dt1[, dt2$Position %between% list(Position_left,Position_right)]
# [1]  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE

dt1[,.( ID,
        Position = dt2$Position,
        Region = fifelse(dt2$Position %between% tstrsplit(Position,"-"),Region,NA)
       )]

       ID Position Region
   <char>    <int> <char>
1:      a       28    D1a
2:      a       85    D2a
3:      a      320   <NA>
4:      b       55    D1b
5:      b      100   <NA>
6:      c       18    D1c
7:      c       45   <NA>
8:      c      180    D3c
9:      c      270    D4c

也许像这样用 tidy verse 方法 (tidyr::separate)

然而,这是一个快速的回答,我不确定是否如您所愿地理解了您的数据关系。

library(tidyverse)

one <- tibble::tribble(
  ~ID, ~Position, ~Region,
  "a",    "1-50",   "D1a",
  "a",  "80-100",   "D2a",
  "a", "250-300",   "D3a",
  "b",  "50-100",   "D1b",
  "b", "150-180",   "D2b",
  "c",    "1-20",   "D1c",
  "c",   "50-80",   "D2c",
  "c", "100-200",   "D3c",
  "c", "250-300",   "D4c"
  )

two <- tibble::tribble(
         ~ID, ~Position,
         "a",       28L,
         "a",       85L,
         "a",      320L,
         "b",       55L,
         "b",      100L,
         "c",       18L,
         "c",       45L,
         "c",      180L,
         "c",      270L
         )


one_ <- one %>% 
  tidyr::separate(Position, c('p_min', 'p_max'), sep = "-") %>% 
  mutate_at(vars(starts_with('p_')), as.integer)

two %>% 
  mutate(rn = row_number()) %>% 
  left_join(one_) %>% 
  mutate(in_between = (Position >= p_min & Position <= p_max)) %>% 
  filter(in_between) %>% 
  distinct(rn, .keep_all = TRUE)
Joining, by = "ID"
# A tibble: 7 × 7
  ID    Position    rn p_min p_max Region in_between
  <chr>    <int> <int> <int> <int> <chr>  <lgl>     
1 a           28     1     1    50 D1a    TRUE      
2 a           85     2    80   100 D2a    TRUE      
3 b           55     4    50   100 D1b    TRUE      
4 b          100     5    50   100 D1b    TRUE      
5 c           18     6     1    20 D1c    TRUE      
6 c          180     8   100   200 D3c    TRUE      
7 c          270     9   250   300 D4c    TRUE  

在将 Position 列拆分为两列后,您可以使用程序包 fuzzyjoin 执行范围连接。

使用来自@Guillaume 的数据 -

library(dplyr)
library(fuzzyjoin)
library(tidyr)

one %>%
  separate(Position, c('min', 'max'), sep = '-', convert = TRUE) %>%
  fuzzy_right_join(two, by = c('ID', 'min' = 'Position', 'max' = 'Position'), 
                  match_fun = c(`==`, `<=`, `>=`)) %>%
  select(ID = ID.y, Position, min, max, Region)

#  ID    Position   min   max Region
#  <chr>    <int> <int> <int> <chr> 
#1 a           28     1    50 D1a   
#2 a           85    80   100 D2a   
#3 a          320    NA    NA NA    
#4 b           55    50   100 D1b   
#5 b          100    50   100 D1b   
#6 c           18     1    20 D1c   
#7 c           45    NA    NA NA    
#8 c          180   100   200 D3c   
#9 c          270   250   300 D4c   

根据您的预期输出,您可以使用 fuzzy_(inner_join/left_join/full_join).