根据向量在数据框中查找部分事件

Question

我有一个数据框 a 和一个向量 b（来自另一个数据框）。现在我想在 a 中找到向量 b 中各个值的所有出现。
然而，不幸的是，向量 b 中的值有时会缺少一个前导字符。

a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")

> a
#  GSN_IDENTITY_CODE    PIG_ID SEX_CODE
#1          01234567 129287133        Z
#2          65461341 120561144        Z
#3            NH1497 119265685        Z
#4            ZH0080 121883198        Z
#5            TP5146 109371743        B
#6            TP5146 109371743        B

b <- c("65461341", "1234567", "ZH0080", "TP5146")

我的预期输出是这样的：

a
#  GSN_IDENTITY_CODE    PIG_ID SEX_CODE
#1          01234567 129287133        Z
#2          65461341 120561144        Z
#4            ZH0080 121883198        Z
#5            TP5146 109371743        B

当第一次删除重复项时，它解决了一个问题，但是我仍然需要一种方法来 select 所有包含来自向量 b 的值的行，而我需要更多行：

a <- a[!duplicated(a$GSN_IDENTITY_CODE),]

不幸的是，我不能使用 %in%，因为它会保留重复的行，并且会漏掉第一行（它不接受正则表达式）：

> a[a$GSN_IDENTITY_CODE %in% b,]
#  GSN_IDENTITY_CODE    PIG_ID SEX_CODE
#2          65461341 120561144        Z
#4            ZH0080 121883198        Z
#5            TP5146 109371743        B
#6            TP5146 109371743        B

使用 data.table 的 %like% 仅适用于向量 b 中的第一个字符串：

library(data.table)
> setDT(a)
> a[a$GSN_IDENTITY_CODE %like% b,]
#   GSN_IDENTITY_CODE    PIG_ID SEX_CODE
#1:          65461341 120561144        Z
Warning message:
In grepl(pattern, vector) :
  argument 'pattern' has length > 1 and only the first element will be used

R中有支持我这里需求的函数吗？

@Frank 的尝试产生了以下错误：

a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")

b <- c("65461341", "1234567", "ZH0080", "TP5146")

> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in `[.data.frame`(a, .(b), on = "GSN_IDENTITY_CODE", nomatch = FALSE,  : 
  unused arguments (on = "GSN_IDENTITY_CODE", nomatch = FALSE, mult = "first")
> setDT(a)
> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch,  : 
  x.'GSN_IDENTITY_CODE' is a character column being joined to i.'NA' which is type 'NULL'. Character columns must join to factor or character columns.

Answer 1

如果额外字符可能出现在字符串中的任何位置，您可以对接近的匹配执行类似的操作：

library(stringdist)
library(purrr)

# For every code in `a`, compute the Levenshtein distance to each entry of `b`
# and keep only the smallest one, i.e. the distance to the closest candidate.
a$closest_match <- map_dbl(
  a$GSN_IDENTITY_CODE,
  function(code) min(stringdist(code, b, method = "lv"))
)

# Rows whose closest candidate is at most one edit away (exact match, or one
# inserted/deleted/substituted character).
a[a$closest_match < 2, ]

如果额外的字符总是在开头，我会这样做：

library(stringr)

# Remove a single leading digit so that a code such as "01234567" can be
# compared against an entry of `b` that lost its first character ("1234567").
# The backslash has to be doubled: "\d" is not a recognised escape inside an
# R string literal and would make the script fail to parse.
a$stripped_code <- str_replace(a$GSN_IDENTITY_CODE, "^\\d", "")

# Keep a row when either the original code or its stripped form occurs in `b`.
a$keep <- a$GSN_IDENTITY_CODE %in% b | a$stripped_code %in% b
a[a$keep, ]

根据向量在数据框中查找部分事件

Find partial occurences in data frame based on a vector

regex

r

dataframe

sql-like

data.table