根据向量在数据框中查找部分匹配项
Find partial occurrences in data frame based on a vector
我有一个数据框 a 和一个向量 b(来自另一个数据框)。现在我想在 a 中找到向量 b 中各个值的所有出现。
然而不幸的是,向量 b 中的值有时会缺少一个前导字符。
a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")
> a
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1 01234567 129287133 Z
#2 65461341 120561144 Z
#3 NH1497 119265685 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
#6 TP5146 109371743 B
b <- c("65461341", "1234567", "ZH0080", "TP5146")
我的预期输出是这样的:
a
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1 01234567 129287133 Z
#2 65461341 120561144 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
先删除重复项可以解决其中一个问题,但我仍然需要一种方法来选出所有包含向量 b 中的值的行,而且这样还要多写一行代码:
a <- a[!duplicated(a$GSN_IDENTITY_CODE),]
不幸的是,我不能使用 %in%,因为它会保留重复的行,并且由于不支持正则表达式而漏掉第一行:
> a[a$GSN_IDENTITY_CODE %in% b,]
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#2 65461341 120561144 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
#6 TP5146 109371743 B
使用 data.table
的 %like%
仅适用于向量 b
中的第一个字符串
library(data.table)
> setDT(a)
> a[a$GSN_IDENTITY_CODE %like% b,]
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1: 65461341 120561144 Z
Warning message:
In grepl(pattern, vector) :
argument 'pattern' has length > 1 and only the first element will be used
R中有支持我这里需求的函数吗?
@Frank 的尝试产生了以下错误:
a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")
b <- c("65461341", "1234567", "ZH0080", "TP5146")
> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in `[.data.frame`(a, .(b), on = "GSN_IDENTITY_CODE", nomatch = FALSE, :
unused arguments (on = "GSN_IDENTITY_CODE", nomatch = FALSE, mult = "first")
> setDT(a)
> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, :
x.'GSN_IDENTITY_CODE' is a character column being joined to i.'NA' which is type 'NULL'. Character columns must join to factor or character columns.
如果额外字符可能出现在字符串中的任何位置,您可以对接近的匹配执行类似的操作:
library(stringdist)
library(purrr)
# For every code in `a`, compute the Levenshtein ("lv") distance to each
# entry of `b` and record the smallest distance found.
a$closest_match <- map_dbl(
  a$GSN_IDENTITY_CODE,
  function(code) min(stringdist(code, b, method = "lv"))
)
# Keep the rows whose nearest entry in `b` is fewer than two edits away.
a[a$closest_match < 2, ]
如果额外的字符总是在开头,我会这样做:
library(stringr)
# Strip a single leading digit so that codes such as "01234567" can match
# entries of `b` that lost their first character. The backslash has to be
# doubled: "^\d" is not a valid R string literal and fails to parse.
a$stripped_code <- str_replace(a$GSN_IDENTITY_CODE, "^\\d", "")
# Keep a row when either the full code or the stripped code occurs in `b`.
a$keep <- a$GSN_IDENTITY_CODE %in% b | a$stripped_code %in% b
a[a$keep, ]
我有一个数据框 a 和一个向量 b(来自另一个数据框)。现在我想在 a 中找到向量 b 中各个值的所有出现。
然而不幸的是,向量 b 中的值有时会缺少一个前导字符。
a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")
> a
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1 01234567 129287133 Z
#2 65461341 120561144 Z
#3 NH1497 119265685 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
#6 TP5146 109371743 B
b <- c("65461341", "1234567", "ZH0080", "TP5146")
我的预期输出是这样的:
a
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1 01234567 129287133 Z
#2 65461341 120561144 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
先删除重复项可以解决其中一个问题,但我仍然需要一种方法来选出所有包含向量 b 中的值的行,而且这样还要多写一行代码:
a <- a[!duplicated(a$GSN_IDENTITY_CODE),]
不幸的是,我不能使用 %in%,因为它会保留重复的行,并且由于不支持正则表达式而漏掉第一行:
> a[a$GSN_IDENTITY_CODE %in% b,]
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#2 65461341 120561144 Z
#4 ZH0080 121883198 Z
#5 TP5146 109371743 B
#6 TP5146 109371743 B
使用 data.table 的 %like% 仅适用于向量 b 中的第一个字符串:
library(data.table)
> setDT(a)
> a[a$GSN_IDENTITY_CODE %like% b,]
# GSN_IDENTITY_CODE PIG_ID SEX_CODE
#1: 65461341 120561144 Z
Warning message:
In grepl(pattern, vector) :
argument 'pattern' has length > 1 and only the first element will be used
R中有支持我这里需求的函数吗?
@Frank 的尝试产生了以下错误:
a <- structure(list(GSN_IDENTITY_CODE = c("01234567", "65461341", "NH1497", "ZH0080", "TP5146", "TP5146"), PIG_ID = c("129287133", "120561144", "119265685", "121883198", "109371743", "109371743" ), SEX_CODE = c("Z", "Z", "Z", "Z", "B", "B")), .Names = c("GSN_IDENTITY_CODE", "PIG_ID", "SEX_CODE"), row.names = c(NA, 6L), class = "data.frame")
b <- c("65461341", "1234567", "ZH0080", "TP5146")
> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in `[.data.frame`(a, .(b), on = "GSN_IDENTITY_CODE", nomatch = FALSE, :
unused arguments (on = "GSN_IDENTITY_CODE", nomatch = FALSE, mult = "first")
> setDT(a)
> a[.(b), on="GSN_IDENTITY_CODE", nomatch=FALSE, mult="first"]
Error in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch, :
x.'GSN_IDENTITY_CODE' is a character column being joined to i.'NA' which is type 'NULL'. Character columns must join to factor or character columns.
如果额外字符可能出现在字符串中的任何位置,您可以对接近的匹配执行类似的操作:
library(stringdist)
library(purrr)
# For every code in `a`, compute the Levenshtein ("lv") distance to each
# entry of `b` and record the smallest distance found.
a$closest_match <- map_dbl(
  a$GSN_IDENTITY_CODE,
  function(code) min(stringdist(code, b, method = "lv"))
)
# Keep the rows whose nearest entry in `b` is fewer than two edits away.
a[a$closest_match < 2, ]
如果额外的字符总是在开头,我会这样做:
library(stringr)
# Strip a single leading digit so that codes such as "01234567" can match
# entries of `b` that lost their first character. The backslash has to be
# doubled: "^\d" is not a valid R string literal and fails to parse.
a$stripped_code <- str_replace(a$GSN_IDENTITY_CODE, "^\\d", "")
# Keep a row when either the full code or the stripped code occurs in `b`.
a$keep <- a$GSN_IDENTITY_CODE %in% b | a$stripped_code %in% b
a[a$keep, ]