在 R 中从字符串中查找姓氏的最快方法

Fastest Way To Find Last Names From String in R

我正在尝试从 R 中各种格式的部分姓名字符串中识别可能的姓氏。识别从姓氏数据集到给定姓名字符串的最长字符串匹配的最快方法是什么(我是使用 wru surnames2010 数据集)?

我需要最长的可能性,而不是任何可能性。 IE。在下面的示例中,第一个字符串“scottcampbell”包含可能的姓氏“scott”和“campbell”。我只想 return 最长的可能匹配,在这种情况下只有“坎贝尔”。

重现示例数据:

library(wru)
data("surnames2010")
#filter out names under 4 characters
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")

想要想象的函数+结果:

foo_longest_matches(testvec)
#Desired imagined result:
[1] "campbell" "baker" "smith" "watkins" "burns" "terri" "rodriguez" "neal")

您可以使用 adist。请注意,您正在进行超过 100 万次比较以获得最长的比较。我希望你使用不同的方法。到目前为止我想到的最好的是

a <- adist(toupper(testvec), surnames2010$surname, counts = TRUE)
b <- attr(a, "trafos")
d <- array(grepl("S|I", b) + nchar(gsub("(.)\1++", "1",b, perl=TRUE)), dim(a)) * 10 + a
as.character(surnames2010$surname[max.col(-d)])
[1] "CAMPBELL"  "BAKER"     "SMITH"     "WATKINS"   "BURNS"     "TERRI"     "RODRIGUEZ" "NEAL" 

基准:

longest <- function(testvec,namevec){
  a <- adist(testvec, namevec, counts = TRUE)
  b <- attr(a, "trafos")
  d <- array(grepl("S|I", b) + nchar(gsub("(.)\1++", "1",b, perl=TRUE)), dim(a)) * 10 + a
  as.character(namevec[max.col(-d)])
}

编辑:能够获得更快的方法(不一定是最快的)

longest2 <- function(testvec,namevec){
  a <- stack(sapply(namevec,grep,testvec,value = TRUE,simplify = FALSE))
  tapply(as.character(a[, 2]), a[, 1], function(x) x[which.max(nchar(x))])[testvec]
}


microbenchmark::microbenchmark(longest(testvec,lnames$surname),longest2(testvec,lnames$surname),foo_longest_matches(testvec),times = 5)
Unit: seconds
                              expr       min        lq      mean    median        uq       max neval
  longest(testvec, lnames$surname)  3.316550  3.984128  5.308339  6.265192  6.396348  6.579477     5
 longest2(testvec, lnames$surname)  1.817059  1.917883  2.835354  3.350068  3.538278  3.553481     5
      foo_longest_matches(testvec) 10.093179 10.325489 11.610619 10.756714 10.889326 15.988384     5

不确定最快的速度,但这里有一个测试方法:

library(wru)
data("surnames2010")
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")

lnames$surname <- tolower(lnames$surname)
testvec <- tolower(testvec)

foo_longest_matches <- function(string_vector) {
  outdf <- c()
  for (name in string_vector) {
    print(name)
    ting <- lnames[sapply(lnames$surname, function(x) grepl(x, name)),]
    # you only care about the longest, remove the next line to get all matches
    ting <- ting[which.max(nchar(ting$surname)),]
    outdf <- rbind(outdf, ting)
  }
  return(outdf)
}

get_matches <- foo_longest_matches(testvec)
get_matches
#          surname  p_whi  p_bla      p_his      p_asi      p_oth
# 47      campbell 0.7366 0.2047 0.02490000 0.00530000 0.02840000
# 44         baker 0.7983 0.1444 0.02280000 0.00560000 0.02890000
# 1          smith 0.7090 0.2311 0.02400000 0.00500000 0.03080000
# 240      watkins 0.6203 0.3227 0.02090000 0.00420000 0.03200000
# 155        burns 0.8026 0.1406 0.02480000 0.00590000 0.02610000
# 110133     terri 0.7453 0.1801 0.01243333 0.01243333 0.04973333
# 9      rodriguez 0.0475 0.0054 0.93770000 0.00570000 0.00360000
# 337         neal 0.6210 0.3184 0.02160000 0.00600000 0.03290000