捕获正则表达式的更快方法

Question

我想使用正则表达式来捕获子字符串 - 我已经有了一个可行的解决方案，但我想知道是否有更快的解决方案。我正在对包含大约 400.000 个条目的向量应用 applyCaptureRegex。

 # Sample input: a one-column data frame holding four bracketed strings to parse.
 exampleData <- as.data.frame(c("[hg19:21:34809787-34809808:+]","[hg19:11:105851118-105851139:+]","[hg19:17:7482245-7482266:+]","[hg19:6:19839915-19839936:+]"))

# Extract every match of `captRegEx` found in the first element of `str`,
# and for each match return the full match followed by its capture groups.
#
# captRegEx: regular expression containing capture groups.
# str:       character vector; only matches from its first element are used.
# Returns the result of sapply(): for the usual case a list named by the
# matched substrings, each entry a character vector (full match, groups...).
captureRegex <- function(captRegEx, str) {
  # All substrings of the first element that match the whole pattern.
  hits <- regmatches(str, gregexpr(captRegEx, str))[[1]]

  # Re-run the pattern on a single matched substring to pull out its groups.
  extract_groups <- function(m) {
    regmatches(m, regexec(captRegEx, m))
  }

  sapply(hits, extract_groups)
}

# Apply captureRegex() to the first column of every row of `mir` and arrange
# the extracted pieces into a 5-column character matrix (filled row by row).
#
# mir: data frame (or matrix) whose first column holds the strings to parse.
# r:   regular expression passed on to captureRegex().
applyCaptureRegex <- function(mir, r) {
  # One captureRegex() result per row; apply() hands each row over as a vector.
  per_row <- apply(mir, 1, function(row) captureRegex(r, row[1]))

  # Flatten everything, then rebuild as rows of five fields each.
  flat <- unlist(per_row)
  matrix(flat, ncol = 5, byrow = TRUE)
}

用法和结果：

> captureRegex("\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]","[hg19:12:125627828-125627847:-]")
$`[hg19:12:125627828-125627847:-]`
[1] "[hg19:12:125627828-125627847:-]" "12" "125627828" "125627847" "-"   

> applyCaptureRegex(exampleData,"\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]")
     [,1]                              [,2] [,3]        [,4]        [,5]
[1,] "[hg19:21:34809787-34809808:+]"   "21" "34809787"  "34809808"  "+" 
[2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+" 
[3,] "[hg19:17:7482245-7482266:+]"     "17" "7482245"   "7482266"   "+" 
[4,] "[hg19:6:19839915-19839936:+]"    "6"  "19839915"  "19839936"  "+"

谢谢！

Answer 1

为什么要重新发明轮子？有多个现成的包可供选择，它们的函数会返回一个字符矩阵，模式中的每个捕获组各占一列。

stri_match_all_regex — stringi

# Sample input strings.
x <- c('[hg19:21:34809787-34809808:+]', '[hg19:11:105851118-105851139:+]', '[hg19:17:7482245-7482266:+]', '[hg19:6:19839915-19839936:+]')
# Backslashes are doubled because R resolves string escapes before the regex
# engine sees the pattern; a single "\[" or "\d" is an unrecognized escape and
# fails to parse. stri_match_all_regex() yields one matrix per input string,
# so rbind stacks them into a single matrix.
do.call(rbind, stri_match_all_regex(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'))
#      [,1]                              [,2] [,3]        [,4]        [,5]
# [1,] "[hg19:21:34809787-34809808:+]"   "21" "34809787"  "34809808"  "+" 
# [2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+" 
# [3,] "[hg19:17:7482245-7482266:+]"     "17" "7482245"   "7482266"   "+" 
# [4,] "[hg19:6:19839915-19839936:+]"    "6"  "19839915"  "19839936"  "+"

str_match — stringr

# Backslashes are doubled: R resolves string escapes before the regex is
# compiled, and a single "\[" or "\d" is an unrecognized escape.
str_match(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')

strapplyc — gsubfn

# The whole pattern is wrapped in an extra group so the full match is returned
# alongside the inner groups. Backslashes are doubled because a single "\[" or
# "\d" is an unrecognized escape in an R string literal.
strapplyc(x, "(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])", simplify = rbind)

以下是所有组合解决方案的基准比较。

# Benchmark input: the four sample strings repeated 1000 times (4000 elements).
x <- rep(
  c(
    "[hg19:21:34809787-34809808:+]",
    "[hg19:11:105851118-105851139:+]",
    "[hg19:17:7482245-7482266:+]",
    "[hg19:6:19839915-19839936:+]"
  ),
  times = 1000
)

# Base-R variant used in the benchmark: match each string individually with
# regexec()/regmatches() and stack the per-string results into a matrix.
#
# mir: vector of strings to parse.
# r:   regular expression with capture groups.
# Returns a character matrix with one row per matched string: the full match
# followed by the capture groups.
applyCaptureRegex <- function(mir, r) {
  # Full match plus capture groups for one string.
  match_one <- function(s) {
    hit <- regexec(r, s)
    regmatches(s, hit)[[1]]
  }

  rows <- lapply(mir, match_one)
  do.call(rbind, rows)
}

# One thin wrapper per approach so microbenchmark() can time them side by side.
# Every pattern uses doubled backslashes: R resolves string escapes before the
# regex engine sees the text, and a single "\[" or "\d" fails to parse.
# gsubfn wraps the pattern in an extra group so the full match is returned too.
gsubfn <- function(x1) strapplyc(x1, '(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])', simplify = rbind)
regmtch <- function(x1) applyCaptureRegex(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
stringr <- function(x1) str_match(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
stringi <- function(x1) do.call(rbind, stri_match_all_regex(x1, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'))

# Attach every package the wrappers rely on. library() stops with an error if
# a package is missing, whereas require() only returns FALSE and lets the
# script fail later with a less helpful message.
library(gsubfn)
library(stringr)
library(stringi)
library(microbenchmark)
# Time the four approaches on the same input.
microbenchmark(gsubfn(x), regmtch(x), stringr(x), stringi(x))

结果

Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
  gsubfn(x) 372.27072 382.82179 391.21837 388.32396 396.27361 449.03091   100
 regmtch(x) 394.03164 409.87523 419.42936 417.76770 427.08208 456.92460   100
 stringr(x)  65.81644  70.28327  76.02298  75.43162  78.92567 116.18026   100
 stringi(x)  15.88171  16.53047  17.52434  16.96127  17.76007  23.94449   100

捕获正则表达式的更快方法

Faster way to capture regex

regex

performance

r

apply