加速匹配固定字符串 %in% / %like% 与布尔输出
Speedup matching fixed strings %in% / %like% with boolean output
我想要一个逻辑向量来指示第二个列表中是否存在匹配项。如果您需要完全匹配,可以使用 %in%
运算符,但我对任何匹配都感兴趣,所以我创建了 %like%
运算符:
table <- rownames(mtcars)
table
#> [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
#> [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
#> [7] "Duster 360" "Merc 240D" "Merc 230"
#> [10] "Merc 280" "Merc 280C" "Merc 450SE"
#> [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
#> [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
#> [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
#> [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
#> [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
#> [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
#> [31] "Maserati Bora" "Volvo 142E"
x <- c('Porsche', 'Porsche 914-2', 'Porsche 911', 'Volvo')
x %in% table
#> [1] FALSE TRUE FALSE FALSE
"%like%" <- function(x, table) sapply(x, function(x)
sum(grepl(pattern = x, x = table))>0, USE.NAMES = FALSE)
x %like% table
#> [1] TRUE TRUE FALSE TRUE
不幸的是,%like%
运算符非常慢:
library(microbenchmark)
x1 <- c('Porsche', 'Porsche 914-2', 'Porsche 911', 'Volvo')
x2 <- rep(x1, 10)
x3 <- rep(x1, 100)
table <- rownames(mtcars)
"%like%" <- function(x, table) sapply(x, function(x)
sum(grepl(pattern = x, x = table))>0, USE.NAMES = FALSE)
microbenchmark(x1 %in% table, x1 %like% table, times = 1000)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> x1 %in% table 1.549 1.8635 2.248905 2.2545 2.5000 7.331 1000
#> x1 %like% table 69.697 71.2110 73.235948 72.6555 74.0835 149.087 1000
microbenchmark(x2 %in% table, x2 %like% table, times = 1000)
#> Unit: microseconds
#> expr min lq mean median uq max
#> x2 %in% table 2.327 2.8795 3.330329 3.3055 3.6515 7.539
#> x2 %like% table 573.005 581.0885 590.760082 584.2270 588.2580 1624.687
#> neval
#> 1000
#> 1000
microbenchmark(x3 %in% table, x3 %like% table, times = 1000)
#> Unit: microseconds
#> expr min lq mean median uq max
#> x3 %in% table 9.195 9.950 11.79078 10.923 12.5675 36.341
#> x3 %like% table 5612.931 5707.168 5973.83801 5737.892 5823.7875 11868.495
#> neval
#> 1000
#> 1000
如何加速 %like%
运算符?
如果您不介意完全匹配,可以在 grepl
中使用 fixed = T
来加快速度
"%birger%" <- function(x, table) sapply(x, function(x)
sum(grepl(pattern = x, x = table))>0, USE.NAMES = FALSE)
'%birger.fixed%' <- function(x, table) sapply(x, function(x)
any(grepl(pattern = x, x = table, fixed = T)), USE.NAMES = FALSE)
all.equal(x %birger.fixed% table, x %birger% table)
# [1] TRUE
microbenchmark(x %birger.fixed% table, x %birger% table, times = 1000, unit = 'relative')
# Unit: relative
# expr min lq mean median uq max neval
# x %birger.fixed% table 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1000
# x %birger% table 2.059546 2.011009 1.903589 1.913446 1.857798 1.336424 1000
我想要一个逻辑向量来指示第二个列表中是否存在匹配项。如果您需要完全匹配,可以使用 %in%
运算符,但我对任何匹配都感兴趣,所以我创建了 %like%
运算符:
table <- rownames(mtcars)
table
#> [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
#> [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
#> [7] "Duster 360" "Merc 240D" "Merc 230"
#> [10] "Merc 280" "Merc 280C" "Merc 450SE"
#> [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
#> [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
#> [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
#> [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
#> [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
#> [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
#> [31] "Maserati Bora" "Volvo 142E"
x <- c('Porsche', 'Porsche 914-2', 'Porsche 911', 'Volvo')
x %in% table
#> [1] FALSE TRUE FALSE FALSE
"%like%" <- function(x, table) sapply(x, function(x)
sum(grepl(pattern = x, x = table))>0, USE.NAMES = FALSE)
x %like% table
#> [1] TRUE TRUE FALSE TRUE
不幸的是,%like%
运算符非常慢:
library(microbenchmark)
x1 <- c('Porsche', 'Porsche 914-2', 'Porsche 911', 'Volvo')
x2 <- rep(x1, 10)
x3 <- rep(x1, 100)
table <- rownames(mtcars)
"%like%" <- function(x, table) sapply(x, function(x)
sum(grepl(pattern = x, x = table))>0, USE.NAMES = FALSE)
microbenchmark(x1 %in% table, x1 %like% table, times = 1000)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> x1 %in% table 1.549 1.8635 2.248905 2.2545 2.5000 7.331 1000
#> x1 %like% table 69.697 71.2110 73.235948 72.6555 74.0835 149.087 1000
microbenchmark(x2 %in% table, x2 %like% table, times = 1000)
#> Unit: microseconds
#> expr min lq mean median uq max
#> x2 %in% table 2.327 2.8795 3.330329 3.3055 3.6515 7.539
#> x2 %like% table 573.005 581.0885 590.760082 584.2270 588.2580 1624.687
#> neval
#> 1000
#> 1000
microbenchmark(x3 %in% table, x3 %like% table, times = 1000)
#> Unit: microseconds
#> expr min lq mean median uq max
#> x3 %in% table 9.195 9.950 11.79078 10.923 12.5675 36.341
#> x3 %like% table 5612.931 5707.168 5973.83801 5737.892 5823.7875 11868.495
#> neval
#> 1000
#> 1000
如何加速 %like%
运算符?
如果您不介意完全匹配,可以在 grepl
中使用 fixed = T
来加快速度
"%birger%" <- function(x, table) sapply(x, function(x)
sum(grepl(pattern = x, x = table))>0, USE.NAMES = FALSE)
'%birger.fixed%' <- function(x, table) sapply(x, function(x)
any(grepl(pattern = x, x = table, fixed = T)), USE.NAMES = FALSE)
all.equal(x %birger.fixed% table, x %birger% table)
# [1] TRUE
microbenchmark(x %birger.fixed% table, x %birger% table, times = 1000, unit = 'relative')
# Unit: relative
# expr min lq mean median uq max neval
# x %birger.fixed% table 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1000
# x %birger% table 2.059546 2.011009 1.903589 1.913446 1.857798 1.336424 1000