R - 是否可以优化或简化对 grepl() 的多次调用?
R - Is it possible to optimize or streamline multiple calls to grepl()?
使用 NOAA Severe Weather data,其中包含描述天气事件类型的变量 EVTYPE
(事件类型)。这些值包括许多同义词,我想在几个更广泛的名称下收集这些同义词。例如,有 TORNADO
但也有 ROTATING WALL CLOUD
、FUNNEL CLOUD
和 WHIRLWIND
,它们在某种意义上描述了 相对 相似的事件。在不深入了解气象学的微妙之处的情况下,我想将这些几乎同义的值组合在一个值名称下。
假设我已将数据集加载到数据框中 noaa_clean
并且我应用了这个:
# Question's original approach: build a logical mask marking every EVTYPE value
# that mentions a tornado-like event, then label those rows "TORNADO".
# sapply() invokes the anonymous function once per element, so each of the six
# grepl() calls below runs on a single string at a time.
tornado <- sapply(as.character(noaa_clean$EVTYPE),
function(x){grepl("^.*TORNAD.*$", x) |
grepl("^.*SPOUT.*$", x) |
grepl("^.*WHIRL.*$", x) |
grepl("^.*FUNNEL.*$", x) |
grepl("^.*ROTATING WALL CLOUD.*$", x) |
grepl("^.*DUST DEVIL.*$", x)})
# Write the category into EVCAT for the matching rows, then remove the mask.
noaa_clean[tornado, "EVCAT"] <- "TORNADO"; rm(tornado)
它运行良好,但我有好几个这样的分类,全部运行完需要一些时间(约 5-10 分钟)。我的问题是:有没有更好的方法来利用 grepl()
或正则表达式来提高效率?
正则表达式本身可以使用 |
作为 OR 匹配。你可以这样做
# One regex with "|" alternation; grepl() is vectorized, so a single call
# covers the whole EVTYPE column.
tornado <- grepl(
  pattern = "(TORNAD|SPOUT|WHIRL|FUNNEL|ROTATING WALL CLOUD|DUST DEVIL)",
  x = as.character(noaa_clean$EVTYPE)
)
另请注意,我们不需要使用 sapply()
,因为 grepl
已经是 R 中的矢量化函数。
由于您专门询问了速度,下面对评论和回答中给出的各种解决方案进行了基准测试:
# Initialize the test vector: one million draws (with replacement) from the six
# search terms plus the letters A-H, which match none of the patterns.
# The first term is "TORNAD" (not "TORNA") so that the "TORNAD" pattern used by
# every benchmarked function actually occurs in the data.
x <- sample(
  c("TORNAD", "SPOUT", "WHIRL", "FUNNEL", "ROTATING WALL CLOUD", "DUST DEVIL",
    LETTERS[1:8]),
  size = 1e6,
  replace = TRUE
)
# Separate grepl() calls, one per search term, OR-ed together element-wise
multi_grepl <- function(x) {
  terms <- c("TORNAD", "SPOUT", "WHIRL", "FUNNEL", "ROTATING WALL CLOUD",
             "DUST DEVIL")
  hits <- lapply(terms, function(term) grepl(term, x))
  Reduce(`|`, hits)
}
# A single grepl() call whose regex lists every search term as an alternative
one_grepl <- function(x) {
  alternation <- "TORNAD|SPOUT|WHIRL|FUNNEL|ROTATING WALL CLOUD|DUST DEVIL"
  grepl(pattern = alternation, x = x)
}
# Same alternation regex, matched with stringi's stri_detect_regex()
detect_regex <- function(x) {
  alternation <- "TORNAD|SPOUT|WHIRL|FUNNEL|ROTATING WALL CLOUD|DUST DEVIL"
  stringi::stri_detect_regex(str = x, pattern = alternation)
}
# The question's original approach: sapply() runs six anchored grepl() calls on
# each element individually (kept as-is because it is the baseline being timed)
orig_sapply <- function(x) {
  matches_any <- function(y) {
    grepl("^.*TORNAD.*$", y) |
      grepl("^.*SPOUT.*$", y) |
      grepl("^.*WHIRL.*$", y) |
      grepl("^.*FUNNEL.*$", y) |
      grepl("^.*ROTATING WALL CLOUD.*$", y) |
      grepl("^.*DUST DEVIL.*$", y)
  }
  sapply(x, matches_any)
}
# Fixed-string (non-regex) matching with stringi, one call per search term,
# OR-ed together element-wise.
# stri_detect_fixed() is namespace-qualified so the function works without
# library(stringi) being attached, matching how detect_regex() calls stringi.
stri_fixed <- function(x) {
  stringi::stri_detect_fixed(x, pattern = "TORNAD") |
    stringi::stri_detect_fixed(x, pattern = "SPOUT") |
    stringi::stri_detect_fixed(x, pattern = "WHIRL") |
    stringi::stri_detect_fixed(x, pattern = "FUNNEL") |
    stringi::stri_detect_fixed(x, pattern = "ROTATING WALL CLOUD") |
    stringi::stri_detect_fixed(x, pattern = "DUST DEVIL")
}
# Check that all approaches give the same answer.
# identical() compares only its first two arguments; any further positional
# arguments are taken as its option flags (num.eq, single.NA, ...), so each
# result is compared against the first one instead.
# orig_sapply() returns a vector named by its input, hence unname().
results <- list(
  multi_grepl(x),
  one_grepl(x),
  detect_regex(x),
  unname(orig_sapply(x)),
  stri_fixed(x)
)
all(vapply(results[-1], identical, logical(1), y = results[[1]]))
#[1] TRUE
# Time every approach on the short strings, 20 evaluations each
microbenchmark::microbenchmark(
  multi_grepl(x),
  one_grepl(x),
  detect_regex(x),
  orig_sapply(x),
  stri_fixed(x),
  times = 20L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# multi_grepl(x) 724.6716 738.5227 754.2347 747.1441 769.2897 819.9971 20
# one_grepl(x) 406.7987 410.3197 420.0083 412.1168 426.5932 453.2471 20
# detect_regex(x) 167.4844 170.0834 174.1256 172.7410 177.1546 187.3211 20
# orig_sapply(x) 47172.3407 47379.8250 47666.7177 47546.2221 47875.9352 48517.2228 20
# stri_fixed(x) 261.4303 265.9189 270.5816 268.6038 273.2486 288.7071 20
看来 stri_detect_regex
是最快的。有趣的是,这与我在 regex
中有 ^.*
和 .*$
时尝试的最后一次迭代不同。感谢@Gregor 指出这一点。请注意,您的原始 sapply
非常慢,因为它多次执行 grepl
搜索(每个元素一次),而不是对整个向量只执行一次。
最后,较长的单个字符串的结果:
# Build longer test strings by wrapping each element of x in random uppercase
# padding: a prefix of 100-200 letters and a suffix of 200-300 letters.
# sample(100:200, 1) draws a single length explicitly; the earlier form passed
# the whole shuffled range as `size` and depended on only its first element
# being used.
prefixes <- replicate(
  1e6,
  paste0(sample(LETTERS, sample(100:200, 1), replace = TRUE), collapse = "")
)
suffixes <- replicate(
  1e6,
  paste0(sample(LETTERS, sample(200:300, 1), replace = TRUE), collapse = "")
)
x_long <- paste0(prefixes, x, suffixes)
# Time the vectorized approaches on the long strings, 20 evaluations each
microbenchmark::microbenchmark(
  multi_grepl(x_long),
  one_grepl(x_long),
  detect_regex(x_long),
  stri_fixed(x_long),
  times = 20L
)
#Unit: seconds
# expr min lq mean median uq max neval
# multi_grepl(x_long) 27.654274 27.721042 28.194273 27.962656 28.626697 29.909105 20
# one_grepl(x_long) 11.478831 11.510868 11.775088 11.583650 11.663479 14.318680 20
# detect_regex(x_long) 8.673534 8.729508 8.808797 8.774432 8.878907 9.028005 20
# stri_fixed(x_long) 4.502196 4.540850 4.609050 4.591879 4.690035 4.750445 20
使用 NOAA Severe Weather data,其中包含描述天气事件类型的变量 EVTYPE
(事件类型)。这些值包括许多同义词,我想在几个更广泛的名称下收集这些同义词。例如,有 TORNADO
但也有 ROTATING WALL CLOUD
、FUNNEL CLOUD
和 WHIRLWIND
,它们在某种意义上描述了 相对 相似的事件。在不深入了解气象学的微妙之处的情况下,我想将这些几乎同义的值组合在一个值名称下。
假设我已将数据集加载到数据框中 noaa_clean
并且我应用了这个:
# Question's original approach: build a logical mask marking every EVTYPE value
# that mentions a tornado-like event, then label those rows "TORNADO".
# sapply() invokes the anonymous function once per element, so each of the six
# grepl() calls below runs on a single string at a time.
tornado <- sapply(as.character(noaa_clean$EVTYPE),
function(x){grepl("^.*TORNAD.*$", x) |
grepl("^.*SPOUT.*$", x) |
grepl("^.*WHIRL.*$", x) |
grepl("^.*FUNNEL.*$", x) |
grepl("^.*ROTATING WALL CLOUD.*$", x) |
grepl("^.*DUST DEVIL.*$", x)})
# Write the category into EVCAT for the matching rows, then remove the mask.
noaa_clean[tornado, "EVCAT"] <- "TORNADO"; rm(tornado)
它运行良好,但我有好几个这样的分类,全部运行完需要一些时间(约 5-10 分钟)。我的问题是:有没有更好的方法来利用 grepl()
或正则表达式来提高效率?
正则表达式本身可以使用 |
作为 OR 匹配。你可以这样做
# One regex with "|" alternation; grepl() is vectorized, so a single call
# covers the whole EVTYPE column.
tornado <- grepl(
  pattern = "(TORNAD|SPOUT|WHIRL|FUNNEL|ROTATING WALL CLOUD|DUST DEVIL)",
  x = as.character(noaa_clean$EVTYPE)
)
另请注意,我们不需要使用 sapply()
,因为 grepl
已经是 R 中的矢量化函数。
由于您专门询问了速度,下面对评论和回答中给出的各种解决方案进行了基准测试:
# Initialize the test vector: one million draws (with replacement) from the six
# search terms plus the letters A-H, which match none of the patterns.
# The first term is "TORNAD" (not "TORNA") so that the "TORNAD" pattern used by
# every benchmarked function actually occurs in the data.
x <- sample(
  c("TORNAD", "SPOUT", "WHIRL", "FUNNEL", "ROTATING WALL CLOUD", "DUST DEVIL",
    LETTERS[1:8]),
  size = 1e6,
  replace = TRUE
)
# Separate grepl() calls, one per search term, OR-ed together element-wise
multi_grepl <- function(x) {
  terms <- c("TORNAD", "SPOUT", "WHIRL", "FUNNEL", "ROTATING WALL CLOUD",
             "DUST DEVIL")
  hits <- lapply(terms, function(term) grepl(term, x))
  Reduce(`|`, hits)
}
# A single grepl() call whose regex lists every search term as an alternative
one_grepl <- function(x) {
  alternation <- "TORNAD|SPOUT|WHIRL|FUNNEL|ROTATING WALL CLOUD|DUST DEVIL"
  grepl(pattern = alternation, x = x)
}
# Same alternation regex, matched with stringi's stri_detect_regex()
detect_regex <- function(x) {
  alternation <- "TORNAD|SPOUT|WHIRL|FUNNEL|ROTATING WALL CLOUD|DUST DEVIL"
  stringi::stri_detect_regex(str = x, pattern = alternation)
}
# The question's original approach: sapply() runs six anchored grepl() calls on
# each element individually (kept as-is because it is the baseline being timed)
orig_sapply <- function(x) {
  matches_any <- function(y) {
    grepl("^.*TORNAD.*$", y) |
      grepl("^.*SPOUT.*$", y) |
      grepl("^.*WHIRL.*$", y) |
      grepl("^.*FUNNEL.*$", y) |
      grepl("^.*ROTATING WALL CLOUD.*$", y) |
      grepl("^.*DUST DEVIL.*$", y)
  }
  sapply(x, matches_any)
}
# Fixed-string (non-regex) matching with stringi, one call per search term,
# OR-ed together element-wise.
# stri_detect_fixed() is namespace-qualified so the function works without
# library(stringi) being attached, matching how detect_regex() calls stringi.
stri_fixed <- function(x) {
  stringi::stri_detect_fixed(x, pattern = "TORNAD") |
    stringi::stri_detect_fixed(x, pattern = "SPOUT") |
    stringi::stri_detect_fixed(x, pattern = "WHIRL") |
    stringi::stri_detect_fixed(x, pattern = "FUNNEL") |
    stringi::stri_detect_fixed(x, pattern = "ROTATING WALL CLOUD") |
    stringi::stri_detect_fixed(x, pattern = "DUST DEVIL")
}
# Check that all approaches give the same answer.
# identical() compares only its first two arguments; any further positional
# arguments are taken as its option flags (num.eq, single.NA, ...), so each
# result is compared against the first one instead.
# orig_sapply() returns a vector named by its input, hence unname().
results <- list(
  multi_grepl(x),
  one_grepl(x),
  detect_regex(x),
  unname(orig_sapply(x)),
  stri_fixed(x)
)
all(vapply(results[-1], identical, logical(1), y = results[[1]]))
#[1] TRUE
# Time every approach on the short strings, 20 evaluations each
microbenchmark::microbenchmark(
  multi_grepl(x),
  one_grepl(x),
  detect_regex(x),
  orig_sapply(x),
  stri_fixed(x),
  times = 20L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# multi_grepl(x) 724.6716 738.5227 754.2347 747.1441 769.2897 819.9971 20
# one_grepl(x) 406.7987 410.3197 420.0083 412.1168 426.5932 453.2471 20
# detect_regex(x) 167.4844 170.0834 174.1256 172.7410 177.1546 187.3211 20
# orig_sapply(x) 47172.3407 47379.8250 47666.7177 47546.2221 47875.9352 48517.2228 20
# stri_fixed(x) 261.4303 265.9189 270.5816 268.6038 273.2486 288.7071 20
看来 stri_detect_regex
是最快的。有趣的是,这与我在 regex
中有 ^.*
和 .*$
时尝试的最后一次迭代不同。感谢@Gregor 指出这一点。请注意,您的原始 sapply
非常慢,因为它多次执行 grepl
搜索(每个元素一次),而不是对整个向量只执行一次。
最后,较长的单个字符串的结果:
# Build longer test strings by wrapping each element of x in random uppercase
# padding: a prefix of 100-200 letters and a suffix of 200-300 letters.
# sample(100:200, 1) draws a single length explicitly; the earlier form passed
# the whole shuffled range as `size` and depended on only its first element
# being used.
prefixes <- replicate(
  1e6,
  paste0(sample(LETTERS, sample(100:200, 1), replace = TRUE), collapse = "")
)
suffixes <- replicate(
  1e6,
  paste0(sample(LETTERS, sample(200:300, 1), replace = TRUE), collapse = "")
)
x_long <- paste0(prefixes, x, suffixes)
# Time the vectorized approaches on the long strings, 20 evaluations each
microbenchmark::microbenchmark(
  multi_grepl(x_long),
  one_grepl(x_long),
  detect_regex(x_long),
  stri_fixed(x_long),
  times = 20L
)
#Unit: seconds
# expr min lq mean median uq max neval
# multi_grepl(x_long) 27.654274 27.721042 28.194273 27.962656 28.626697 29.909105 20
# one_grepl(x_long) 11.478831 11.510868 11.775088 11.583650 11.663479 14.318680 20
# detect_regex(x_long) 8.673534 8.729508 8.808797 8.774432 8.878907 9.028005 20
# stri_fixed(x_long) 4.502196 4.540850 4.609050 4.591879 4.690035 4.750445 20