strsplit 是在 R 中分隔字符串的最快方法吗
Is strsplit the fastest way to separate a string in R
我有一个字符串,"1500|3|10000|5"
,我希望有一个像这样的数字向量:
[1] 1500 3 10000 5
strsplit 比 str_extract_all
快得多。 strsplit 是最快的方法吗?
# tidyverse supplies str_extract_all(); microbenchmark supplies the timer.
library("tidyverse")
library("microbenchmark")

# Pipe-delimited input to be turned into the numeric vector 1500 3 10000 5.
x <- "1500|3|10000|5"

# Variant 1: extract every run of digits with a regex.
# The backslash must be doubled inside an R string literal ("\d" is a
# parse error), so the regex \d+ is written "\\d+".
# mean ~ 137 microseconds
microbenchmark(
  x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double()
)

# Variant 2: split on the pipe character.
# strsplit() treats `split` as a regex by default, where "|" means
# alternation, so the literal pipe is escaped as "\\|".
# mean ~ 15 microseconds
microbenchmark(
  x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double()
)
在 strsplit
中使用 fixed = TRUE
可获得更好的结果。在更大的样本上,str_extract_all
比原帖中的
strsplit
代码执行得更快。
library(stringr)

# Reproducible large input: the integers 1..100000 in random order,
# joined into a single pipe-delimited string.
set.seed(123)
x <- paste0(sample(100000), collapse = "|")

# Compare three ways of turning `x` into a numeric vector.
# Backslashes are doubled because "\d" and "\|" are not valid escapes
# in an R string literal.
microbenchmark::microbenchmark(
  # Regex extraction of every run of digits.
  str_extract = x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Regex split on an escaped literal pipe.
  strsplit = x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Literal split: fixed = TRUE bypasses the regex engine, so no escaping.
  strsplit_fixed = x |>
    strsplit(split = "|", fixed = TRUE) |>
    unlist(use.names = FALSE) |>
    as.numeric()
)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# str_extract 27.00734 28.68815 30.62537 29.62420 31.59296 55.36550 100 b
# strsplit 87.71705 91.47075 97.39022 94.99620 101.27776 123.17484 100 c
# strsplit_fixed 17.57684 20.08943 23.03720 21.59174 23.40159 49.83912 100 a
stringi
似乎稍微快一点;另外,为了获得最大速度,还应该省略管道操作符(|>)。
library(stringr)
library(stringi)

# Reproducible large input: the integers 1..100000 in random order,
# joined into a single pipe-delimited string.
set.seed(123)
x <- paste0(sample(100000), collapse = "|")

# Compare five ways of turning `x` into a numeric vector.
# Backslashes are doubled because "\d" and "\|" are not valid escapes
# in an R string literal.
microbenchmark::microbenchmark(
  # Regex extraction of every run of digits.
  str_extract = x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Regex split on an escaped literal pipe.
  strsplit = x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Literal split with base R; fixed = TRUE needs no escaping.
  strsplit_fixed = x |>
    strsplit(split = "|", fixed = TRUE) |>
    unlist(use.names = FALSE) |>
    as.numeric(),
  # stringi literal split written as nested calls, without the pipe.
  stringi = as.numeric(stri_split_fixed(x, "|")[[1]]),
  # The same stringi split written with the pipe, for comparison.
  stringi2 = x |>
    stri_split_fixed(pattern = "|") |>
    unlist(use.names = FALSE) |>
    as.numeric()
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# str_extract 27.5158 27.77085 28.63940 28.01650 28.32090 36.7092 100 c
# strsplit 50.6624 51.16750 52.11587 51.55955 51.98610 59.2446 100 d
# strsplit_fixed 18.9921 19.24650 20.95589 19.40140 19.68805 113.9647 100 b
# stringi 17.8246 18.13970 18.53155 18.31015 18.57825 26.4410 100 a
# stringi2 18.2519 18.64035 19.21868 18.78765 19.20105 27.1056 100 ab
我猜你提出这个问题,是因为你有一个很大的以竖线(|)分隔的文件,需要把它转换成数据框。
如果您已经将文件读入字符向量 x:
# Read the file into a character vector, one element per line.
x <- readLines("mydelimfile.txt")

# base R: parse the in-memory lines as pipe-delimited data, no header row.
df <- read.delim(text = x, sep = "|", header = FALSE)

# with readr: join the lines back into one newline-separated string and
# parse it as pipe-delimited data, no header row.
df <- readr::read_delim(
  paste(x, collapse = "\n"),
  delim = "|",
  col_names = FALSE
)
但是您可以直接将文件转换为 df:
# base R: parse the pipe-delimited file straight from disk.
# read.delim() defaults to header = TRUE, so the first line becomes the
# column names.
df <- read.delim(file = "mydelimfile.txt", sep = "|")

# readr equivalent; col_names also defaults to TRUE.
df <- readr::read_delim(file = "mydelimfile.txt", delim = "|")
我有一个字符串,"1500|3|10000|5"
,我希望有一个像这样的数字向量:
[1] 1500 3 10000 5
strsplit 比 str_extract_all
快得多。 strsplit 是最快的方法吗?
# tidyverse supplies str_extract_all(); microbenchmark supplies the timer.
library("tidyverse")
library("microbenchmark")

# Pipe-delimited input to be turned into the numeric vector 1500 3 10000 5.
x <- "1500|3|10000|5"

# Variant 1: extract every run of digits with a regex.
# The backslash must be doubled inside an R string literal ("\d" is a
# parse error), so the regex \d+ is written "\\d+".
# mean ~ 137 microseconds
microbenchmark(
  x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double()
)

# Variant 2: split on the pipe character.
# strsplit() treats `split` as a regex by default, where "|" means
# alternation, so the literal pipe is escaped as "\\|".
# mean ~ 15 microseconds
microbenchmark(
  x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double()
)
在 strsplit
中使用 fixed = TRUE
可获得更好的结果。在更大的样本上,str_extract_all
比原帖中的
strsplit
代码执行得更快。
library(stringr)

# Reproducible large input: the integers 1..100000 in random order,
# joined into a single pipe-delimited string.
set.seed(123)
x <- paste0(sample(100000), collapse = "|")

# Compare three ways of turning `x` into a numeric vector.
# Backslashes are doubled because "\d" and "\|" are not valid escapes
# in an R string literal.
microbenchmark::microbenchmark(
  # Regex extraction of every run of digits.
  str_extract = x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Regex split on an escaped literal pipe.
  strsplit = x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Literal split: fixed = TRUE bypasses the regex engine, so no escaping.
  strsplit_fixed = x |>
    strsplit(split = "|", fixed = TRUE) |>
    unlist(use.names = FALSE) |>
    as.numeric()
)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# str_extract 27.00734 28.68815 30.62537 29.62420 31.59296 55.36550 100 b
# strsplit 87.71705 91.47075 97.39022 94.99620 101.27776 123.17484 100 c
# strsplit_fixed 17.57684 20.08943 23.03720 21.59174 23.40159 49.83912 100 a
stringi
似乎稍微快一点;另外,为了获得最大速度,还应该省略管道操作符(|>)。
library(stringr)
library(stringi)

# Reproducible large input: the integers 1..100000 in random order,
# joined into a single pipe-delimited string.
set.seed(123)
x <- paste0(sample(100000), collapse = "|")

# Compare five ways of turning `x` into a numeric vector.
# Backslashes are doubled because "\d" and "\|" are not valid escapes
# in an R string literal.
microbenchmark::microbenchmark(
  # Regex extraction of every run of digits.
  str_extract = x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Regex split on an escaped literal pipe.
  strsplit = x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # Literal split with base R; fixed = TRUE needs no escaping.
  strsplit_fixed = x |>
    strsplit(split = "|", fixed = TRUE) |>
    unlist(use.names = FALSE) |>
    as.numeric(),
  # stringi literal split written as nested calls, without the pipe.
  stringi = as.numeric(stri_split_fixed(x, "|")[[1]]),
  # The same stringi split written with the pipe, for comparison.
  stringi2 = x |>
    stri_split_fixed(pattern = "|") |>
    unlist(use.names = FALSE) |>
    as.numeric()
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# str_extract 27.5158 27.77085 28.63940 28.01650 28.32090 36.7092 100 c
# strsplit 50.6624 51.16750 52.11587 51.55955 51.98610 59.2446 100 d
# strsplit_fixed 18.9921 19.24650 20.95589 19.40140 19.68805 113.9647 100 b
# stringi 17.8246 18.13970 18.53155 18.31015 18.57825 26.4410 100 a
# stringi2 18.2519 18.64035 19.21868 18.78765 19.20105 27.1056 100 ab
我猜你提出这个问题,是因为你有一个很大的以竖线(|)分隔的文件,需要把它转换成数据框。
如果您已经将文件读入字符向量 x:
# Read the file into a character vector, one element per line.
x <- readLines("mydelimfile.txt")

# base R: parse the in-memory lines as pipe-delimited data, no header row.
df <- read.delim(text = x, sep = "|", header = FALSE)

# with readr: join the lines back into one newline-separated string and
# parse it as pipe-delimited data, no header row.
df <- readr::read_delim(
  paste(x, collapse = "\n"),
  delim = "|",
  col_names = FALSE
)
但是您可以直接将文件转换为 df:
# base R: parse the pipe-delimited file straight from disk.
# read.delim() defaults to header = TRUE, so the first line becomes the
# column names.
df <- read.delim(file = "mydelimfile.txt", sep = "|")

# readr equivalent; col_names also defaults to TRUE.
df <- readr::read_delim(file = "mydelimfile.txt", delim = "|")