在 R 中解析 "continent / country / city" 向量的快速方法
Fast way to parse vector of "continent / country / city" in R
我在 R 中有一个字符向量,每个字符串由“大陆/国家/城市”组成,例如
x=rep("Africa / Kenya / Nairobi", 1000000)
但是“ / ”偶尔会被误输入为两侧不带空格的“/”,在某些情况下城市也会缺失,例如“Africa / Kenya”,没有城市。
我想将其解析为三个向量大陆、国家和城市,如果缺少城市,则使用 NA。
对于国家我现在做了类似
country = sapply(x, function(loc) trimws(strsplit(loc,"/", fixed = TRUE)[[1]][2]))
但是如果向量 x 很长,那会很慢。在 R 中解析它的有效方法是什么?
您可以尝试把 `do.call` 与 `rbind` 结合使用。在 `lapply` 中使用 `[`(并传入 `1:3`)是为了在城市缺失时仍然得到 3 个结果。
# Sample input: spaced separators, bare separators, and an entry with no city.
x <- c("Africa / Kenya / Nairobi", "Africa/Kenya/Nairobi", "Africa / Kenya")
# Split on the literal "/" and index every piece vector with 1:3, so an entry
# without a city is padded with NA and each row has exactly three fields.
pieces <- lapply(strsplit(x, "/", fixed = TRUE), `[`, 1:3)
# Bind the rows into a matrix, then strip the spaces around each field.
y <- trimws(do.call(rbind, pieces), whitespace = " ")
y
#      [,1]     [,2]    [,3]
# [1,] "Africa" "Kenya" "Nairobi"
# [2,] "Africa" "Kenya" "Nairobi"
# [3,] "Africa" "Kenya" NA
或使用 `data.table`:
# Sample input: spaced separators, bare separators, and an entry with no city.
x <- c("Africa / Kenya / Nairobi", "Africa/Kenya/Nairobi", "Africa / Kenya")
# tstrsplit() splits on the literal "/" and returns one vector per field
# position; a missing city comes back as NA (see the output below).
fields <- data.table::tstrsplit(x, "/", fixed = TRUE)
# Column-bind the fields into a matrix, then strip the spaces around each value.
y <- trimws(do.call(cbind, fields), whitespace = " ")
y
#      [,1]     [,2]    [,3]
# [1,] "Africa" "Kenya" "Nairobi"
# [2,] "Africa" "Kenya" "Nairobi"
# [3,] "Africa" "Kenya" NA
基准
#x <- rep("Africa / Kenya / Nairobi", 1000000) #Timings will depend on the used dataset
# Number of test strings to generate.
n <- 1e6L
# f1: n random strings of lowercase letters, each 5 to 15 characters long
# (letters are sampled with replacement).
f1 <- function(n) replicate(n, paste(sample(letters, sample(5:15, 1), TRUE), collapse = ""))
# f2: n separators drawn at random from "/", " /", "/ " and " / ".
f2 <- function(n) sample(c("/", " /", "/ ", " / "), n, TRUE)
# Fix the RNG seed so the generated data set is reproducible.
set.seed(42)
# Each element is <f1><separator><f1> followed by a suffix. The suffixes are
# n %/% 2 strings of the form <separator><f1> plus empty strings for the rest,
# shuffled by sample(), so about half of the entries have no third field.
x <- paste0(f1(n), f2(n), f1(n), sample(c(paste0(f2(n%/%2L), f1(n%/%2L)), rep("", n - n%/%2L))))
# Timings are in seconds and are the figures reported by the original author;
# they were measured before header = FALSE and I() were added to the calls.

system.time( # Method given in the question
  sapply(x, function(loc) trimws(strsplit(loc, "/", fixed = TRUE)[[1]][2])))
#    user  system elapsed
#  47.718   0.004  47.798

system.time( # Using strsplit and trimws
  trimws(do.call(rbind, lapply(strsplit(x, "/", TRUE), "[", 1:3)), whitespace = " "))
#    user  system elapsed
#   5.446   0.008   5.454

system.time( # Using data.table::tstrsplit and trimws
  trimws(do.call(cbind, data.table::tstrsplit(x, "/", TRUE)), whitespace = " "))
#    user  system elapsed
#   2.365   0.012   2.376

# I() marks x as literal data; readr >= 2.0 would otherwise treat a character
# vector whose strings contain no newline as a set of file paths.
system.time( # Using readr::read_delim from @Anoushiravan R
  readr::read_delim(I(x), delim = "/", quote = "", trim_ws = TRUE, col_names = FALSE))
#    user  system elapsed
#   1.961   0.024   2.222

system.time( # Using data.table::tstrsplit with " */ *"
  do.call(cbind, data.table::tstrsplit(x, " */ *", perl = TRUE)))
#    user  system elapsed
#   1.394   0.000   1.394

system.time( # Using read.table from @akrun
  read.table(text = x, sep = "/", header = FALSE, fill = TRUE, strip.white = TRUE, na.strings = ""))
#    user  system elapsed
#   1.298   0.004   1.302

# header = FALSE: every field is text, so fread's automatic header detection
# would take the first entry as column names and drop it from the data.
system.time( # Using data.table::fread from @akrun
  data.table::fread(text = paste(x, collapse = "\n"), sep = "/", header = FALSE,
                    fill = TRUE, na.strings = ""))
#    user  system elapsed
#   1.146   0.016   0.996

system.time( # Using read.table with additional arguments
  read.table(text = x, sep = "/", header = FALSE, fill = TRUE, strip.white = TRUE,
             na.strings = "", nrows = length(x), comment.char = "",
             colClasses = c("character")))
#    user  system elapsed
#   1.076   0.000   1.076

system.time( # Using data.table::fread with stringr::str_c (or stringi::stri_c)
  data.table::fread(text = stringr::str_c(x, collapse = "\n"), sep = "/", header = FALSE,
                    fill = TRUE, na.strings = ""))
#    user  system elapsed
#   0.780   0.000   0.624
使用 `data.table::fread` 并用 `stringr::str_c` 构造输入字符串,看起来是目前给出的方法中最快的。
可以考虑使用 base R 中的 `read.table`:
# Parse the entries with base R's read.table().
#  - col.names forces three columns. With fill = TRUE alone, read.table sizes
#    the table from the first five input lines, so if none of them has a city
#    the later three-field entries wrap onto extra rows.
#  - quote = "" and comment.char = "" keep apostrophes and "#" inside place
#    names as ordinary characters.
#  - colClasses = "character" skips type guessing; empty fields become NA
#    through na.strings = "".
read.table(text = x, sep = "/", header = FALSE, col.names = paste0("V", 1:3),
           colClasses = "character", fill = TRUE, strip.white = TRUE,
           na.strings = "", quote = "", comment.char = "")
#       V1    V2      V3
# 1 Africa Kenya Nairobi
# 2 Africa Kenya Nairobi
# 3 Africa Kenya    <NA>
或使用 `data.table` 中的 `fread`:
library(data.table)
# header = FALSE is required here: when every field on the first line is text,
# fread's automatic detection takes that line as column names, and the first
# entry is lost from the data.
fread(text = paste(x, collapse = "\n"), sep = "/", header = FALSE,
      fill = TRUE, na.strings = "")
#        V1    V2      V3
# 1: Africa Kenya Nairobi
# 2: Africa Kenya Nairobi
# 3: Africa Kenya    <NA>
基准
# Benchmark on one million copies of a single well-formed entry.
# Timings are in seconds and are the figures reported by the original author
# (measured before header = FALSE was added to the fread call).
x <- rep("Africa / Kenya / Nairobi", 1000000)

# header = FALSE keeps the first entry as data rather than as column names.
system.time(fread(text = paste(x, collapse = "\n"), sep = "/", header = FALSE,
                  fill = TRUE, na.strings = ""))
#    user  system elapsed
#   0.473   0.024   0.496

system.time(read.table(text = x, sep = "/", header = FALSE,
                       fill = TRUE, strip.white = TRUE, na.strings = ""))
#    user  system elapsed
#   0.519   0.026   0.543

system.time({ # Using data.table::tstrsplit
  y <- do.call(cbind, data.table::tstrsplit(x, "/", TRUE))
  y <- trimws(y, whitespace = " ")
})
#    user  system elapsed
#   2.035   0.051   2.067
数据
# Sample input: separators with spaces, separators without spaces, and an
# entry with no city.
x <- c("Africa / Kenya / Nairobi", "Africa/Kenya/Nairobi", "Africa / Kenya")
我觉得这个也可以用:
library(readr)
# Reads the sample vector `x`; the input name used previously was not defined
# anywhere in this example.
# I() marks the vector as literal data; readr >= 2.0 would otherwise treat a
# character vector whose strings contain no newline as a set of file paths.
xx <- readr::read_delim(I(x), delim = "/", quote = "", trim_ws = TRUE,
                        col_names = FALSE)
xx
# # A tibble: 3 x 3
#   X1     X2    X3
#   <chr>  <chr> <chr>
# 1 Africa Kenya Nairobi
# 2 Africa Kenya Nairobi
# 3 Africa Kenya NA
我在 R 中有一个字符向量,每个字符串由“大陆/国家/城市”组成,例如
x=rep("Africa / Kenya / Nairobi", 1000000)
但是“ / ”偶尔会被误输入为两侧不带空格的“/”,在某些情况下城市也会缺失,例如“Africa / Kenya”,没有城市。
我想将其解析为三个向量大陆、国家和城市,如果缺少城市,则使用 NA。
对于国家我现在做了类似
country = sapply(x, function(loc) trimws(strsplit(loc,"/", fixed = TRUE)[[1]][2]))
但是如果向量 x 很长,那会很慢。在 R 中解析它的有效方法是什么?
您可以尝试把 `do.call` 与 `rbind` 结合使用。在 `lapply` 中使用 `[`(并传入 `1:3`)是为了在城市缺失时仍然得到 3 个结果。
# Sample input: spaced separators, bare separators, and an entry with no city.
x <- c("Africa / Kenya / Nairobi", "Africa/Kenya/Nairobi", "Africa / Kenya")
# Split on the literal "/" and index every piece vector with 1:3, so an entry
# without a city is padded with NA and each row has exactly three fields.
pieces <- lapply(strsplit(x, "/", fixed = TRUE), `[`, 1:3)
# Bind the rows into a matrix, then strip the spaces around each field.
y <- trimws(do.call(rbind, pieces), whitespace = " ")
y
#      [,1]     [,2]    [,3]
# [1,] "Africa" "Kenya" "Nairobi"
# [2,] "Africa" "Kenya" "Nairobi"
# [3,] "Africa" "Kenya" NA
或使用 `data.table`:
# Sample input: spaced separators, bare separators, and an entry with no city.
x <- c("Africa / Kenya / Nairobi", "Africa/Kenya/Nairobi", "Africa / Kenya")
# tstrsplit() splits on the literal "/" and returns one vector per field
# position; a missing city comes back as NA (see the output below).
fields <- data.table::tstrsplit(x, "/", fixed = TRUE)
# Column-bind the fields into a matrix, then strip the spaces around each value.
y <- trimws(do.call(cbind, fields), whitespace = " ")
y
#      [,1]     [,2]    [,3]
# [1,] "Africa" "Kenya" "Nairobi"
# [2,] "Africa" "Kenya" "Nairobi"
# [3,] "Africa" "Kenya" NA
基准
#x <- rep("Africa / Kenya / Nairobi", 1000000) #Timings will depend on the used dataset
# Number of test strings to generate.
n <- 1e6L
# f1: n random strings of lowercase letters, each 5 to 15 characters long
# (letters are sampled with replacement).
f1 <- function(n) replicate(n, paste(sample(letters, sample(5:15, 1), TRUE), collapse = ""))
# f2: n separators drawn at random from "/", " /", "/ " and " / ".
f2 <- function(n) sample(c("/", " /", "/ ", " / "), n, TRUE)
# Fix the RNG seed so the generated data set is reproducible.
set.seed(42)
# Each element is <f1><separator><f1> followed by a suffix. The suffixes are
# n %/% 2 strings of the form <separator><f1> plus empty strings for the rest,
# shuffled by sample(), so about half of the entries have no third field.
x <- paste0(f1(n), f2(n), f1(n), sample(c(paste0(f2(n%/%2L), f1(n%/%2L)), rep("", n - n%/%2L))))
# Timings are in seconds and are the figures reported by the original author;
# they were measured before header = FALSE and I() were added to the calls.

system.time( # Method given in the question
  sapply(x, function(loc) trimws(strsplit(loc, "/", fixed = TRUE)[[1]][2])))
#    user  system elapsed
#  47.718   0.004  47.798

system.time( # Using strsplit and trimws
  trimws(do.call(rbind, lapply(strsplit(x, "/", TRUE), "[", 1:3)), whitespace = " "))
#    user  system elapsed
#   5.446   0.008   5.454

system.time( # Using data.table::tstrsplit and trimws
  trimws(do.call(cbind, data.table::tstrsplit(x, "/", TRUE)), whitespace = " "))
#    user  system elapsed
#   2.365   0.012   2.376

# I() marks x as literal data; readr >= 2.0 would otherwise treat a character
# vector whose strings contain no newline as a set of file paths.
system.time( # Using readr::read_delim from @Anoushiravan R
  readr::read_delim(I(x), delim = "/", quote = "", trim_ws = TRUE, col_names = FALSE))
#    user  system elapsed
#   1.961   0.024   2.222

system.time( # Using data.table::tstrsplit with " */ *"
  do.call(cbind, data.table::tstrsplit(x, " */ *", perl = TRUE)))
#    user  system elapsed
#   1.394   0.000   1.394

system.time( # Using read.table from @akrun
  read.table(text = x, sep = "/", header = FALSE, fill = TRUE, strip.white = TRUE, na.strings = ""))
#    user  system elapsed
#   1.298   0.004   1.302

# header = FALSE: every field is text, so fread's automatic header detection
# would take the first entry as column names and drop it from the data.
system.time( # Using data.table::fread from @akrun
  data.table::fread(text = paste(x, collapse = "\n"), sep = "/", header = FALSE,
                    fill = TRUE, na.strings = ""))
#    user  system elapsed
#   1.146   0.016   0.996

system.time( # Using read.table with additional arguments
  read.table(text = x, sep = "/", header = FALSE, fill = TRUE, strip.white = TRUE,
             na.strings = "", nrows = length(x), comment.char = "",
             colClasses = c("character")))
#    user  system elapsed
#   1.076   0.000   1.076

system.time( # Using data.table::fread with stringr::str_c (or stringi::stri_c)
  data.table::fread(text = stringr::str_c(x, collapse = "\n"), sep = "/", header = FALSE,
                    fill = TRUE, na.strings = ""))
#    user  system elapsed
#   0.780   0.000   0.624
使用 `data.table::fread` 并用 `stringr::str_c` 构造输入字符串,看起来是目前给出的方法中最快的。
可以考虑使用 base R 中的 `read.table`:
# Parse the entries with base R's read.table().
#  - col.names forces three columns. With fill = TRUE alone, read.table sizes
#    the table from the first five input lines, so if none of them has a city
#    the later three-field entries wrap onto extra rows.
#  - quote = "" and comment.char = "" keep apostrophes and "#" inside place
#    names as ordinary characters.
#  - colClasses = "character" skips type guessing; empty fields become NA
#    through na.strings = "".
read.table(text = x, sep = "/", header = FALSE, col.names = paste0("V", 1:3),
           colClasses = "character", fill = TRUE, strip.white = TRUE,
           na.strings = "", quote = "", comment.char = "")
#       V1    V2      V3
# 1 Africa Kenya Nairobi
# 2 Africa Kenya Nairobi
# 3 Africa Kenya    <NA>
或使用 `data.table` 中的 `fread`:
library(data.table)
# header = FALSE is required here: when every field on the first line is text,
# fread's automatic detection takes that line as column names, and the first
# entry is lost from the data.
fread(text = paste(x, collapse = "\n"), sep = "/", header = FALSE,
      fill = TRUE, na.strings = "")
#        V1    V2      V3
# 1: Africa Kenya Nairobi
# 2: Africa Kenya Nairobi
# 3: Africa Kenya    <NA>
基准
# Benchmark on one million copies of a single well-formed entry.
# Timings are in seconds and are the figures reported by the original author
# (measured before header = FALSE was added to the fread call).
x <- rep("Africa / Kenya / Nairobi", 1000000)

# header = FALSE keeps the first entry as data rather than as column names.
system.time(fread(text = paste(x, collapse = "\n"), sep = "/", header = FALSE,
                  fill = TRUE, na.strings = ""))
#    user  system elapsed
#   0.473   0.024   0.496

system.time(read.table(text = x, sep = "/", header = FALSE,
                       fill = TRUE, strip.white = TRUE, na.strings = ""))
#    user  system elapsed
#   0.519   0.026   0.543

system.time({ # Using data.table::tstrsplit
  y <- do.call(cbind, data.table::tstrsplit(x, "/", TRUE))
  y <- trimws(y, whitespace = " ")
})
#    user  system elapsed
#   2.035   0.051   2.067
数据
# Sample input: separators with spaces, separators without spaces, and an
# entry with no city.
x <- c("Africa / Kenya / Nairobi", "Africa/Kenya/Nairobi", "Africa / Kenya")
我觉得这个也可以用:
library(readr)
# Reads the sample vector `x`; the input name used previously was not defined
# anywhere in this example.
# I() marks the vector as literal data; readr >= 2.0 would otherwise treat a
# character vector whose strings contain no newline as a set of file paths.
xx <- readr::read_delim(I(x), delim = "/", quote = "", trim_ws = TRUE,
                        col_names = FALSE)
xx
# # A tibble: 3 x 3
#   X1     X2    X3
#   <chr>  <chr> <chr>
# 1 Africa Kenya Nairobi
# 2 Africa Kenya Nairobi
# 3 Africa Kenya NA