拆分字符串但忽略给定字符包围的分隔符
split a string but ignore separators surrounded by given characters
我想拆分一个字符串,但只有在它没有被给定的字符集包围时才使用分隔符
当前:
# Baseline: plain strsplit() cuts at every "?", including those inside
# brackets. `split` is a regular expression, so a literal "?" is the regex
# \? -- which must be written "\\?" in R source ("\?" is not a valid R escape).
strsplit("1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}", "\\?")
#> [[1]]
#> [1] "1 " " 2 " " (3 " " 4) " " {5 " " (6 " " 7)}"
预期:
#' Desired interface: a `strsplit()` variant that ignores enclosed separators
#'
#' Placeholder only: the body is intentionally left unimplemented, so calling
#' it currently returns `NULL`. The `#>` lines after the example call show the
#' result that is wanted, not what this stub produces.
#'
#' @param x,split,fixed,perl,useBytes Presumably meant to mirror the arguments
#'   of `base::strsplit()` -- TODO confirm once implemented.
#' @param escape Character vector of two-character strings; presumably each
#'   gives the opening and closing character of a region in which separators
#'   must be ignored -- TODO confirm once implemented.
strsplit2 <- function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE,
                      escape = c("()", "{}", "[]", "''", '""', "%%")) {
  # ...
}
# A literal "?" separator is the regex \?, written "\\?" in R source.
strsplit2("1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}", "\\?")
#> [[1]]
#> [1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
我通过一些复杂的解析解决了这个问题,但我担心性能问题,想知道正则表达式是否可以更快。
仅供参考:
我目前的解决方案(与问题不太相关)是:
# Split an expression string on `?` by parsing it with str2lang() and walking
# the resulting call tree, instead of scanning characters.
#
# x: a string holding a single R expression (it is passed to str2lang()).
# Returns the parsed object unchanged when it is a symbol or when no `?` call
# is found at the top; otherwise the collected operands combined with c().
# NOTE(review): the type of the result depends on what c() does with the
# operands (language objects vs. constants) -- confirm against callers.
parse_qm_args <- function(x){
x <- str2lang(x)
# a lone symbol has nothing to split
if(is.symbol(x)) return(x)
# `i` is the index path (a vector of 2s) to the sub-call currently inspected;
# an empty path means the top-level expression itself
i <- numeric(0)
out <- character(0)
# descend while the head of the current sub-call is `?` and, below the top
# level, that sub-call has exactly three elements (operator + two operands)
while(identical(x[[c(i,1)]], quote(`?`)) &&
(!length(i) || length(x[[i]]) == 3)){
# prepend the third element (the right-hand operand) of the current call
out <- c(x[[c(i,3)]],out)
# extend the path by one more 2, i.e. step into the first operand
i <- c(2, i)
}
# if no `?` was found
if(!length(out)) return(x)
if(length(x[[i]]) == 2) {
# the node where the walk stopped has two elements (a unary call):
# fetch its single argument
out <- c(x[[c(i,2)]],out)
} else {
# otherwise the node itself is the remaining first operand
out <- c(x[[c(i)]], out)
}
out
}
`(*SKIP)(*FAIL)` 和 `perl = TRUE` 是你的朋友:
# Split on "?" except inside a single (non-nested) {...} or (...) group.
# Such a group is matched first and then thrown away with (*SKIP)(*FAIL), so
# only the "?" outside those groups is left to act as the separator.
# (*SKIP)/(*FAIL) are PCRE verbs, hence perl = TRUE; every regex backslash is
# doubled because it sits inside an R string literal.
some_string <- "1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}"
pattern <- "(?:\\{[^{}]*\\}|\\([^()]*\\))(*SKIP)(*FAIL)|\\?"
some_parts <- strsplit(some_string, pattern, perl = TRUE)
some_parts
这会产生
[[1]]
[1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
参见 regex101.com 上的演示。注意:这种写法不适用于嵌套结构。
这是@CodeManiac 的想法的实现,并进行了一些优化和处理边缘情况。
#' Split a string on `?`, ignoring separators inside brackets or quotes
#'
#' Scans `x` one character at a time. A `?` is a separator only when it is
#' outside every `(...)` / `{...}` group and outside every quoted region
#' (`'...'`, `"..."`, `%...%`).
#'
#' Bracket depth and quote state are tracked independently:
#' * a quoted region is closed only by the same character that opened it;
#' * brackets inside a quoted region are plain text and do not change depth;
#' * closing a quote leaves the surrounding bracket depth untouched;
#' * an unmatched closing bracket is ignored rather than driving the depth
#'   negative (which would disable splitting for the rest of the string).
#'
#' @param x A single, non-missing character string (may be empty).
#' @return A character vector of the pieces between separators; always at
#'   least one element (`""` for empty input).
splitter <- function(x) {
  stopifnot(is.character(x), length(x) == 1L, !is.na(x))

  chars <- strsplit(x, "")[[1]]
  openers <- c("{", "(")
  closers <- c("}", ")")
  quote_chars <- c("'", '"', "%")

  depth <- 0L          # current bracket nesting level
  open_quote <- NULL   # the quote character we are inside of, if any
  cut_at <- integer(0) # positions of the separators found so far

  # seq_along() so that an empty string simply skips the loop
  for (i in seq_along(chars)) {
    ch <- chars[i]
    if (!is.null(open_quote)) {
      # inside quotes everything is literal until the matching quote
      if (ch == open_quote) {
        open_quote <- NULL
      }
    } else if (ch %in% quote_chars) {
      open_quote <- ch
    } else if (ch %in% openers) {
      depth <- depth + 1L
    } else if (ch %in% closers) {
      depth <- max(depth - 1L, 0L)
    } else if (ch == "?" && depth == 0L) {
      cut_at <- c(cut_at, i)
    }
  }

  # Cut the pieces out in one vectorised call; substring() yields "" when
  # start > stop, which covers leading/trailing separators and empty input.
  substring(x, c(1L, cut_at + 1L), c(cut_at - 1L, length(chars)))
}
结果:
x1 <- "1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}"
splitter(x1)
#> [1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
x2 <- "1 ? 2 ? '3 ? 4' ? {5 ? (6 ? 7)}"
splitter(x2)
#> [1] "1 " " 2 " " '3 ? 4' " " {5 ? (6 ? 7)}"
我在写问题时没有想到的一个边缘情况,引号之间的字符不是分隔符的候选者
x3 <- "1 ? 2 ? '3 {(? 4' ? {5 ? (6 ? 7)}"
splitter(x3)
#> [1] "1 " " 2 " " '3 {(? 4' " " {5 ? (6 ? 7)}"
基准
到目前为止,基于解析(parse)的方案最快,大约快 10 倍;不过上面的循环方案可以用 Rcpp 进一步优化,解析方案本身或许也还有优化空间。
Jan 和 Onyambu 的解决方案更加紧凑和优雅。 Onyambu 处理嵌套、引号和引号中分隔符的边缘情况(尽管不是问题的一部分),而 Jan 则没有。而且它们的速度几乎一样快。
#' Split on `?`, skipping single-level `{...}` and `(...)` groups
#'
#' A whole `{...}` or `(...)` group is matched first and then discarded with
#' `(*SKIP)(*FAIL)`, so only a `?` outside such a group can be the separator.
#' The character classes exclude the group's own brackets, so same-type
#' nesting such as `((a ? b))` is not handled.
#'
#' @param x Character input; only the split of its first element is returned.
#' @return Character vector of pieces.
regex_split_jan <- function(x) {
  # regex backslashes are doubled because they sit inside an R string literal
  pattern <- "(?:\\{[^{}]*\\}|\\([^()]*\\))(*SKIP)(*FAIL)|\\?"
  strsplit(x, pattern, perl = TRUE)[[1]]
}
#' Split on `?`, skipping (possibly nested) bracketed or quoted groups
#'
#' Capture group 1 matches an opener from `( { '`, then any mix of
#' non-delimiter text and recursive matches of itself via `(?1)`, then a
#' closer from `' ) }`. Each such group is discarded with `(*SKIP)(*FAIL)`,
#' so only a `?` outside every group is used as the separator.
#' NOTE(review): the opener and closer classes are independent, so the pattern
#' does not require the closer to be of the same kind as the opener.
#'
#' @param x Character input; only the split of its first element is returned.
#' @return Character vector of pieces.
regex_split_onyambu <- function(x) {
  # "\\?" is the regex \? (a literal question mark) in R string syntax
  pattern <- "([({'](?:[^(){}']*|(?1))*[')}])(*SKIP)(*FAIL)|\\?"
  strsplit(x, pattern, perl = TRUE)[[1]]
}
# Time the four approaches on the same input. The three string splitters
# return character pieces, so their output is additionally run through
# parse(text = ...) and as.list(); parse_qm_args() is called directly on `x`.
# NOTE(review): `x` is not defined in the visible snippet -- presumably one of
# the example strings (e.g. x1); confirm before running.
microbenchmark::microbenchmark(
regex_jan = as.list(parse(text=regex_split_jan(x))),
regex_onyambu = as.list(parse(text=regex_split_onyambu(x))),
loop = as.list(parse(text=splitter(x))),
parse = parse_qm_args(x)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> regex_jan 89.1 92.15 112.114 92.95 94.45 1893.5 100 b
#> regex_onyambu 91.0 93.50 116.850 94.95 96.45 2056.1 100 b
#> loop 122.0 125.95 130.289 128.30 131.20 169.8 100 b
#> parse 10.7 13.55 14.642 14.80 15.65 25.3 100 a
最好的办法是使用递归。在这种情况下,您将捕获所有分组的元素,然后在未分组的分隔符上拆分:
# Recursive variant: group 1 matches an opener, then any mix of plain text and
# nested groups via (?1), then a closer; the whole group is discarded with
# (*SKIP)(*FAIL), so only a "?" outside every group is used for splitting.
# "\\?" is the regex \? written as an R string literal.
pattern <- "([({'](?:[^(){}']*|(?1))*[')}])(*SKIP)(*FAIL)|\\?"
x1 <- "1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}"
x2 <- "1 ? 2 ? '3 ? 4' ? {5 ? (6 ? 7)}"
x3 <- "1 ? 2 ? '3 {(? 4' ? {5 ? (6 ? 7)}"
x4 <- "1 ? 2 ? '(3 ? 4) ? {5 ? (6 ? 7)}'"
strsplit(c(x1, x2, x3, x4), pattern, perl = TRUE)
[[1]]
[1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
[[2]]
[1] "1 " " 2 " " '3 ? 4' " " {5 ? (6 ? 7)}"
[[3]]
[1] "1 " " 2 " " '3 {(? 4' " " {5 ? (6 ? 7)}"
[[4]]
[1] "1 " " 2 " " '(3 ? 4) ? {5 ? (6 ? 7)}'"
我想拆分一个字符串,但只有在它没有被给定的字符集包围时才使用分隔符
当前:
# Baseline: plain strsplit() cuts at every "?", including those inside
# brackets. `split` is a regular expression, so a literal "?" is the regex
# \? -- which must be written "\\?" in R source ("\?" is not a valid R escape).
strsplit("1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}", "\\?")
#> [[1]]
#> [1] "1 " " 2 " " (3 " " 4) " " {5 " " (6 " " 7)}"
预期:
#' Desired interface: a `strsplit()` variant that ignores enclosed separators
#'
#' Placeholder only: the body is intentionally left unimplemented, so calling
#' it currently returns `NULL`. The `#>` lines after the example call show the
#' result that is wanted, not what this stub produces.
#'
#' @param x,split,fixed,perl,useBytes Presumably meant to mirror the arguments
#'   of `base::strsplit()` -- TODO confirm once implemented.
#' @param escape Character vector of two-character strings; presumably each
#'   gives the opening and closing character of a region in which separators
#'   must be ignored -- TODO confirm once implemented.
strsplit2 <- function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE,
                      escape = c("()", "{}", "[]", "''", '""', "%%")) {
  # ...
}
# A literal "?" separator is the regex \?, written "\\?" in R source.
strsplit2("1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}", "\\?")
#> [[1]]
#> [1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
我通过一些复杂的解析解决了这个问题,但我担心性能问题,想知道正则表达式是否可以更快。
仅供参考:
我目前的解决方案(与问题不太相关)是:
# Split an expression string on `?` by parsing it with str2lang() and walking
# the resulting call tree, instead of scanning characters.
#
# x: a string holding a single R expression (it is passed to str2lang()).
# Returns the parsed object unchanged when it is a symbol or when no `?` call
# is found at the top; otherwise the collected operands combined with c().
# NOTE(review): the type of the result depends on what c() does with the
# operands (language objects vs. constants) -- confirm against callers.
parse_qm_args <- function(x){
x <- str2lang(x)
# a lone symbol has nothing to split
if(is.symbol(x)) return(x)
# `i` is the index path (a vector of 2s) to the sub-call currently inspected;
# an empty path means the top-level expression itself
i <- numeric(0)
out <- character(0)
# descend while the head of the current sub-call is `?` and, below the top
# level, that sub-call has exactly three elements (operator + two operands)
while(identical(x[[c(i,1)]], quote(`?`)) &&
(!length(i) || length(x[[i]]) == 3)){
# prepend the third element (the right-hand operand) of the current call
out <- c(x[[c(i,3)]],out)
# extend the path by one more 2, i.e. step into the first operand
i <- c(2, i)
}
# if no `?` was found
if(!length(out)) return(x)
if(length(x[[i]]) == 2) {
# the node where the walk stopped has two elements (a unary call):
# fetch its single argument
out <- c(x[[c(i,2)]],out)
} else {
# otherwise the node itself is the remaining first operand
out <- c(x[[c(i)]], out)
}
out
}
`(*SKIP)(*FAIL)` 和 `perl = TRUE` 是你的朋友:
# Split on "?" except inside a single (non-nested) {...} or (...) group.
# Such a group is matched first and then thrown away with (*SKIP)(*FAIL), so
# only the "?" outside those groups is left to act as the separator.
# (*SKIP)/(*FAIL) are PCRE verbs, hence perl = TRUE; every regex backslash is
# doubled because it sits inside an R string literal.
some_string <- "1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}"
pattern <- "(?:\\{[^{}]*\\}|\\([^()]*\\))(*SKIP)(*FAIL)|\\?"
some_parts <- strsplit(some_string, pattern, perl = TRUE)
some_parts
这会产生
[[1]]
[1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
参见 regex101.com 上的演示。注意:这种写法不适用于嵌套结构。
这是@CodeManiac 的想法的实现,并进行了一些优化和处理边缘情况。
#' Split a string on `?`, ignoring separators inside brackets or quotes
#'
#' Scans `x` one character at a time. A `?` is a separator only when it is
#' outside every `(...)` / `{...}` group and outside every quoted region
#' (`'...'`, `"..."`, `%...%`).
#'
#' Bracket depth and quote state are tracked independently:
#' * a quoted region is closed only by the same character that opened it;
#' * brackets inside a quoted region are plain text and do not change depth;
#' * closing a quote leaves the surrounding bracket depth untouched;
#' * an unmatched closing bracket is ignored rather than driving the depth
#'   negative (which would disable splitting for the rest of the string).
#'
#' @param x A single, non-missing character string (may be empty).
#' @return A character vector of the pieces between separators; always at
#'   least one element (`""` for empty input).
splitter <- function(x) {
  stopifnot(is.character(x), length(x) == 1L, !is.na(x))

  chars <- strsplit(x, "")[[1]]
  openers <- c("{", "(")
  closers <- c("}", ")")
  quote_chars <- c("'", '"', "%")

  depth <- 0L          # current bracket nesting level
  open_quote <- NULL   # the quote character we are inside of, if any
  cut_at <- integer(0) # positions of the separators found so far

  # seq_along() so that an empty string simply skips the loop
  for (i in seq_along(chars)) {
    ch <- chars[i]
    if (!is.null(open_quote)) {
      # inside quotes everything is literal until the matching quote
      if (ch == open_quote) {
        open_quote <- NULL
      }
    } else if (ch %in% quote_chars) {
      open_quote <- ch
    } else if (ch %in% openers) {
      depth <- depth + 1L
    } else if (ch %in% closers) {
      depth <- max(depth - 1L, 0L)
    } else if (ch == "?" && depth == 0L) {
      cut_at <- c(cut_at, i)
    }
  }

  # Cut the pieces out in one vectorised call; substring() yields "" when
  # start > stop, which covers leading/trailing separators and empty input.
  substring(x, c(1L, cut_at + 1L), c(cut_at - 1L, length(chars)))
}
结果:
x1 <- "1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}"
splitter(x1)
#> [1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
x2 <- "1 ? 2 ? '3 ? 4' ? {5 ? (6 ? 7)}"
splitter(x2)
#> [1] "1 " " 2 " " '3 ? 4' " " {5 ? (6 ? 7)}"
我在写问题时没有想到的一个边缘情况,引号之间的字符不是分隔符的候选者
x3 <- "1 ? 2 ? '3 {(? 4' ? {5 ? (6 ? 7)}"
splitter(x3)
#> [1] "1 " " 2 " " '3 {(? 4' " " {5 ? (6 ? 7)}"
基准
到目前为止,基于解析(parse)的方案最快,大约快 10 倍;不过上面的循环方案可以用 Rcpp 进一步优化,解析方案本身或许也还有优化空间。
Jan 和 Onyambu 的解决方案更加紧凑和优雅。 Onyambu 处理嵌套、引号和引号中分隔符的边缘情况(尽管不是问题的一部分),而 Jan 则没有。而且它们的速度几乎一样快。
#' Split on `?`, skipping single-level `{...}` and `(...)` groups
#'
#' A whole `{...}` or `(...)` group is matched first and then discarded with
#' `(*SKIP)(*FAIL)`, so only a `?` outside such a group can be the separator.
#' The character classes exclude the group's own brackets, so same-type
#' nesting such as `((a ? b))` is not handled.
#'
#' @param x Character input; only the split of its first element is returned.
#' @return Character vector of pieces.
regex_split_jan <- function(x) {
  # regex backslashes are doubled because they sit inside an R string literal
  pattern <- "(?:\\{[^{}]*\\}|\\([^()]*\\))(*SKIP)(*FAIL)|\\?"
  strsplit(x, pattern, perl = TRUE)[[1]]
}
#' Split on `?`, skipping (possibly nested) bracketed or quoted groups
#'
#' Capture group 1 matches an opener from `( { '`, then any mix of
#' non-delimiter text and recursive matches of itself via `(?1)`, then a
#' closer from `' ) }`. Each such group is discarded with `(*SKIP)(*FAIL)`,
#' so only a `?` outside every group is used as the separator.
#' NOTE(review): the opener and closer classes are independent, so the pattern
#' does not require the closer to be of the same kind as the opener.
#'
#' @param x Character input; only the split of its first element is returned.
#' @return Character vector of pieces.
regex_split_onyambu <- function(x) {
  # "\\?" is the regex \? (a literal question mark) in R string syntax
  pattern <- "([({'](?:[^(){}']*|(?1))*[')}])(*SKIP)(*FAIL)|\\?"
  strsplit(x, pattern, perl = TRUE)[[1]]
}
# Time the four approaches on the same input. The three string splitters
# return character pieces, so their output is additionally run through
# parse(text = ...) and as.list(); parse_qm_args() is called directly on `x`.
# NOTE(review): `x` is not defined in the visible snippet -- presumably one of
# the example strings (e.g. x1); confirm before running.
microbenchmark::microbenchmark(
regex_jan = as.list(parse(text=regex_split_jan(x))),
regex_onyambu = as.list(parse(text=regex_split_onyambu(x))),
loop = as.list(parse(text=splitter(x))),
parse = parse_qm_args(x)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> regex_jan 89.1 92.15 112.114 92.95 94.45 1893.5 100 b
#> regex_onyambu 91.0 93.50 116.850 94.95 96.45 2056.1 100 b
#> loop 122.0 125.95 130.289 128.30 131.20 169.8 100 b
#> parse 10.7 13.55 14.642 14.80 15.65 25.3 100 a
最好的办法是使用递归。在这种情况下,您将捕获所有分组的元素,然后在未分组的分隔符上拆分:
# Recursive variant: group 1 matches an opener, then any mix of plain text and
# nested groups via (?1), then a closer; the whole group is discarded with
# (*SKIP)(*FAIL), so only a "?" outside every group is used for splitting.
# "\\?" is the regex \? written as an R string literal.
pattern <- "([({'](?:[^(){}']*|(?1))*[')}])(*SKIP)(*FAIL)|\\?"
x1 <- "1 ? 2 ? (3 ? 4) ? {5 ? (6 ? 7)}"
x2 <- "1 ? 2 ? '3 ? 4' ? {5 ? (6 ? 7)}"
x3 <- "1 ? 2 ? '3 {(? 4' ? {5 ? (6 ? 7)}"
x4 <- "1 ? 2 ? '(3 ? 4) ? {5 ? (6 ? 7)}'"
strsplit(c(x1, x2, x3, x4), pattern, perl = TRUE)
[[1]]
[1] "1 " " 2 " " (3 ? 4) " " {5 ? (6 ? 7)}"
[[2]]
[1] "1 " " 2 " " '3 ? 4' " " {5 ? (6 ? 7)}"
[[3]]
[1] "1 " " 2 " " '3 {(? 4' " " {5 ? (6 ? 7)}"
[[4]]
[1] "1 " " 2 " " '(3 ? 4) ? {5 ? (6 ? 7)}'"