替换R中字符串中的第N个字符
Replacing N-th character in a string in R
在加拿大,邮政编码的格式为“B7J 6B1”。
为了清理邮政编码,我需要替换所有拼写错误,以便“81J 8BL”变成“B7J 6B1”。
换句话说,我需要一个函数来将字符串 .str
中的第 N 个字符从“A”替换为“B”(即,如果它是 A,则将其替换为 B,否则它什么也不做 - 类似于str_replace()
功能,但在字符级别):
str_replaceCharacter <- function (.str, N, a, b) { ... }
理想情况下,我需要一个非常快速的函数,这样我就可以在数百万条记录上运行它,例如:
dt[, ZIP := str_replaceCharacter (ZIP, 1, "8", "B")] [
, ZIP := str_replaceCharacter (ZIP, 3, "8", "B")] [
, ZIP := str_replaceCharacter (ZIP, 5, "8", "B")] [
, ZIP := str_replaceCharacter (ZIP, 2, "L", "1")] [
, ZIP := str_replaceCharacter (ZIP, 4, "L", "1")] [
, ZIP := str_replaceCharacter (ZIP, 6, "L", "1")] [ and so on - 30 more lines like this]
不知道它的性能如何,但这里有一个 base
不依赖正则表达式的解决方案:
#' Replace the character at a given position when it matches a pattern
#'
#' Vectorised over `string`: every element whose character at position
#' `index` equals `pattern` gets that character replaced by `replacement`.
#' All other elements (including `NA` and strings shorter than `index`)
#' are returned unchanged.
#'
#' @param string Character vector to clean.
#' @param index Position of the character to inspect (as used by `substr()`).
#' @param pattern Character to look for at position `index`.
#' @param replacement Character written in place of `pattern`.
#' @return A character vector the same length as `string`.
str_replace_character <- function(string, index, pattern, replacement) {
  # which() drops NA comparisons (produced by NA inputs). A logical mask
  # containing NA would make the subscripted assignment below fail with
  # "NAs are not allowed in subscripted assignments".
  needs_replacement <- which(substr(string, index, index) == pattern)
  substr(string[needs_replacement], index, index) <- replacement
  string
}
str_replace_character(c("B7J 6B1", "81J 8BL", "ABC"), 1, "8", "B")
# [1] "B7J 6B1" "B1J 8BL" "ABC"
您可以使用以下
#' Replace the N-th character of a single string if it equals `a`
#'
#' Splits the string into characters, swaps the N-th one for `b` when it
#' is exactly `a`, and glues the characters back together. Intended for a
#' single string; apply it over a vector with `lapply()` / `vapply()`.
#'
#' @param my.string The string to correct.
#' @param N Position of the character to inspect.
#' @param a Character expected at position `N`.
#' @param b Character written in place of `a`.
#' @return The corrected string, or `my.string` unchanged when position
#'   `N` does not hold `a`.
str_replaceCharacter <- function(my.string, N, a, b) {
  characters <- unlist(strsplit(my.string, ""))
  # isTRUE() treats an NA or zero-length comparison (N past the end of the
  # string, an empty string, or an NA input) as "no match" instead of
  # letting `if` raise an error.
  if (isTRUE(characters[N] == a)) {
    characters[N] <- b
    # Rebuild only on a match so NA / empty inputs pass through untouched.
    my.string <- paste(characters, collapse = "")
  }
  my.string
}
my.string <- "81J 8BL"
replaced <- str_replaceCharacter(my.string, 2, "1", "7")
replaced
然后简单地使用 apply 系列函数(例如 lapply),因为这通常非常快。
但是,如果这仍然不够快,我建议使用 mclapply,
它是 lapply 的多核(multicore)实现。
从性能的角度来看,这个实验
#' Replace the N-th character of a single string if it equals `a`
#'
#' Function under test in the timing experiment: it splits the string into
#' characters, swaps the N-th one for `b` when it is exactly `a`, and
#' pastes the characters back together. Works on one string at a time.
#'
#' @param my.string The string to correct.
#' @param N Position of the character to inspect.
#' @param a Character expected at position `N`.
#' @param b Character written in place of `a`.
#' @return The corrected string, or `my.string` unchanged when position
#'   `N` does not hold `a`.
str_replaceCharacter <- function(my.string, N, a, b) {
  characters <- unlist(strsplit(my.string, ""))
  # isTRUE() turns an NA or zero-length comparison (N out of range, empty
  # or NA input) into "no match" rather than an error from `if`.
  if (isTRUE(characters[N] == a)) {
    characters[N] <- b
    # Rebuild only on a match so NA / empty inputs pass through untouched.
    my.string <- paste(characters, collapse = "")
  }
  my.string
}
# Benchmark inputs: the same postal code repeated 10^1 ... 10^7 times.
my.string <- "81J 8BL"
first   <- rep_len(my.string, 1e1)
second  <- rep_len(my.string, 1e2)
third   <- rep_len(my.string, 1e3)
fourth  <- rep_len(my.string, 1e4)
fifth   <- rep_len(my.string, 1e5)
sixth   <- rep_len(my.string, 1e6)
seventh <- rep_len(my.string, 1e7)
导致
> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0 0 0
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.001 0.000 0.000
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.004 0.000 0.005
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.048 0.000 0.048
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.484 0.000 0.485
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
5.487 0.000 5.487
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
66.065 0.286 66.356
以及变量 elapsed
的绘图。
该回答在我的电脑上得到以下结果:
> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.003 0.000 0.002
>
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.000 0.000 0.001
>
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.003 0.000 0.004
>
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.037 0.000 0.037
>
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.359 0.000 0.359
>
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
4.990 0.019 5.010
>
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
49.599 0.167 49.764
并且答案导致
> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.003 0.000 0.027
>
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.001 0.000 0.001
>
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.005 0.000 0.006
>
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.056 0.000 0.055
>
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.588 0.000 0.588
>
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
6.067 0.000 6.065
>
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
81.439 0.016 81.449
因此 Thomas 的回答似乎表现最好。
我们可以使用 stringr 包中的 str_sub:
library(stringr)
#' Replace the character at a given position when it matches a pattern
#'
#' stringr-based variant: every element of `string` whose character at
#' position `index` equals `pattern` gets that character replaced by
#' `replacement`; all other elements (including `NA`) are left unchanged.
#'
#' @param string Character vector to clean.
#' @param index Position of the character to inspect (passed to `str_sub()`
#'   as both start and end).
#' @param pattern Character to look for at position `index`.
#' @param replacement Character written in place of `pattern`.
#' @return A character vector the same length as `string`.
str_replace_character <- function(string, index, pattern, replacement) {
  # which() drops NA comparisons (produced by NA inputs). A logical mask
  # containing NA would make the subscripted assignment below fail with
  # "NAs are not allowed in subscripted assignments".
  needs_replacement <- which(str_sub(string, index, index) == pattern)
  str_sub(string[needs_replacement], index, index) <- replacement
  string
}
str_replace_character(c("B7J 6B1", "81J 8BL", "ABC"), 1, "8", "B")
在加拿大,邮政编码的格式为“B7J 6B1”。
为了清理邮政编码,我需要替换所有拼写错误,以便“81J 8BL”变成“B7J 6B1”。
换句话说,我需要一个函数来将字符串 .str
中的第 N 个字符从“A”替换为“B”(即,如果它是 A,则将其替换为 B,否则它什么也不做 - 类似于str_replace()
功能,但在字符级别):
str_replaceCharacter <- function (.str, N, a, b) { ... }
理想情况下,我需要一个非常快速的函数,这样我就可以在数百万条记录上运行它,例如:
dt[, ZIP := str_replaceCharacter (ZIP, 1, "8", "B")] [
, ZIP := str_replaceCharacter (ZIP, 3, "8", "B")] [
, ZIP := str_replaceCharacter (ZIP, 5, "8", "B")] [
, ZIP := str_replaceCharacter (ZIP, 2, "L", "1")] [
, ZIP := str_replaceCharacter (ZIP, 4, "L", "1")] [
, ZIP := str_replaceCharacter (ZIP, 6, "L", "1")] [ and so on - 30 more lines like this]
不知道它的性能如何,但这里有一个 base
不依赖正则表达式的解决方案:
#' Replace the character at a given position when it matches a pattern
#'
#' Vectorised over `string`: every element whose character at position
#' `index` equals `pattern` gets that character replaced by `replacement`.
#' All other elements (including `NA` and strings shorter than `index`)
#' are returned unchanged.
#'
#' @param string Character vector to clean.
#' @param index Position of the character to inspect (as used by `substr()`).
#' @param pattern Character to look for at position `index`.
#' @param replacement Character written in place of `pattern`.
#' @return A character vector the same length as `string`.
str_replace_character <- function(string, index, pattern, replacement) {
  # which() drops NA comparisons (produced by NA inputs). A logical mask
  # containing NA would make the subscripted assignment below fail with
  # "NAs are not allowed in subscripted assignments".
  needs_replacement <- which(substr(string, index, index) == pattern)
  substr(string[needs_replacement], index, index) <- replacement
  string
}
str_replace_character(c("B7J 6B1", "81J 8BL", "ABC"), 1, "8", "B")
# [1] "B7J 6B1" "B1J 8BL" "ABC"
您可以使用以下
#' Replace the N-th character of a single string if it equals `a`
#'
#' Splits the string into characters, swaps the N-th one for `b` when it
#' is exactly `a`, and glues the characters back together. Intended for a
#' single string; apply it over a vector with `lapply()` / `vapply()`.
#'
#' @param my.string The string to correct.
#' @param N Position of the character to inspect.
#' @param a Character expected at position `N`.
#' @param b Character written in place of `a`.
#' @return The corrected string, or `my.string` unchanged when position
#'   `N` does not hold `a`.
str_replaceCharacter <- function(my.string, N, a, b) {
  characters <- unlist(strsplit(my.string, ""))
  # isTRUE() treats an NA or zero-length comparison (N past the end of the
  # string, an empty string, or an NA input) as "no match" instead of
  # letting `if` raise an error.
  if (isTRUE(characters[N] == a)) {
    characters[N] <- b
    # Rebuild only on a match so NA / empty inputs pass through untouched.
    my.string <- paste(characters, collapse = "")
  }
  my.string
}
my.string <- "81J 8BL"
replaced <- str_replaceCharacter(my.string, 2, "1", "7")
replaced
然后简单地使用 apply 系列函数(例如 lapply),因为这通常非常快。
但是,如果这仍然不够快,我建议使用 mclapply,
它是 lapply 的多核(multicore)实现。
从性能的角度来看,这个实验
#' Replace the N-th character of a single string if it equals `a`
#'
#' Function under test in the timing experiment: it splits the string into
#' characters, swaps the N-th one for `b` when it is exactly `a`, and
#' pastes the characters back together. Works on one string at a time.
#'
#' @param my.string The string to correct.
#' @param N Position of the character to inspect.
#' @param a Character expected at position `N`.
#' @param b Character written in place of `a`.
#' @return The corrected string, or `my.string` unchanged when position
#'   `N` does not hold `a`.
str_replaceCharacter <- function(my.string, N, a, b) {
  characters <- unlist(strsplit(my.string, ""))
  # isTRUE() turns an NA or zero-length comparison (N out of range, empty
  # or NA input) into "no match" rather than an error from `if`.
  if (isTRUE(characters[N] == a)) {
    characters[N] <- b
    # Rebuild only on a match so NA / empty inputs pass through untouched.
    my.string <- paste(characters, collapse = "")
  }
  my.string
}
# Benchmark inputs: the same postal code repeated 10^1 ... 10^7 times.
my.string <- "81J 8BL"
first   <- rep_len(my.string, 1e1)
second  <- rep_len(my.string, 1e2)
third   <- rep_len(my.string, 1e3)
fourth  <- rep_len(my.string, 1e4)
fifth   <- rep_len(my.string, 1e5)
sixth   <- rep_len(my.string, 1e6)
seventh <- rep_len(my.string, 1e7)
导致
> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0 0 0
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.001 0.000 0.000
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.004 0.000 0.005
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.048 0.000 0.048
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.484 0.000 0.485
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
5.487 0.000 5.487
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
66.065 0.286 66.356
以及变量 elapsed
的绘图。
> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.003 0.000 0.002
>
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.000 0.000 0.001
>
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.003 0.000 0.004
>
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.037 0.000 0.037
>
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.359 0.000 0.359
>
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
4.990 0.019 5.010
>
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
49.599 0.167 49.764
并且
> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.003 0.000 0.027
>
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.001 0.000 0.001
>
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.005 0.000 0.006
>
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.056 0.000 0.055
>
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
0.588 0.000 0.588
>
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
6.067 0.000 6.065
>
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
user system elapsed
81.439 0.016 81.449
因此 Thomas 的回答似乎表现最好。
我们可以使用 stringr 包中的 str_sub:
library(stringr)
#' Replace the character at a given position when it matches a pattern
#'
#' stringr-based variant: every element of `string` whose character at
#' position `index` equals `pattern` gets that character replaced by
#' `replacement`; all other elements (including `NA`) are left unchanged.
#'
#' @param string Character vector to clean.
#' @param index Position of the character to inspect (passed to `str_sub()`
#'   as both start and end).
#' @param pattern Character to look for at position `index`.
#' @param replacement Character written in place of `pattern`.
#' @return A character vector the same length as `string`.
str_replace_character <- function(string, index, pattern, replacement) {
  # which() drops NA comparisons (produced by NA inputs). A logical mask
  # containing NA would make the subscripted assignment below fail with
  # "NAs are not allowed in subscripted assignments".
  needs_replacement <- which(str_sub(string, index, index) == pattern)
  str_sub(string[needs_replacement], index, index) <- replacement
  string
}
str_replace_character(c("B7J 6B1", "81J 8BL", "ABC"), 1, "8", "B")