替换R中字符串中的第N个字符

Replacing N-th character in a string in R

在加拿大,邮政编码的格式为“B7J 6B1”。 为了清理邮政编码,我需要替换所有拼写错误,以便“81J 8BL”变成“B7J 6B1”。 换句话说,我需要一个函数来将字符串 .str 中的第 N 个字符从“A”替换为“B”(即,如果它是 A,则将其替换为 B,否则它什么也不做 - 类似于str_replace() 功能,但在字符级别):

str_replaceCharacter <- function (.str, N, a, b) { ... }

理想情况下,我需要一个非常快速的函数,这样我就可以对数百万条记录运行它,例如:

dt[, ZIP := str_replaceCharacter (ZIP, 1, "8", "B")] [
   , ZIP := str_replaceCharacter (ZIP, 3, "8", "B")] [
   , ZIP := str_replaceCharacter (ZIP, 5, "8", "B")] [
   , ZIP := str_replaceCharacter (ZIP, 2, "L", "1")] [
   , ZIP := str_replaceCharacter (ZIP, 4, "L", "1")] [
   , ZIP := str_replaceCharacter (ZIP, 6, "L", "1")] [ and so on - 30 more lines like this]

不知道它的性能如何,但这里有一个不依赖正则表达式的 base R 解决方案:

#' Replace the character at a fixed position when it equals a given character
#'
#' Vectorized over `string`: every element whose `index`-th character equals
#' `pattern` gets that character replaced by `replacement`; all other elements
#' (including `NA` and strings shorter than `index`) are returned unchanged.
#'
#' @param string A character vector.
#' @param index Position (1-based) of the character to inspect.
#' @param pattern The single character that should be replaced.
#' @param replacement The single character to put in its place.
#' @return A character vector the same length as `string`.
str_replace_character <- function(string, index, pattern, replacement) {
  # which() drops NA comparisons (from NA inputs); a logical mask containing
  # NA would make the subscripted assignment below fail.
  needs_replacement <- which(substr(string, index, index) == pattern)
  substr(string[needs_replacement], index, index) <- replacement
  string
}

str_replace_character(c("B7J 6B1", "81J 8BL", "ABC"), 1, "8", "B")
# [1] "B7J 6B1" "B1J 8BL" "ABC"  

您可以使用以下

#' Replace the N-th character of a single string when it equals `a`
#'
#' Works on one string at a time (use `lapply()`/`vapply()` for a vector).
#' If the N-th character is not `a`, or does not exist (string shorter than
#' `N`, empty string, or `NA` input), the input is returned unchanged.
#'
#' @param my.string A single character string.
#' @param N Position (1-based) of the character to inspect.
#' @param a The character that should be replaced.
#' @param b The character to put in its place.
#' @return The string with its N-th character replaced by `b` when it was `a`.
str_replaceCharacter <- function(my.string, N, a, b) {
  characters <- unlist(strsplit(my.string, ""))

  # isTRUE() treats an NA comparison (N past the end, or NA input) as
  # "no match" instead of letting `if` fail on a missing condition.
  if (!isTRUE(characters[N] == a)) {
    return(my.string)
  }

  characters[N] <- b
  paste(characters, collapse = "")
}



# Example: the second character of the sample code is "1", so it is
# replaced by "7".
my.string <- "81J 8BL"

replaced <- str_replaceCharacter(my.string, 2, "1", "7")
# Typing the name at top level prints the result.
replaced

然后简单地使用 apply 系列函数,因为这通常非常快。但是,如果这仍然不够快,我建议使用 mclapply,它是 lapply 的多核(multicore)实现。

从性能的角度来看,这个实验

#' Replace the N-th character of a single string when it equals `a`
#'
#' Works on one string at a time (use `lapply()`/`vapply()` for a vector).
#' If the N-th character is not `a`, or does not exist (string shorter than
#' `N`, empty string, or `NA` input), the input is returned unchanged.
#'
#' @param my.string A single character string.
#' @param N Position (1-based) of the character to inspect.
#' @param a The character that should be replaced.
#' @param b The character to put in its place.
#' @return The string with its N-th character replaced by `b` when it was `a`.
str_replaceCharacter <- function(my.string, N, a, b) {
  characters <- unlist(strsplit(my.string, ""))

  # isTRUE() treats an NA comparison (N past the end, or NA input) as
  # "no match" instead of letting `if` fail on a missing condition.
  if (!isTRUE(characters[N] == a)) {
    return(my.string)
  }

  characters[N] <- b
  paste(characters, collapse = "")
}



# Benchmark inputs: the same sample postal code repeated 10, 100, ... up to
# 10,000,000 times, so timings can be compared across input sizes.
my.string <- "81J 8BL"

first   <- rep(my.string, times = 10)
second  <- rep(my.string, times = 100)
third   <- rep(my.string, times = 1000)
fourth  <- rep(my.string, times = 10000)
fifth   <- rep(my.string, times = 100000)
sixth   <- rep(my.string, times = 1000000)
seventh <- rep(my.string, times = 10000000)

导致

> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
      0       0       0 
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.001   0.000   0.000 
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.004   0.000   0.005 
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.048   0.000   0.048 
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.484   0.000   0.485 
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  5.487   0.000   5.487 
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
 66.065   0.286  66.356 

以及变量 elapsed 的绘图。

另一个回答在我的电脑上得到以下结果。

> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.003   0.000   0.002 
> 
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.000   0.000   0.001 
> 
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.003   0.000   0.004 
> 
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.037   0.000   0.037 
> 
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.359   0.000   0.359 
> 
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  4.990   0.019   5.010 
> 
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
 49.599   0.167  49.764 

而另一个回答得到以下结果:

> system.time(lapply(first, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.003   0.000   0.027 
> 
> system.time(lapply(second, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.001   0.000   0.001 
> 
> system.time(lapply(third, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.005   0.000   0.006 
> 
> system.time(lapply(fourth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.056   0.000   0.055 
> 
> system.time(lapply(fifth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  0.588   0.000   0.588 
> 
> system.time(lapply(sixth, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
  6.067   0.000   6.065 
> 
> system.time(lapply(seventh, str_replaceCharacter, N = 2, a = "1", b = "7"))
   user  system elapsed 
 81.439   0.016  81.449 

因此 Thomas 的回答似乎表现最好。

我们可以使用 stringr 包中的 str_sub:

library(stringr)
#' Replace the character at a fixed position when it equals a given character
#'
#' stringr-based variant, vectorized over `string`: every element whose
#' `index`-th character equals `pattern` gets that character replaced by
#' `replacement`; all other elements (including `NA`) are returned unchanged.
#'
#' @param string A character vector.
#' @param index Position (1-based) of the character to inspect.
#' @param pattern The single character that should be replaced.
#' @param replacement The single character to put in its place.
#' @return A character vector the same length as `string`.
str_replace_character <- function(string, index, pattern, replacement) {
  # which() drops NA comparisons (from NA inputs); a logical mask containing
  # NA would make the subscripted assignment below fail.
  needs_replacement <- which(str_sub(string, index, index) == pattern)
  str_sub(string[needs_replacement], index, index) <- replacement
  string
}
str_replace_character(c("B7J 6B1", "81J 8BL", "ABC"), 1, "8", "B")