用 R 中拼写正确的单词替换拼写错误的单词的功能?

Function to replace incorrectly spelled words with correctly spelled words in R?

我为1000行的样本构建了拼写检查功能以确保其效率,使用'hunspell'包和澳大利亚英语词典。拼写检查器忽略缩写。 我的实际数据有将近 200 万行,因此我需要将 'for' 循环转换为 'apply' 系列函数。

我快到了,但最后一部分不工作。 以下是原始的 for 循环函数:

for(i in 1:nrow(data_words))
{
  print(i)
  
  string1 <- data_words$title[i]
  string2 <- ""
  
  for(j in 1:sapply(strsplit(string1, " "), length))
  {
    w <- word(string1, j)
    
    # if word is not an abbreviation
    if (!isAbbreviation(w))
    {
      # correct word
      w <- correctText(w)
    }
    
    string2 <- paste0(string2, w, sep = " ")
    
    # add word in new column 'spell_check'
    data_words$spell_check[i] <- string2
    
  }
}

isAbbreviation <- function(x)
{
  abb = FALSE
  
  # all capitalised letters
  if(str_detect(x, "^[:upper:]+$"))
  {
    abb = TRUE
  }
  
  # dealing with abbs that end in an 's'
  b = str_extract_all(x, "(\b[A-Z]+\b)|\b[A-Z]+s+\b")
  list_empty = rlang::is_empty(unlist(b))
  
  if(!list_empty)
  {
    abb = TRUE
  }
  return(abb)
}

correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    # get misspelled words
    bad_words = hunspell(x[y], dict = "en_AU")[[1]]
    
    # if list of misspelled words is not empty
    if(length(bad_words))
    {
      for (i in 1:length(bad_words))
      {
        list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_words[i], 
                                                             dict = "en_AU")))
        # if suggestion list is not empty
        if(!list_empty)
        {
          # correct word
          good = unlist(lapply(hunspell_suggest(bad_words[i], dict = "en_AU"), `[[`, 1))
        }
        else
        {
          # else leave word is it is
          good = bad_words[i]
        }
        # replace mispelled words with corrected ones
        x[y] <<- gsub(bad_words[i], good, x[y])
      }
    }
  })
  x
}

要更正的可重现短语样本:

library(dplyr)
library(stringr)
library(hunspell)
library(textclean)

sample <- 
  c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", "Zika virus preegnancy update")

data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)

我试了一下,请参考以下功能:

# the abbreviation function remains the same

# function to correct a misspelled word
correctTheWord <- function(bad_word)
{
  # print(bad_word)
  
  if (!isAbbreviation(bad_word))
  {
    list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_word,
                                                         dict = "en_AU")))
    
    if (!list_empty)
    {
      good = unlist(
        lapply(hunspell_suggest(bad_word, dict = "en_AU"),
               `[[`,
               1
        ))
    }
    else
    {
      good = bad_word
    }
  }
  
  else
  {
    good = bad_word
  }
}

# correct whole row function
correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    bad = hunspell(x[y], dict = "en_AU")[[1]]
    
    if (length(bad))
    {
      return(mgsub(x, bad, lapply(bad, correctTheWord)))
    }
    else
    {
      return(x)
    }
  })
}


# testing the first 2 titles
correctText("Paaediatrics AsseSssing Febrile Infant Child")
correctText("Manuual Handling Traain Trainer")


# this is not working 
data_words$spell_check <- 
  apply(data_words[, 1], 2,  correctText)

另外,我的功能可以进一步简化吗?

这将识别拼写错误的单词并将其替换为正确的拼写。请注意,它会根据需要忽略缩写,并假定所有单词都由 space.

分隔

# First, define isAbbreviation

isAbbreviation <- function(x)
{
  abb = FALSE
  
  # all capitalised letters
  if(str_detect(x, "^[:upper:]+$"))
  {
    abb = TRUE
  }
  
  # dealing with abbs that end in an 's'
  b = str_extract_all(x, "(\b[A-Z]+\b)|\b[A-Z]+s+\b")
  list_empty = rlang::is_empty(unlist(b))
  
  if(!list_empty)
  {
    abb = TRUE
  }
  return(abb)
}


sample <- 
  c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", 
    "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", 
    "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", 
    "Zika virus preegnancy update", "Basic Labelinsfbsbinsajectables technical")

data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)


correct_spelling <- function(text) {
  
  words <- text %>% 
  str_split(" ") %>% 
  .[[1]]

  abbreviation <- words %>% sapply(isAbbreviation) %>% 
    unname
  
  # Abbreviations return false here, which is inconsequential since we don't replace them 
  correct <- words %>% 
    sapply(function(x) {hunspell_check(x, dict = dictionary("en_AU")) } ) %>% 
    unname
  
  # Correct the word if incorrect and not abbreviation
  if(!any(!(!abbreviation) & (!correct))) {
  
    misspelled_and_not_abbreviation <- words[(!abbreviation) & (!correct)] 
  
  
    suggestions <- misspelled_and_not_abbreviation %>% 
      hunspell_suggest(dict = dictionary("en_AU")) 
    
    suggested_words <- sapply(seq_along(suggestions), function(y, i) 
      { ifelse(length(y[[1]]) == 0, misspelled_and_not_abbreviation[i], y[[i]][1]) }, 
      y=suggestions)

    words[as.logical((!abbreviation) * (!correct))] <- suggested_words 
  
  }
  
  words %>% paste0(collapse = " ")

  
}

data_words$spell_check2 <- data_words$title %>% sapply(correct_spelling) %>% unname

这给出了

data_words

#    title                                        spell_check2                              
#    <chr>                                        <chr>                                     
#  1 Paaediatrics AsseSssing Febrile Infant Child Paediatrics Assessing Febrile Infant Child
#  2 Manuual Handling Traain Trainer              Manual Handling Train Trainer             
#  3 Catheterise CTHs                             Catheterise CTHs                          
#  4 Labelinsfbsbinsajectables                    Labelinsfbsbinsajectables                 
#  5 Mentouring                                   Mentoring                                 
#  6 techhnical                                   technical                                 
#  7 Basic Life Support BSL                       Basic Life Support BSL                    
#  8 BloodSafe cliniiical transfusion practice    Blood Safe clinical transfusion practice  
#  9 Astthma                                      Asthma                                    
# 10 Zika virus preegnancy update                 Erika virus pregnancy update              
# 11 Basic Labelinsfbsbinsajectables technical    Basic Labelinsfbsbinsajectables technical