对 textrecipes 的 tokenlist 使用 step_mutate
step_mutate with textrecipes tokenlists
我正在使用 tidymodels 框架进行 NLP,并借助 textrecipes 包提供的文本预处理 recipe 步骤。其中,step_tokenize
接受一个字符向量作为输入,返回一个 tokenlist
对象。现在,我想用一个基于 hunspell 包的自定义函数,对分词后的新变量进行拼写检查和纠正,但出现了以下错误(拼写检查函数参考自一篇博客文章):
Error: Problem with `mutate()` column `desc`.
i `desc = correct_spelling(desc)`.
x is.character(words) is not TRUE
显然,tokenlist 不能直接当作字符向量来处理。我注意到还有 step_untokenize,
但它只是把各个标记粘贴(paste)并折叠(collapse)回一个字符串来消解 tokenlist,这不是我需要的。
REPREX
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(hunspell)
# Toy data: misspelled free-text product descriptions and their prices.
product_descriptions <- tribble(
  ~desc,           ~price,
  "goood product",   1000,
  "not sou good",     700,
  "vad produkt",      250
)
#' Correct the spelling of a vector of words
#'
#' Each word is checked against the hunspell "en_US" dictionary. A word that
#' fails the check is replaced by the first suggestion hunspell offers; if
#' hunspell has no suggestion, the word is returned unchanged.
#'
#' @param input A character vector of single words (tokens).
#' @return A character vector the same length as `input`.
correct_spelling <- function(input) {
  # Request the dictionary once and reuse it for both hunspell calls.
  dict <- dictionary("en_US")
  misspelled <- !hunspell_check(input, dict)

  output <- input
  if (any(misspelled)) {
    # Only look up suggestions for misspelled words; suggestions for
    # correctly spelled words would be computed and then thrown away.
    suggestions <- hunspell_suggest(input[misspelled], dict)
    # First suggestion per word, or NA when the suggestion list is empty.
    first_suggestion <- vapply(
      suggestions,
      function(candidates) {
        if (length(candidates) > 0) candidates[[1]] else NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
    # Keep the original word when no suggestion is available.
    has_suggestion <- !is.na(first_suggestion)
    output[misspelled][has_suggestion] <- first_suggestion[has_suggestion]
  }
  output
}
# Recipe that tokenizes `desc` and then tries to spell-check it in place.
product_recipe <- recipe(desc ~ price, data = product_descriptions) %>%
  step_tokenize(desc) %>%
  # `desc` is a tokenlist here, not a character vector
  step_mutate(desc = correct_spelling(desc))

# Estimating the recipe is where the reported error is raised.
prep(product_recipe)
我想要的效果(不使用 recipe 的写法):
# Desired result without a recipe: one word per row, each word spell-checked.
# unnest_tokens() lives in tidytext, which the reprex does not attach, so the
# call is namespace-qualified to keep the snippet runnable on its own.
product_descriptions %>%
  tidytext::unnest_tokens(word, desc) %>%
  mutate(word = correct_spelling(word))
目前还没有使用 {textrecipes} 执行此操作的规范方法。我们需要两样东西:一个接受标记向量并返回拼写检查后标记的函数(你已经提供了),以及一种把该函数应用到 tokenlist
每个元素上的方法。目前没有通用的步骤可以做到这一点,但你可以把函数传给 step_stem()
的 custom_stemmer
参数来“变通”实现,从而得到你想要的结果:
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
library(textrecipes)
library(hunspell)
# Toy data: misspelled free-text product descriptions and their prices.
product_descriptions <- tribble(
  ~desc,           ~price,
  "goood product",   1000,
  "not sou good",     700,
  "vad produkt",      250
)
#' Correct the spelling of a vector of words
#'
#' Each word is checked against the hunspell "en_US" dictionary. A word that
#' fails the check is replaced by the first suggestion hunspell offers; if
#' hunspell has no suggestion, the word is returned unchanged.
#'
#' @param input A character vector of single words (tokens).
#' @return A character vector the same length as `input`.
correct_spelling <- function(input) {
  # Request the dictionary once and reuse it for both hunspell calls.
  dict <- dictionary("en_US")
  misspelled <- !hunspell_check(input, dict)

  output <- input
  if (any(misspelled)) {
    # Only look up suggestions for misspelled words; suggestions for
    # correctly spelled words would be computed and then thrown away.
    suggestions <- hunspell_suggest(input[misspelled], dict)
    # First suggestion per word, or NA when the suggestion list is empty.
    first_suggestion <- vapply(
      suggestions,
      function(candidates) {
        if (length(candidates) > 0) candidates[[1]] else NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
    # Keep the original word when no suggestion is available.
    has_suggestion <- !is.na(first_suggestion)
    output[misspelled][has_suggestion] <- first_suggestion[has_suggestion]
  }
  output
}
# Tokenize `desc`, spell-check every token, then compute term frequencies.
# step_stem() is used only as a vehicle: its `custom_stemmer` argument lets
# an arbitrary function be applied to the tokens of the tokenlist.
product_recipe <- recipe(desc ~ price, data = product_descriptions) %>%
  step_tokenize(desc) %>%
  step_stem(desc, custom_stemmer = correct_spelling) %>%
  step_tf(desc)

# Estimate the recipe and return the processed training data.
bake(prep(product_recipe), new_data = NULL)
#> # A tibble: 3 × 6
#> price tf_desc_cad tf_desc_good tf_desc_not tf_desc_product tf_desc_sou
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1000 0 1 0 1 0
#> 2 700 0 1 1 0 1
#> 3 250 1 0 0 1 0
没那么短,但应该可行:
library(tidyverse)
library(hunspell)
# Toy data: misspelled free-text product descriptions and their prices.
product_descriptions <- tribble(
  ~desc,           ~price,
  "goood product",   1000,
  "not sou good",     700,
  "vad produkt",      250
)
#' Correct the spelling of a vector of words
#'
#' Each word is checked against the hunspell "en_US" dictionary. A word that
#' fails the check is replaced by the first suggestion hunspell offers; if
#' hunspell has no suggestion, the word is returned unchanged.
#'
#' @param input A character vector of single words (tokens).
#' @return A character vector the same length as `input`.
correct_spelling <- function(input) {
  # Request the dictionary once and reuse it for both hunspell calls.
  dict <- dictionary("en_US")
  misspelled <- !hunspell_check(input, dict)

  output <- input
  if (any(misspelled)) {
    # Only look up suggestions for misspelled words; suggestions for
    # correctly spelled words would be computed and then thrown away.
    suggestions <- hunspell_suggest(input[misspelled], dict)
    # First suggestion per word, or NA when the suggestion list is empty.
    first_suggestion <- vapply(
      suggestions,
      function(candidates) {
        if (length(candidates) > 0) candidates[[1]] else NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
    # Keep the original word when no suggestion is available.
    has_suggestion <- !is.na(first_suggestion)
    output[misspelled][has_suggestion] <- first_suggestion[has_suggestion]
  }
  output
}
# Words to drop before spell-checking.
my_stopwords <- c("sou")

product_descriptions %>%
  # create a row identifier so words can be re-assembled per description
  mutate(id = row_number()) %>%
  # separate all `desc` into separate words (by space) into separate rows
  separate_rows(desc, sep = " ") %>%
  # exclude stopwords as defined above
  filter(!desc %in% my_stopwords) %>%
  # add spellchecker
  mutate(desc = correct_spelling(desc)) %>%
  # Re-join the words of each description in their original row order.
  # Going through pivot_wider() + unite() is fragile here: pivot_wider()
  # orders the new columns by first appearance, so once a stopword has been
  # removed from an early row the words of later rows can be united out of
  # order.
  group_by(id, price) %>%
  summarise(
    # drop NA words, matching the former unite(..., na.rm = TRUE)
    desc = paste(desc[!is.na(desc)], collapse = " "),
    .groups = "drop"
  ) %>%
  # omit the helper column and put `desc` first
  select(desc, price)
在这种情况下,“sou”作为停用词被删除,“produkt”被更正为“product”。不过,拼写检查功能将“vad”更改为“cad”而不是“bad”。
我正在使用 tidymodels 框架进行 NLP,并借助 textrecipes 包提供的文本预处理 recipe 步骤。其中,step_tokenize
接受一个字符向量作为输入,返回一个 tokenlist
对象。现在,我想用一个基于 hunspell 包的自定义函数,对分词后的新变量进行拼写检查和纠正,但出现了以下错误(拼写检查函数参考自一篇博客文章):
Error: Problem with `mutate()` column `desc`.
i `desc = correct_spelling(desc)`.
x is.character(words) is not TRUE
显然,tokenlist 不能直接当作字符向量来处理。我注意到还有 step_untokenize,
但它只是把各个标记粘贴(paste)并折叠(collapse)回一个字符串来消解 tokenlist,这不是我需要的。
REPREX
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(hunspell)
# Toy data: misspelled free-text product descriptions and their prices.
product_descriptions <- tribble(
  ~desc,           ~price,
  "goood product",   1000,
  "not sou good",     700,
  "vad produkt",      250
)
#' Correct the spelling of a vector of words
#'
#' Each word is checked against the hunspell "en_US" dictionary. A word that
#' fails the check is replaced by the first suggestion hunspell offers; if
#' hunspell has no suggestion, the word is returned unchanged.
#'
#' @param input A character vector of single words (tokens).
#' @return A character vector the same length as `input`.
correct_spelling <- function(input) {
  # Request the dictionary once and reuse it for both hunspell calls.
  dict <- dictionary("en_US")
  misspelled <- !hunspell_check(input, dict)

  output <- input
  if (any(misspelled)) {
    # Only look up suggestions for misspelled words; suggestions for
    # correctly spelled words would be computed and then thrown away.
    suggestions <- hunspell_suggest(input[misspelled], dict)
    # First suggestion per word, or NA when the suggestion list is empty.
    first_suggestion <- vapply(
      suggestions,
      function(candidates) {
        if (length(candidates) > 0) candidates[[1]] else NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
    # Keep the original word when no suggestion is available.
    has_suggestion <- !is.na(first_suggestion)
    output[misspelled][has_suggestion] <- first_suggestion[has_suggestion]
  }
  output
}
# Recipe that tokenizes `desc` and then tries to spell-check it in place.
product_recipe <- recipe(desc ~ price, data = product_descriptions) %>%
  step_tokenize(desc) %>%
  # `desc` is a tokenlist here, not a character vector
  step_mutate(desc = correct_spelling(desc))

# Estimating the recipe is where the reported error is raised.
prep(product_recipe)
我想要的效果(不使用 recipe 的写法):
# Desired result without a recipe: one word per row, each word spell-checked.
# unnest_tokens() lives in tidytext, which the reprex does not attach, so the
# call is namespace-qualified to keep the snippet runnable on its own.
product_descriptions %>%
  tidytext::unnest_tokens(word, desc) %>%
  mutate(word = correct_spelling(word))
目前还没有使用 {textrecipes} 执行此操作的规范方法。我们需要两样东西:一个接受标记向量并返回拼写检查后标记的函数(你已经提供了),以及一种把该函数应用到 tokenlist
每个元素上的方法。目前没有通用的步骤可以做到这一点,但你可以把函数传给 step_stem()
的 custom_stemmer
参数来“变通”实现,从而得到你想要的结果:
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
library(textrecipes)
library(hunspell)
# Toy data: misspelled free-text product descriptions and their prices.
product_descriptions <- tribble(
  ~desc,           ~price,
  "goood product",   1000,
  "not sou good",     700,
  "vad produkt",      250
)
#' Correct the spelling of a vector of words
#'
#' Each word is checked against the hunspell "en_US" dictionary. A word that
#' fails the check is replaced by the first suggestion hunspell offers; if
#' hunspell has no suggestion, the word is returned unchanged.
#'
#' @param input A character vector of single words (tokens).
#' @return A character vector the same length as `input`.
correct_spelling <- function(input) {
  # Request the dictionary once and reuse it for both hunspell calls.
  dict <- dictionary("en_US")
  misspelled <- !hunspell_check(input, dict)

  output <- input
  if (any(misspelled)) {
    # Only look up suggestions for misspelled words; suggestions for
    # correctly spelled words would be computed and then thrown away.
    suggestions <- hunspell_suggest(input[misspelled], dict)
    # First suggestion per word, or NA when the suggestion list is empty.
    first_suggestion <- vapply(
      suggestions,
      function(candidates) {
        if (length(candidates) > 0) candidates[[1]] else NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
    # Keep the original word when no suggestion is available.
    has_suggestion <- !is.na(first_suggestion)
    output[misspelled][has_suggestion] <- first_suggestion[has_suggestion]
  }
  output
}
# Tokenize `desc`, spell-check every token, then compute term frequencies.
# step_stem() is used only as a vehicle: its `custom_stemmer` argument lets
# an arbitrary function be applied to the tokens of the tokenlist.
product_recipe <- recipe(desc ~ price, data = product_descriptions) %>%
  step_tokenize(desc) %>%
  step_stem(desc, custom_stemmer = correct_spelling) %>%
  step_tf(desc)

# Estimate the recipe and return the processed training data.
bake(prep(product_recipe), new_data = NULL)
#> # A tibble: 3 × 6
#> price tf_desc_cad tf_desc_good tf_desc_not tf_desc_product tf_desc_sou
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1000 0 1 0 1 0
#> 2 700 0 1 1 0 1
#> 3 250 1 0 0 1 0
没那么短,但应该可行:
library(tidyverse)
library(hunspell)
# Toy data: misspelled free-text product descriptions and their prices.
product_descriptions <- tribble(
  ~desc,           ~price,
  "goood product",   1000,
  "not sou good",     700,
  "vad produkt",      250
)
#' Correct the spelling of a vector of words
#'
#' Each word is checked against the hunspell "en_US" dictionary. A word that
#' fails the check is replaced by the first suggestion hunspell offers; if
#' hunspell has no suggestion, the word is returned unchanged.
#'
#' @param input A character vector of single words (tokens).
#' @return A character vector the same length as `input`.
correct_spelling <- function(input) {
  # Request the dictionary once and reuse it for both hunspell calls.
  dict <- dictionary("en_US")
  misspelled <- !hunspell_check(input, dict)

  output <- input
  if (any(misspelled)) {
    # Only look up suggestions for misspelled words; suggestions for
    # correctly spelled words would be computed and then thrown away.
    suggestions <- hunspell_suggest(input[misspelled], dict)
    # First suggestion per word, or NA when the suggestion list is empty.
    first_suggestion <- vapply(
      suggestions,
      function(candidates) {
        if (length(candidates) > 0) candidates[[1]] else NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
    # Keep the original word when no suggestion is available.
    has_suggestion <- !is.na(first_suggestion)
    output[misspelled][has_suggestion] <- first_suggestion[has_suggestion]
  }
  output
}
# Words to drop before spell-checking.
my_stopwords <- c("sou")

product_descriptions %>%
  # create a row identifier so words can be re-assembled per description
  mutate(id = row_number()) %>%
  # separate all `desc` into separate words (by space) into separate rows
  separate_rows(desc, sep = " ") %>%
  # exclude stopwords as defined above
  filter(!desc %in% my_stopwords) %>%
  # add spellchecker
  mutate(desc = correct_spelling(desc)) %>%
  # Re-join the words of each description in their original row order.
  # Going through pivot_wider() + unite() is fragile here: pivot_wider()
  # orders the new columns by first appearance, so once a stopword has been
  # removed from an early row the words of later rows can be united out of
  # order.
  group_by(id, price) %>%
  summarise(
    # drop NA words, matching the former unite(..., na.rm = TRUE)
    desc = paste(desc[!is.na(desc)], collapse = " "),
    .groups = "drop"
  ) %>%
  # omit the helper column and put `desc` first
  select(desc, price)
在这种情况下,“sou”作为停用词被删除,“produkt”被更正为“product”。不过,拼写检查功能将“vad”更改为“cad”而不是“bad”。