R 中的词干提取无法按预期工作
Stemming words in r does not work as expected
我正在尝试在 R 中做一个非常简单的词干提取,却得到了非常意想不到的结果。在下面的代码中,'complete' 变量是 'NA'。为什么我无法把 easy 这个词补全?
library(tm)
library(SnowballC)
# Dictionary holding the single unstemmed word.
dict <- "easy"
# Reduce the dictionary word to its stem using the English stemmer.
stem <- stemDocument(dict, language = "english")
# Try to expand the stem back to a word from the dictionary.
complete <- stemCompletion(stem, dictionary = dict)
谢谢!
wordStem()
好像可以..
library(tm)
library(SnowballC)
dict <- c("easy")
> wordStem(dict)
[1] "easi"
您可以通过 tm:::stemCompletion
查看 stemCompletion()
函数的内部结构。
function (x, dictionary, type = c("prevalent", "first", "longest", "none", "random", "shortest")){ # x: stems to complete; dictionary: candidate words; type: how to pick among candidates
if(inherits(dictionary, "Corpus")) # a Corpus dictionary is flattened to the unique results of words() on its elements
dictionary <- unique(unlist(lapply(dictionary, words)))
type <- match.arg(type) # resolves to "prevalent" when type is not supplied
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE)) # per stem: dictionary entries matching the regex "^<stem>" (the stem is interpolated as a regex, not a fixed string)
# Each branch returns the chosen completions as a vector named by the stems in x.
switch(type, first = {
# First matching dictionary entry for each stem; NA when a stem has no match.
setNames(sapply(possibleCompletions, "[", 1), x)
}, longest = {
# Order each stem's candidates by decreasing character count, then take the first.
ordering <- lapply(possibleCompletions, function(x) order(nchar(x),
decreasing = TRUE))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
}, none = {
# No completion: each stem is returned unchanged, named by itself.
setNames(x, x)
}, prevalent = {
# Tabulate each stem's candidates and sort the counts in decreasing order.
possibleCompletions <- lapply(possibleCompletions, function(x) sort(table(x),
decreasing = TRUE))
# The completions are read from the names attached to the first table entries.
n <- names(sapply(possibleCompletions, "[", 1))
# If no names were produced at all, every stem gets NA.
setNames(if (length(n)) n else rep(NA, length(x)), x)
}, random = {
# One randomly sampled candidate per stem, or NA when there are none.
setNames(sapply(possibleCompletions, function(x) {
if (length(x)) sample(x, 1) else NA
}), x)
}, shortest = {
# Order each stem's candidates by increasing character count, then take the first.
ordering <- lapply(possibleCompletions, function(x) order(nchar(x)))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
})
}
x
参数是词干化之后的词,dictionary
是未经词干化的原词。唯一重要的是第五行;它用每个词干构造一个以 "^" 开头的正则表达式,在字典词条中做简单的前缀匹配。
# For each stem w, keep the dictionary entries that match the regular expression "^w".
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE))
因此补全失败了:词干 "easi" 无法匹配字典中的 "easy"。如果您的字典中还有单词 "easiest",那么两个词干都能得到匹配结果,因为现在字典里有一个以相同的四个字母 "easi" 开头的单词可供匹配。
library(tm)
library(SnowballC)
# The dictionary now also contains a word that begins with the stem "easi".
dict <- c("easy", "easiest")
# Stem every dictionary word with the English stemmer.
stem <- stemDocument(dict, language = "english")
# Expand each stem back to a word from the dictionary.
complete <- stemCompletion(stem, dictionary = dict)
# Display the completions, which are named by their stems.
complete
easi easiest
"easiest" "easiest"
我正在尝试在 R 中做一个非常简单的词干提取,却得到了非常意想不到的结果。在下面的代码中,'complete' 变量是 'NA'。为什么我无法把 easy 这个词补全?
library(tm)
library(SnowballC)
# Dictionary holding the single unstemmed word.
dict <- "easy"
# Reduce the dictionary word to its stem using the English stemmer.
stem <- stemDocument(dict, language = "english")
# Try to expand the stem back to a word from the dictionary.
complete <- stemCompletion(stem, dictionary = dict)
谢谢!
wordStem()
好像可以..
library(tm)
library(SnowballC)
dict <- c("easy")
> wordStem(dict)
[1] "easi"
您可以通过 tm:::stemCompletion
查看 stemCompletion()
函数的内部结构。
function (x, dictionary, type = c("prevalent", "first", "longest", "none", "random", "shortest")){ # x: stems to complete; dictionary: candidate words; type: how to pick among candidates
if(inherits(dictionary, "Corpus")) # a Corpus dictionary is flattened to the unique results of words() on its elements
dictionary <- unique(unlist(lapply(dictionary, words)))
type <- match.arg(type) # resolves to "prevalent" when type is not supplied
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE)) # per stem: dictionary entries matching the regex "^<stem>" (the stem is interpolated as a regex, not a fixed string)
# Each branch returns the chosen completions as a vector named by the stems in x.
switch(type, first = {
# First matching dictionary entry for each stem; NA when a stem has no match.
setNames(sapply(possibleCompletions, "[", 1), x)
}, longest = {
# Order each stem's candidates by decreasing character count, then take the first.
ordering <- lapply(possibleCompletions, function(x) order(nchar(x),
decreasing = TRUE))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
}, none = {
# No completion: each stem is returned unchanged, named by itself.
setNames(x, x)
}, prevalent = {
# Tabulate each stem's candidates and sort the counts in decreasing order.
possibleCompletions <- lapply(possibleCompletions, function(x) sort(table(x),
decreasing = TRUE))
# The completions are read from the names attached to the first table entries.
n <- names(sapply(possibleCompletions, "[", 1))
# If no names were produced at all, every stem gets NA.
setNames(if (length(n)) n else rep(NA, length(x)), x)
}, random = {
# One randomly sampled candidate per stem, or NA when there are none.
setNames(sapply(possibleCompletions, function(x) {
if (length(x)) sample(x, 1) else NA
}), x)
}, shortest = {
# Order each stem's candidates by increasing character count, then take the first.
ordering <- lapply(possibleCompletions, function(x) order(nchar(x)))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
})
}
x
参数是词干化之后的词,dictionary
是未经词干化的原词。唯一重要的是第五行;它用每个词干构造一个以 "^" 开头的正则表达式,在字典词条中做简单的前缀匹配。
# For each stem w, keep the dictionary entries that match the regular expression "^w".
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE))
因此补全失败了:词干 "easi" 无法匹配字典中的 "easy"。如果您的字典中还有单词 "easiest",那么两个词干都能得到匹配结果,因为现在字典里有一个以相同的四个字母 "easi" 开头的单词可供匹配。
library(tm)
library(SnowballC)
# The dictionary now also contains a word that begins with the stem "easi".
dict <- c("easy", "easiest")
# Stem every dictionary word with the English stemmer.
stem <- stemDocument(dict, language = "english")
# Expand each stem back to a word from the dictionary.
complete <- stemCompletion(stem, dictionary = dict)
# Display the completions, which are named by their stems.
complete
easi easiest
"easiest" "easiest"