使用哈希字典的词形还原函数不适用于 R 中的 tm 包
A lemmatizing function using a hash dictionary does not work with tm package in R
我想使用大型外部词典(格式如下面的 txt 变量)对波兰语文本进行词形还原。我不走运,有一个选项 Polish with popular text mining packages。 @DmitriySelivanov 的答案 适用于简单的文本向量。 (我还从词典和语料库中删除了波兰语变音符号。)该函数适用于文本向量。
很遗憾,它不适用于 tm 生成的语料库格式。让我粘贴 Dmitriy 的代码:
library(hashmap)
library(data.table)
txt =
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami
"
dt = fread(txt, header = F, col.names = c("lemma", "word"))
lemma_hm = hashmap(dt$word, dt$lemma)
lemma_hm[["Abadanu"]]
#"Abadan"
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
tokens_list
}
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
#[[1]]
#[1] "Abadan" "abadańczyk" "OutOfVocabulary"
#[[2]]
#[1] "abadańczyk" "Abadan" "OutOfVocabulary"
现在我想将它应用于 tm 语料库 "docs" 这是我将在 tm 生成的语料库上与 tm 包一起使用的示例语法。
docs <- tm_map(docs, function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm"))
我尝试的另一种语法:
LemmaTokenizer <- function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm")
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(4, 25), tokenize=LemmaTokenizer))
它给我一个错误:
Error in lemma_hashmap[[tokens]] :
attempt to select more than one element in vectorIndex
该函数适用于文本向量,但不适用于 tm 语料库。预先感谢您的建议(如果它不能与 tm 一起使用,甚至可以将此功能与其他文本挖掘包一起使用)。
我在这里看到两个问题。 1) 您的自定义函数 return 是一个列表,而它应该 return 是一个字符串向量;和 2) 你传递了一个错误的 lemma_hashmap 参数。
解决第一个问题的快速解决方法是在 return 函数结果之前使用 paste() 和 sapply()。
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
# paste together, return a vector
sapply(tokens_list, (function(i){paste(i, collapse = " ")}))
}
我们可以运行你的例子post。
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
[1] "Abadan abadańczyk OutOfVocabulary" "abadańczyk Abadan OutOfVocabulary"
现在,我们可以使用 tm_map。只需确保使用 lemma_hm(即变量)而不是 "lemma_hm"(字符串)作为参数。
docs <- SimpleCorpus(VectorSource(texts))
out <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
out[[1]]$content
[1] "Abadan abadańczyk OutOfVocabulary"
尝试使用 quanteda 的 dictionary()
函数,在创建字典后将每个变体映射为字典值,将词元作为字典键。下面,它会查找您的值,然后将标记粘贴回文本中。 (如果你想要令牌,你不需要最后的 paste()
操作。
txt <-
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami"
list_temp <- strsplit(readLines(textConnection(txt)), "\s+")
list_temp2 <- lapply(list_temp, "[", 2)
names(list_temp2) <- sapply(list_temp, "[", 1)
library("quanteda")
polish_lemma_dict <- dictionary(list_temp2)
# Dictionary object with 7 key entries.
# - Abadan:
# - abadanem
# - Abadan:
# - abadanie
# - Abadan:
# - abadanowi
# - Abadan:
# - abadanu
# - abadańczyk:
# - abadańczycy
# - abadańczyk:
# - abadańczykach
# - abadańczyk:
# - abadańczykami
texts <- c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
texts
现在可以转换为标记,并使用quanteda的tokens_lookup()
函数将字典值(变形词)替换为字典键(词条)。在最后一步中,我将标记重新粘贴在一起,如果您需要标记而不是全文,可以跳过这一步。
require(magrittr)
texts %>%
tokens() %>%
tokens_lookup(dictionary = polish_lemma_dict, exclusive = FALSE, capkeys = FALSE) %>%
as.character() %>%
paste(collapse = " ")
# [1] "Abadan abadańczyk OutOfVocabulary abadańczyk Abadan OutOfVocabulary"
波兰语词形还原请参考这个脚本
https://github.com/MarcinKosinski/trigeR5/blob/master/R/lematyzacja.R that uses this polmorfologik dictionary https://github.com/MarcinKosinski/trigeR5/tree/master/dicts(并且停用词也包含在那里)。
这是我在其中使用答案的完整的不完美代码。感谢许多人,我在底部描述了所有来源。这是非常粗糙的,我意识到,但它对我来说是错误的,即。我可以使用 txt lemmes 词典和我的停用词来对波兰语文本进行分类。感谢 Damiano Fantini、Dmitriy Selivanov 和许多其他人。
#----1. Set up. ----
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(readtext)
library(tm)
library(proxy)
library(stringi)
library(stringr)
library(hashmap)
library(data.table)
library(text2vec)
# For reading n-grams
library(RWeka) #(*)
BigramTokenizer <-
function(x) NGramTokenizer(x, Weka_control(min = 1, max = 3)) #(*)
#----2. Read data. ----
stopwordsPL <- as.vector(str_split(readLines("polish.stopwords.text",encoding = "UTF-8"), pattern = " ",simplify = T))
docs <- VCorpus(DirSource(pattern="txt"))
titles <- rownames(summary(docs))
docs <- tm_map(docs, removeWords, words=stopwordsPL)
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, function(x) stri_trans_general(x, "Latin-ASCII"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
# for English texts it would be simpler
# docs <- tm_map(docs, removeWords, stopwords("english")) #can add other words to remove
# docs <- tm_map(docs, stemDocument, "english")
#====3. Lemmatize ====
# # Dictionary from http://www.lexiconista.com/datasets/lemmatization/
# lemmadict_file = "lemmatization-pl.text"
# dt = fread(file= lemmadict_file, header = F, col.names = c("lemma", "word"), data.table=T, encoding="UTF-8")
# # I threw away Polish letters, maybe changing locales may help.
# dt$lemma <- stri_trans_general(dt$lemma, "Latin-ASCII;lower")
# dt$word <- stri_trans_general(dt$word, "Latin-ASCII;lower")
# dt <- unique(dt)
#
# # Creating hash dictionary
# lemma_hm = hashmap(dt$word, dt$lemma)
#
# # Test if it works
# lemma_hm[["mnozyl"]]
# # [1] "mnozyc"
#
# save_hashmap(lemma_hm, file="lemma_hm", overwrite = TRUE, compress = TRUE)
lemma_hm <- load_hashmap(file="lemma_hm")
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
# paste together, return a vector
sapply(tokens_list, (function(i){paste(i, collapse = " ")}))
}
docs <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
docs <- tm_map(docs, PlainTextDocument)
#====4. Create document term matrix====
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(5, 25),tokenize = BigramTokenizer)) # tokenize=LemmaTokenizer, tokenize = BigramTokenizer (*)
docsTDM$dimnames
#====5. Remove sparse and common words====
docsTDM <- removeSparseTerms(docsTDM, .90)
#
removeCommonTerms <- function (x, pct)
{
stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
is.numeric(pct), pct > 0, pct < 1)
m <- if (inherits(x, "DocumentTermMatrix"))
t(x)
else x
t <- table(m$i) < m$ncol * (pct)
termIndex <- as.numeric(names(t[t]))
if (inherits(x, "DocumentTermMatrix"))
x[, termIndex]
else x[termIndex, ]
}
docsTDM <-
removeCommonTerms(docsTDM, .8) #remove terms that are in >=80% of the documents
docsTDM$dimnames
#====6. Cluster data (hclust). ====
docsdissim <- dist(as.matrix(docsTDM), method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
dim(docsdissim2)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
h <- hclust(docsdissim, method = "ward.D2")
plot(h, labels = titles, sub = "")
# Library hclust with p-values (pvclust)
library(pvclust)
h_pv <- pvclust(docsdissim2, method.hclust = "ward.D2", method.dist ="correlation")
plot(h_pv)
data.frame(cutree(tree = h_pv$hclust, k = 4))
# pvclust provides two types of p-values: AU (Approximately Unbiased) p-value and BP (Bootstrap Probability) value.
# AU p-value, which is computed by multiscale bootstrap resampling, is a better approximation to unbiased p-value
# than BP value computed by normal bootstrap resampling.
# AU p-value > 0.95 we can assume the clusters exist and may stably be
# observed if we increase the number of observations.
# (http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/)
#==== Literature:====
# Original article:
# http://www.rexamine.com/2014/06/text-mining-in-r-automatic-categorization-of-wikipedia-articles/
# Updates to make it work after some functions became obsolete:
#
#
#
# Based on that:
# http://brazenly.blogspot.co.uk/2015/02/r-categorization-clustering-of.html
#
# Sparse terms:
#
# Lemmatizing function:
#
#
我想使用大型外部词典(格式如下面的 txt 变量)对波兰语文本进行词形还原。我不走运,有一个选项 Polish with popular text mining packages。 @DmitriySelivanov 的答案
很遗憾,它不适用于 tm 生成的语料库格式。让我粘贴 Dmitriy 的代码:
library(hashmap)
library(data.table)
txt =
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami
"
dt = fread(txt, header = F, col.names = c("lemma", "word"))
lemma_hm = hashmap(dt$word, dt$lemma)
lemma_hm[["Abadanu"]]
#"Abadan"
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
tokens_list
}
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
#[[1]]
#[1] "Abadan" "abadańczyk" "OutOfVocabulary"
#[[2]]
#[1] "abadańczyk" "Abadan" "OutOfVocabulary"
现在我想将它应用于 tm 语料库 "docs" 这是我将在 tm 生成的语料库上与 tm 包一起使用的示例语法。
docs <- tm_map(docs, function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm"))
我尝试的另一种语法:
LemmaTokenizer <- function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm")
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(4, 25), tokenize=LemmaTokenizer))
它给我一个错误:
Error in lemma_hashmap[[tokens]] :
attempt to select more than one element in vectorIndex
该函数适用于文本向量,但不适用于 tm 语料库。预先感谢您的建议(如果它不能与 tm 一起使用,甚至可以将此功能与其他文本挖掘包一起使用)。
我在这里看到两个问题。 1) 您的自定义函数 return 是一个列表,而它应该 return 是一个字符串向量;和 2) 你传递了一个错误的 lemma_hashmap 参数。
解决第一个问题的快速解决方法是在 return 函数结果之前使用 paste() 和 sapply()。
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
# paste together, return a vector
sapply(tokens_list, (function(i){paste(i, collapse = " ")}))
}
我们可以运行你的例子post。
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
[1] "Abadan abadańczyk OutOfVocabulary" "abadańczyk Abadan OutOfVocabulary"
现在,我们可以使用 tm_map。只需确保使用 lemma_hm(即变量)而不是 "lemma_hm"(字符串)作为参数。
docs <- SimpleCorpus(VectorSource(texts))
out <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
out[[1]]$content
[1] "Abadan abadańczyk OutOfVocabulary"
尝试使用 quanteda 的 dictionary()
函数,在创建字典后将每个变体映射为字典值,将词元作为字典键。下面,它会查找您的值,然后将标记粘贴回文本中。 (如果你想要令牌,你不需要最后的 paste()
操作。
txt <-
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami"
list_temp <- strsplit(readLines(textConnection(txt)), "\s+")
list_temp2 <- lapply(list_temp, "[", 2)
names(list_temp2) <- sapply(list_temp, "[", 1)
library("quanteda")
polish_lemma_dict <- dictionary(list_temp2)
# Dictionary object with 7 key entries.
# - Abadan:
# - abadanem
# - Abadan:
# - abadanie
# - Abadan:
# - abadanowi
# - Abadan:
# - abadanu
# - abadańczyk:
# - abadańczycy
# - abadańczyk:
# - abadańczykach
# - abadańczyk:
# - abadańczykami
texts <- c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
texts
现在可以转换为标记,并使用quanteda的tokens_lookup()
函数将字典值(变形词)替换为字典键(词条)。在最后一步中,我将标记重新粘贴在一起,如果您需要标记而不是全文,可以跳过这一步。
require(magrittr)
texts %>%
tokens() %>%
tokens_lookup(dictionary = polish_lemma_dict, exclusive = FALSE, capkeys = FALSE) %>%
as.character() %>%
paste(collapse = " ")
# [1] "Abadan abadańczyk OutOfVocabulary abadańczyk Abadan OutOfVocabulary"
波兰语词形还原请参考这个脚本 https://github.com/MarcinKosinski/trigeR5/blob/master/R/lematyzacja.R that uses this polmorfologik dictionary https://github.com/MarcinKosinski/trigeR5/tree/master/dicts(并且停用词也包含在那里)。
这是我在其中使用答案的完整的不完美代码。感谢许多人,我在底部描述了所有来源。这是非常粗糙的,我意识到,但它对我来说是错误的,即。我可以使用 txt lemmes 词典和我的停用词来对波兰语文本进行分类。感谢 Damiano Fantini、Dmitriy Selivanov 和许多其他人。
#----1. Set up. ----
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(readtext)
library(tm)
library(proxy)
library(stringi)
library(stringr)
library(hashmap)
library(data.table)
library(text2vec)
# For reading n-grams
library(RWeka) #(*)
BigramTokenizer <-
function(x) NGramTokenizer(x, Weka_control(min = 1, max = 3)) #(*)
#----2. Read data. ----
stopwordsPL <- as.vector(str_split(readLines("polish.stopwords.text",encoding = "UTF-8"), pattern = " ",simplify = T))
docs <- VCorpus(DirSource(pattern="txt"))
titles <- rownames(summary(docs))
docs <- tm_map(docs, removeWords, words=stopwordsPL)
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, function(x) stri_trans_general(x, "Latin-ASCII"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
# for English texts it would be simpler
# docs <- tm_map(docs, removeWords, stopwords("english")) #can add other words to remove
# docs <- tm_map(docs, stemDocument, "english")
#====3. Lemmatize ====
# # Dictionary from http://www.lexiconista.com/datasets/lemmatization/
# lemmadict_file = "lemmatization-pl.text"
# dt = fread(file= lemmadict_file, header = F, col.names = c("lemma", "word"), data.table=T, encoding="UTF-8")
# # I threw away Polish letters, maybe changing locales may help.
# dt$lemma <- stri_trans_general(dt$lemma, "Latin-ASCII;lower")
# dt$word <- stri_trans_general(dt$word, "Latin-ASCII;lower")
# dt <- unique(dt)
#
# # Creating hash dictionary
# lemma_hm = hashmap(dt$word, dt$lemma)
#
# # Test if it works
# lemma_hm[["mnozyl"]]
# # [1] "mnozyc"
#
# save_hashmap(lemma_hm, file="lemma_hm", overwrite = TRUE, compress = TRUE)
lemma_hm <- load_hashmap(file="lemma_hm")
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
# paste together, return a vector
sapply(tokens_list, (function(i){paste(i, collapse = " ")}))
}
docs <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
docs <- tm_map(docs, PlainTextDocument)
#====4. Create document term matrix====
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(5, 25),tokenize = BigramTokenizer)) # tokenize=LemmaTokenizer, tokenize = BigramTokenizer (*)
docsTDM$dimnames
#====5. Remove sparse and common words====
docsTDM <- removeSparseTerms(docsTDM, .90)
#
removeCommonTerms <- function (x, pct)
{
stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
is.numeric(pct), pct > 0, pct < 1)
m <- if (inherits(x, "DocumentTermMatrix"))
t(x)
else x
t <- table(m$i) < m$ncol * (pct)
termIndex <- as.numeric(names(t[t]))
if (inherits(x, "DocumentTermMatrix"))
x[, termIndex]
else x[termIndex, ]
}
docsTDM <-
removeCommonTerms(docsTDM, .8) #remove terms that are in >=80% of the documents
docsTDM$dimnames
#====6. Cluster data (hclust). ====
docsdissim <- dist(as.matrix(docsTDM), method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
dim(docsdissim2)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
h <- hclust(docsdissim, method = "ward.D2")
plot(h, labels = titles, sub = "")
# Library hclust with p-values (pvclust)
library(pvclust)
h_pv <- pvclust(docsdissim2, method.hclust = "ward.D2", method.dist ="correlation")
plot(h_pv)
data.frame(cutree(tree = h_pv$hclust, k = 4))
# pvclust provides two types of p-values: AU (Approximately Unbiased) p-value and BP (Bootstrap Probability) value.
# AU p-value, which is computed by multiscale bootstrap resampling, is a better approximation to unbiased p-value
# than BP value computed by normal bootstrap resampling.
# AU p-value > 0.95 we can assume the clusters exist and may stably be
# observed if we increase the number of observations.
# (http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/)
#==== Literature:====
# Original article:
# http://www.rexamine.com/2014/06/text-mining-in-r-automatic-categorization-of-wikipedia-articles/
# Updates to make it work after some functions became obsolete:
#
#
#
# Based on that:
# http://brazenly.blogspot.co.uk/2015/02/r-categorization-clustering-of.html
#
# Sparse terms:
#
# Lemmatizing function:
#
#