R: aggregate token counts by lemma in the tm package
I loaded and cleaned a corpus in R:
myTxt <- Corpus(DirSource("."), readerControl = list(language="lat"))
corp <- tm_map(myTxt, removeWords, c(stopwords("french")))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, content_transformer(removeNumbers))
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, removeWords, stopwords("french"))
corp <- tm_map(corp, stripWhitespace); #inspect(docs[1])
tdm <- TermDocumentMatrix(corp)
And using TreeTagger I wrote a function like this:
require(koRpus)
lemmatisation <- function(my.df){
  ## my.df is a Corpus object produced by loading the corpus with tm
  print(my.df)
  dictionnaire <- data.frame()
  for(i in 1:length(my.df)){
    lemma <- treetag(my.df[[i]][[1]], treetagger = "manual", format = "obj", TT.tknz = FALSE,
                     lang = "fr", TT.options = list(path = "treetagger", preset = "fr-utf8"))
    dictionnaire <- rbind(dictionnaire, lemma@TT.res)
  }
  return(unique(dictionnaire))
}
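For reference, the my.lemma dictionary used further down is built by calling this function on the cleaned corpus, roughly like this:
my.lemma <- lemmatisation(corp)          # token/lemma pairs for the whole corpus
head(my.lemma[, c("token", "lemma")])    # quick look at the token -> lemma mapping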
At this point my tdm contains something like:
Docs
Terms Urbain.txt Versele.txt
sudest 0 1
suit 0 0
suivi 0 0
sujets 0 0
supplémentaire 0 0
suzanne 0 0
symbols 0 0
tant 0 0
tdm 0 0
télévisés 0 0
tempérament 0 0
temps 1 0
termdocumentmatrixcorp 0 0
terms 0 0
terre 0 0
tête 0 0
text 0 0
textcat 0 0
the 0 1
théâtre 0 0
thème 0 0
themebw 0 0
thérapeute 0 0
thérapie 0 0
thèse 0 0
tissent 0 0
tmmapcorp 0 0
tmmapmytxt 0 0
tokyo 0 0
tôt 0 0
touchent 0 0
toujours 0 0
tournant 0 0
tous 0 0
tout 0 0
toute 0 0
toutes 0 0
traditionnelle 0 1
transformé 0 0
travail 0 0
travaillant 0 1
travaille 0 0
travaillé 0 0
travaillent 0 0
Now I would like to use my dictionary to aggregate the word counts, so that travaillé, travaille, travaillant and travaillent are grouped together...
In the output of my lemmatisation function I have:
my.lemma[my.lemma$lemma == "travailler",]
token tag lemma lttr wclass desc stop stem
665 travaillé VER:pper travailler 9 verb verb past participle NA NA
835 travaille VER:pres travailler 9 verb verb present NA NA
1369 travaillent VER:pres travailler 11 verb verb present NA NA
1713 travaillant VER:ppre travailler 11 verb verb present participle NA NA
I don't know how to do this aggregation.
You could try
aggregate(.~lemma, merge(as.data.frame(as.matrix(tdm)), my.lemma[, c("token", "lemma")], by.x="row.names", by.y="token")[-1], sum)
which should give you something like
# lemma Urbain.txt Versele.txt
# 1 travailler 0 1
# ...
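Note that merge() does an inner join by default, so any term that has no entry in your dictionary is dropped from the result. A minimal variant (using the tdm and my.lemma objects from the question) that keeps those terms by falling back to the raw token as its own lemma:
tdm.df <- as.data.frame(as.matrix(tdm))                       # counts, terms as row names
m <- merge(tdm.df, my.lemma[, c("token", "lemma")],
           by.x = "row.names", by.y = "token", all.x = TRUE)  # keep unlemmatised terms
m$lemma[is.na(m$lemma)] <- m$Row.names[is.na(m$lemma)]        # the term itself acts as the lemma
aggregate(. ~ lemma, m[-1], sum)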
Another way is to use dplyr.
I put all the code into one R function: lemmatization.tdm
corpus is an object from tm::Corpus, lang is the language parameter (for the available languages see help(koRpus::kRp.POS.tags)), and treetaggerfilepath is the directory where TreeTagger is installed.
lemmatization.tdm <- function( corpus, lang = "fr",
                               treetaggerfilepath = "~/Programs/treetagger/" )
{
  # get packages
  require( koRpus )
  require( tm )
  require( dplyr )
  # run TreeTagger on every document and collect its output
  dictionnaire <- data.frame()
  for( i in seq_along(corpus) )
  {
    lemma <- treetag( corpus[[i]][[1]],
                      treetagger = "manual",
                      format = "obj", TT.tknz = FALSE,
                      lang = lang,
                      TT.options = list( path = treetaggerfilepath,
                                         preset = lang ) )
    dictionnaire <- rbind( dictionnaire, lemma@TT.res )
  }
  # keep only the TreeTagger output we need (token = original word, lemma = its lemma)
  dictionnaire <- dictionnaire[, c("token", "lemma")]
  # TreeTagger sometimes returns several rows for the same token; keep only one
  dictionnaire <- dictionnaire[ !duplicated(dictionnaire$token), ]
  # prepare the classic tdm as a data frame
  tdm <- TermDocumentMatrix( corpus ) %>%
    as.matrix() %>%
    as.data.frame( stringsAsFactors = FALSE )
  # add the terms as a token column so they can be joined on
  tdm$token <- rownames( tdm )
  # join dictionnaire and the tdm on the terms (token)
  tdm.lemma <- right_join( dictionnaire, tdm, by = "token" )
  # aggregate the counts per lemma
  tdm.lemma1 <- tdm.lemma[, -1] %>%
    group_by( lemma ) %>%
    summarise_all( sum )
  # prepare the output: a matrix with the lemmas as row names
  out <- as.matrix( tdm.lemma1[, -1] )
  rownames( out ) <- tdm.lemma1$lemma
  return( out )
}
# lemmatization.tdm returns the aggregated tdm as a matrix, e.g.
tdm.lemma <- lemmatization.tdm( corp )
countofterm <- tdm.lemma %>% rowSums() %>% sort( decreasing = TRUE )  # counts per lemma
numberofanalyzedwords <- sum( countofterm )
freqterm <- countofterm / numberofanalyzedwords
freqterm[ freqterm > 0.1 ]  # lemmas with a relative frequency above 0.1
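If you prefer to stay in base R for the aggregation step, the same grouping can be sketched with rowsum(), assuming a dictionnaire data frame with token and lemma columns like the one built inside the function above:
tdm.m <- as.matrix(TermDocumentMatrix(corp))
lemmas <- dictionnaire$lemma[match(rownames(tdm.m), dictionnaire$token)]
lemmas[is.na(lemmas)] <- rownames(tdm.m)[is.na(lemmas)]  # unlemmatised terms keep their own form
tdm.lemma2 <- rowsum(tdm.m, group = lemmas)              # sum the counts of all terms sharing a lemma
tdm.lemma2["travailler", ]                               # e.g. travaillé + travaille + travaillant + travaillent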