R:在 tm 包中按词元(lemma)聚合词项(token)

R: aggregate tokens by lemma in the tm package

我在 R 中加载并清理了一个语料库:

# Load only the .txt documents of the working directory. Without a pattern,
# DirSource(".") also reads every other file found there (this script
# included), and their contents end up as terms of the matrix.
# NOTE(review): the reader language is "lat" although French stop words are
# used below -- confirm which language code is intended.
myTxt <- Corpus(DirSource(".", pattern = "\\.txt$"),
                readerControl = list(language = "lat"))

# Cleaning steps. Their order is kept as is: stop words are removed once on
# the raw text and once more after lower-casing and punctuation removal.
corp <- tm_map(myTxt, removeWords, stopwords("french"))
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, content_transformer(removeNumbers))
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, removeWords, stopwords("french"))
corp <- tm_map(corp, stripWhitespace)
# inspect(corp[1])  # uncomment to look at the first cleaned document

# Term-document matrix of the cleaned corpus.
tdm <- TermDocumentMatrix(corp)

并且使用 treetagger 我写了一个像这样的函数:

require(koRpus)

#' Build a token/lemma dictionary for a tm corpus with TreeTagger
#'
#' Runs `treetag()` on the first element of every document of `my.df` and
#' stacks the tagging results of all documents.
#'
#' @param my.df A corpus object obtained by loading the texts with tm.
#' @return A data frame holding the unique rows of the tagging results of
#'   all documents; an empty data frame when `my.df` has no document.
lemmatisation <- function(my.df) {
  print(my.df)

  # Nothing to tag: return the same empty dictionary the loop started from.
  if (length(my.df) == 0L) {
    return(data.frame())
  }

  # Tag the documents of the corpus that was passed in (not a global object)
  # and collect the results in a list instead of growing a data frame.
  tagged <- lapply(seq_along(my.df), function(i) {
    lemma <- treetag(my.df[[i]][[1]], treetagger = "manual", format = "obj",
                     TT.tknz = FALSE, lang = "fr",
                     TT.options = list(path = "treetagger",
                                       preset = "fr-utf8"))
    lemma@TT.res
  })

  unique(do.call(rbind, tagged))
}

此时我的 tdm 大致如下:

                                                 Docs
Terms                                              Urbain.txt Versele.txt
  sudest                                                    0           1
  suit                                                      0           0
  suivi                                                     0           0
  sujets                                                    0           0
  supplémentaire                                            0           0
  suzanne                                                   0           0
  symbols                                                   0           0
  tant                                                      0           0
  tdm                                                       0           0
  télévisés                                                 0           0
  tempérament                                               0           0
  temps                                                     1           0
  termdocumentmatrixcorp                                    0           0
  terms                                                     0           0
  terre                                                     0           0
  tête                                                      0           0
  text                                                      0           0
  textcat                                                   0           0
  the                                                       0           1
  théâtre                                                   0           0
  thème                                                     0           0
  themebw                                                   0           0
  thérapeute                                                0           0
  thérapie                                                  0           0
  thèse                                                     0           0
  tissent                                                   0           0
  tmmapcorp                                                 0           0
  tmmapmytxt                                                0           0
  tokyo                                                     0           0
  tôt                                                       0           0
  touchent                                                  0           0
  toujours                                                  0           0
  tournant                                                  0           0
  tous                                                      0           0
  tout                                                      0           0
  toute                                                     0           0
  toutes                                                    0           0
  traditionnelle                                            0           1
  transformé                                                0           0
  travail                                                   0           0
  travaillant                                               0           1
  travaille                                                 0           0
  travaillé                                                 0           0
  travaillent                                               0           0

现在我想用我的词典来汇总字数统计,以便对 travaillé、travaille、travaillant、travaillent 进行分组...

在我的 lemmatisation 函数(词形还原)的结果中,我有:

# Rows of the lemma dictionary whose lemma is "travailler".
# NOTE(review): my.lemma is presumably the data frame returned by
# lemmatisation() -- its creation is not shown here.
my.lemma[my.lemma$lemma == "travailler",]
           token      tag      lemma lttr wclass                    desc stop stem
665    travaillé VER:pper travailler    9   verb    verb past participle   NA   NA
835    travaille VER:pres travailler    9   verb            verb present   NA   NA
1369 travaillent VER:pres travailler   11   verb            verb present   NA   NA
1713 travaillant VER:ppre travailler   11   verb verb present participle   NA   NA

我不知道如何进行此聚合

你可以试试

# Sum, for every document, the counts of all terms that share a lemma.
# - as.matrix(tdm): merge() needs something it can turn into a data frame,
#   with the terms as row names.
# - unique(...): a token listed several times with the same lemma (for
#   instance under different tags) must be counted only once.
# - [-1] drops the row-names column that merge() puts first.
aggregate(. ~ lemma,
          merge(as.matrix(tdm),
                unique(my.lemma[, c("token", "lemma")]),
                by.x = "row.names", by.y = "token")[-1],
          sum)

应该会给你类似的东西

#        lemma Urbain.txt Versele.txt
# 1 travailler          0           1
# ...

另一种方法是使用 dplyr。

我将所有代码放在一个 R 函数中:lemmatization.tdm

corpus 是由 tm::Corpus 创建的对象,lang 是语言参数(可用的语言请参阅 help(koRpus::kRp.POS.tags)),treetaggerfilepath 是 TreeTagger 的安装目录(参见 TreeTagger 的安装说明)。

#' Build a lemma-by-document count matrix from a tm corpus
#'
#' Tags every document with `treetag()`, builds the term-document matrix of
#' the corpus and sums the counts of all terms that share the same lemma.
#'
#' @param corpus A corpus created with tm (for example with `tm::Corpus()`).
#' @param lang Language code; passed to `treetag()` both as `lang` and as
#'   the `preset` entry of `TT.options`.
#' @param treetaggerfilepath Path passed to `treetag()` as the `path` entry
#'   of `TT.options`.
#' @return A matrix with one row per lemma (lemmas as row names) and one
#'   column per document. A term without any lemma in the dictionary keeps
#'   a row under its own name.
lemmatization.tdm <- function(corpus, lang = "fr",
                              treetaggerfilepath = "~/Programs/treetagger/") {
  # Attach the packages. library() stops right away when one is missing,
  # whereas require() would only return FALSE and fail later on.
  library(koRpus)
  library(tm)
  library(dplyr)

  if (length(corpus) == 0L) {
    stop("'corpus' contains no document", call. = FALSE)
  }

  # Run the tagger on each document and keep the original words (token)
  # together with their lemmas.
  # NOTE(review): a lemma reported as "<unknown>" by the tagger is kept as
  # is, so such tokens are grouped together -- confirm this is wanted.
  tagged <- lapply(seq_along(corpus), function(i) {
    lemma <- treetag(corpus[[i]][[1]],
                     treetagger = "manual",
                     format = "obj", TT.tknz = FALSE,
                     lang = lang,
                     TT.options = list(path = treetaggerfilepath,
                                       preset = lang))
    lemma@TT.res[, c("token", "lemma")]
  })
  dictionnaire <- do.call(rbind, tagged)

  # The tagger sometimes gives more than one result for a token: keep the
  # first one only (does not depend on the row names being numeric).
  dictionnaire <- dictionnaire[!duplicated(dictionnaire$token), , drop = FALSE]

  # Classic term-document matrix as a data frame, with the terms in a
  # character column so that they can be joined with the dictionary.
  counts <- TermDocumentMatrix(corpus) %>%
    as.matrix() %>%
    as.data.frame(stringsAsFactors = FALSE)
  counts <- data.frame(token = rownames(counts), counts,
                       check.names = FALSE, stringsAsFactors = FALSE)

  # Attach the lemma to every term of the matrix (all terms are kept).
  tdm.lemma <- right_join(dictionnaire, counts, by = "token")

  # A term absent from the dictionary has no lemma: fall back on the term
  # itself instead of collecting those terms under an NA row.
  tdm.lemma$lemma <- as.character(tdm.lemma$lemma)
  unmatched <- is.na(tdm.lemma$lemma)
  tdm.lemma$lemma[unmatched] <- as.character(tdm.lemma$token[unmatched])

  # Sum the counts per lemma. A plain function is passed instead of the
  # deprecated funs() helper.
  tdm.lemma1 <- tdm.lemma %>%
    select(-token) %>%
    group_by(lemma) %>%
    summarise_all(sum, na.rm = TRUE) %>%
    ungroup()

  # Convert to a matrix first and name its rows afterwards: row names set
  # on a tibble are deprecated and not reliably kept.
  lemma_counts <- as.matrix(select(tdm.lemma1, -lemma))
  rownames(lemma_counts) <- tdm.lemma1$lemma

  lemma_counts
}

# lemmatization.tdm() returns its result as a plain matrix.
# NOTE(review): `tdm` below is presumably that matrix, e.g.
# tdm <- lemmatization.tdm(corp) -- the assignment is not shown here.

# Total count of each term over all documents, most frequent first.
countofterm <- sort(rowSums(tdm), decreasing = TRUE)

# Share of each term among all counted words.
numberofanalyzedwords <- sum(countofterm)
freqterm <- countofterm / numberofanalyzedwords

# Terms with a frequency above 0.1. freqterm is a named numeric vector, so
# it is subset directly: dplyr::filter() only works on data frames.
freqterm[freqterm > 0.1]