Find 2-word phrases using tm in R

I know this question has been asked many times before. For example:

Finding 2 & 3 word Phrases Using R TM Package

However, none of those solutions work on my data, and I cannot tell why. No matter which n I choose for the ngrams (2, 3, or 4), the result is always unigrams.

Does anyone know the reason? I suspect the encoding is the cause.

Edited: here is a small sample of the data.

comments <- c("Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into problem_70918\n", 
"Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into tm-247\n", 
"Merge branch 'php5.3-upgrade-sprint6-7' of git.internal.net:/git/pn-project/LegacyCodebase into release2012.08\n", 
"Merge remote-tracking branch 'dmann1/p71148-s3-callplan_mapping' into lcst-operational-changes\n", 
"Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into TASK-360148\n", 
"Merge remote-tracking branch 'grockett/rpr-pre' into rpr-lite\n"
)
library(tm)

cleanCorpus <- function(vector){
  corpus <- Corpus(VectorSource(vector), readerControl = list(language = "en_US"))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, tolower)
  #corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removePunctuation)
  #corpus <- tm_map(corpus, PlainTextDocument)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  return(corpus)
}
# this function was provided by a team member (see the link I posted above)
test <- function(keywords_doc){

  BigramTokenizer <-  function(x)
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
  # create the term-document matrix, tokenizing each document into bigrams
  keywords_matrix <- TermDocumentMatrix(keywords_doc, control = list(tokenize = BigramTokenizer))

  # remove sparse terms 
  keywords_naremoval <- removeSparseTerms(keywords_matrix, 0.99)

  # frequency of each term across all documents
  keyword.freq <- rowSums(as.matrix(keywords_naremoval))
  subsetkeyword.freq <- subset(keyword.freq, keyword.freq >= 20)
  frequentKeywordSubsetDF <- data.frame(term = names(subsetkeyword.freq), freq = subsetkeyword.freq) 

  # sort both data frames by descending frequency
  frequentKeywordDF <- data.frame(term = names(keyword.freq), freq = keyword.freq)
  frequentKeywordSubsetDF <- frequentKeywordSubsetDF[order(-frequentKeywordSubsetDF$freq), ]
  frequentKeywordDF <- frequentKeywordDF[order(-frequentKeywordDF$freq), ]

  # plot the words (word cloud left commented out)
  # wordcloud(frequentKeywordDF$term, freq=frequentKeywordDF$freq, random.order = FALSE, rot.per=0.35, scale=c(5,0.5), min.freq = 30, colors = brewer.pal(8,"Dark2"))
  return(frequentKeywordDF)
}

corpus <- cleanCorpus(comments)
t <- test(corpus)
> head(t)
             term freq
added       added    6
html         html    6
tracking tracking    6
common     common    4
emails     emails    4
template template    4
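
For what it's worth, the bigram tokenizer behaves as expected when called on a document directly, so the tokenizer itself does not seem to be the problem. A minimal check (the sample string here is made up):

library(tm)  # attaches NLP, which provides ngrams() and words()

BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)

# words() has a method for PlainTextDocument, so wrap the raw string first
doc <- PlainTextDocument("merge branch master into release")
BigramTokenizer(doc)
# "merge branch"  "branch master"  "master into"  "into release"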

Thanks,

I have not found the cause either, but if you are only interested in the counts, regardless of which documents the bigrams occurred in, you can get them alternatively via this pipeline:

library(tm)
library(dplyr)
library(quanteda)

# ..construct the corpus as in your post ...

corpus %>% 
  unlist() %>%                                # flatten the corpus into a character vector
  tokens() %>%                                # tokenize with quanteda
  tokens_ngrams(2:2, concatenator = " ") %>%  # form bigrams, joined by a space
  unlist() %>% 
  as.data.frame() %>%                         # single column named "." (from the magrittr placeholder)
  group_by_(".") %>%                          # group by that column; group_by_ is deprecated in newer dplyr
  summarize(cnt = n()) %>%
  arrange(desc(cnt))                          # most frequent bigrams first
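
For what it's worth, a frequently reported cause of exactly this symptom (stated here as an assumption, since I cannot verify the poster's tm version): from tm 0.7 on, Corpus(VectorSource(...)) builds a SimpleCorpus, and TermDocumentMatrix ignores the tokenize entry in control for SimpleCorpus objects, silently falling back to its default unigram tokenizer. Switching to VCorpus usually brings the bigrams back; a minimal sketch under that assumption:

library(tm)

BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)

# VCorpus, unlike the SimpleCorpus returned by Corpus(VectorSource(...)),
# passes a custom tokenizer through to TermDocumentMatrix
corpus <- VCorpus(VectorSource(comments), readerControl = list(language = "en_US"))
corpus <- tm_map(corpus, content_transformer(tolower))  # tolower must be wrapped for a VCorpus
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))

tdm <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
inspect(tdm)  # the terms should now be two-word phrases

The quanteda pipeline above sidesteps the issue entirely, since it never goes through tm's tokenizer dispatch.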