tm Bigrams 解决方法仍然产生 unigrams

tm Bigrams workaround still producing unigrams

我正在尝试使用 tm 的 DocumentTermMatrix 函数生成一个包含二元词组(bigrams)而非一元词组(unigrams)的矩阵。我尝试在我的函数中使用此处和此处所概述的示例(以下共有三个示例):

# Version 1: bigram tokenizer built on tau::textcnt().
# NOTE(review): in tm >= 0.7, Corpus(VectorSource(...)) returns a SimpleCorpus,
# and DocumentTermMatrix ignores a custom tokenize function for SimpleCorpus --
# presumably why this still yields unigrams; VCorpus is needed (TODO confirm
# against the installed tm version).
make_dtm = function(main_df, stem=F){
  # textcnt() returns named n-gram counts; the rownames of the unclassed
  # data.frame carry the n-gram strings themselves, which is what a tm
  # tokenizer must return (a character vector of tokens).
  tokenize_ngrams = function(x, n=2) return(rownames(as.data.frame(unclass(textcnt(x,method="string",n=n)))))
  decisions = Corpus(VectorSource(main_df$CaseTranscriptText))
  decisions.dtm = DocumentTermMatrix(decisions, control = list(tokenize=tokenize_ngrams,
                                                           stopwords=T,
                                                           tolower=T,
                                                           removeNumbers=T,
                                                           removePunctuation=T,
                                                           stemming = stem))
  return(decisions.dtm)
}

# Version 2: bigram tokenizer built on RWeka::NGramTokenizer.
# NOTE(review): same failure mode as version 1 -- Corpus() here yields a
# SimpleCorpus, for which the custom tokenizer in `control` is not applied,
# so the DTM falls back to unigram tokenization (verify with inspect()).
make_dtm = function(main_df, stem=F){
  # min = max = 2 restricts Weka's n-gram tokenizer to bigrams only.
  BigramTokenizer = function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
  decisions = Corpus(VectorSource(main_df$CaseTranscriptText))
  decisions.dtm = DocumentTermMatrix(decisions, control = list(tokenize=BigramTokenizer,
                                                           stopwords=T,
                                                           tolower=T,
                                                           removeNumbers=T,
                                                           removePunctuation=T,
                                                           stemming = stem))
  return(decisions.dtm)
}

# Version 3: bigram tokenizer built on NLP::ngrams()/words(), i.e. the
# approach recommended in the tm FAQ.
# NOTE(review): still produces unigrams for the same reason as versions 1-2:
# Corpus() returns a SimpleCorpus, which does not honour a custom tokenizer.
make_dtm = function(main_df, stem=F){
  # Split the document into words, form all consecutive pairs, and join
  # each pair with a space so every bigram becomes a single term string.
  BigramTokenizer = function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
  decisions = Corpus(VectorSource(main_df$CaseTranscriptText))
  decisions.dtm = DocumentTermMatrix(decisions, control = list(tokenize=BigramTokenizer,
                                                           stopwords=T,
                                                           tolower=T,
                                                           removeNumbers=T,
                                                           removePunctuation=T,
                                                           stemming = stem))
  return(decisions.dtm)
}

然而,不幸的是,这三个版本的函数产生的输出完全相同:一个包含一元词(unigrams)而非二元词(bigrams)的 DTM(为简单起见,此处以图片形式展示):

为了您的方便,这里是我正在处理的数据的一个子集:

# Three-row sample of the case data: case name, neutral citation ID, and the
# (already lower-cased / cleaned) transcript text of each decision.
x <- data.frame(
  CaseName = c(
    "Attorney General's Reference (No.23 of 2011)",
    "Attorney General's Reference (No.31 of 2016)",
    "Joseph Hill & Co Solicitors, Re"
  ),
  CaseID = c(
    "[2011]EWCACrim1496",
    "[2016]EWCACrim1386",
    "[2013]EWCACrim775"
  ),
  CaseTranscriptText = c(
    "sanchez 2011 02187 6 appeal criminal division 8 2011 2011 ewca crim 14962011 wl 844075 wales wednesday 8 2011 attorney general reference 23 2011 36 criminal act 1988 representation qc general qc appeared behalf attorney general",
    "attorney general reference 31 2016 201601021 2 appeal criminal division 20 2016 2016 ewca crim 13862016 wl 05335394 dbe honour qc sitting cacd wednesday 20 th 2016 reference attorney general 36 criminal act 1988 representation",
    "matter wasted costs against company solicitors 201205544 5 appeal criminal division 21 2013 2013 ewca crim 7752013 wl 2110641 date 21 05 2013 appeal honour pawlak 20111354 hearing date 13 th 2013 representation toole respondent qc appellants"
  )
)

您的代码存在一些问题。我只关注您创建的最后一个函数,因为我不使用 tau 或 RWeka 包。

1 要使用分词器,您需要指定 tokenizer = ...,而不是 tokenize = ...

2 您需要使用 VCorpus,而不是 Corpus

3 在您的函数 make_dtm 中进行调整后,我对结果并不满意。并非控制选项中指定的所有内容都得到正确处理。我创建了第二个函数 make_dtm_adjusted 以便您可以看到两者之间的差异。

# OP's function adjusted to make it work
make_dtm = function(main_df, stem=F){
  # Tokenize a document into space-joined consecutive word pairs (bigrams).
  BigramTokenizer = function(doc) {
    pairs <- ngrams(words(doc), 2)
    unlist(lapply(pairs, paste, collapse = " "), use.names = FALSE)
  }
  # VCorpus (rather than Corpus/SimpleCorpus) is what lets the custom
  # tokenizer in `control` take effect.
  corp <- VCorpus(VectorSource(main_df$CaseTranscriptText))
  dtm <- DocumentTermMatrix(
    corp,
    control = list(tokenizer = BigramTokenizer,
                   stopwords = TRUE,
                   tolower = TRUE,
                   removeNumbers = TRUE,
                   removePunctuation = TRUE,
                   stemming = stem)
  )
  dtm
}

# improved function
make_dtm_adjusted = function(main_df, stem=F){
  # Tokenize a document into space-joined consecutive word pairs (bigrams).
  BigramTokenizer = function(doc) {
    unlist(lapply(ngrams(words(doc), 2), paste, collapse = " "),
           use.names = FALSE)
  }
  corp <- VCorpus(VectorSource(main_df$CaseTranscriptText))

  # Run each cleaning step explicitly via tm_map so all of them are
  # guaranteed to be applied before tokenization.
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, removePunctuation)
  # specifying your own stopword list is better as you can use stopwords("smart")
  # or your own list
  corp <- tm_map(corp, removeWords, stopwords("english"))
  corp <- tm_map(corp, stripWhitespace)

  DocumentTermMatrix(corp, control = list(stemming = stem,
                                          tokenizer = BigramTokenizer))
}