tm_map 删除包含停用词的单词?

Using tm_map to remove words containing my stop words?

我正在应用 removeWords 过滤这样的语料库:

# Build the corpus from the source `vs`, then lower-case it, collapse runs of
# whitespace, and drop both the English stop words and the custom banned words.
corpus <- Corpus(vs, readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(removeWords, bannedWords$V1)

然而,这只会匹配完整的单词(例如,停用词 "buck" 不会删除 "bucking"),所以:

我如何删除包含我的停用词的词?

我查看了 tm 源代码中的 removeWords 函数,并把其中的正则表达式从下面的第一种形式扩展为第二种形式,从而找到了答案:

gsub(sprintf("(*UCP)\\b(%s)\\b",

gsub(sprintf("(*UCP)\\b[a-zA-Z]*(%s)[a-zA-Z]*\\b",

完整的函数如下:

#' Remove every word that contains one of the given fragments
#'
#' S3 generic; the method is selected from the class of `x`.
#'
#' @param x The object to clean.
#' @param words The fragments to look for; passed on to the selected method.
#' @return Whatever the selected method returns.
removeWordsContaining <- function(x, words) {
  UseMethod("removeWordsContaining", x)
}
#' Remove every word that contains one of the given fragments (character method)
#'
#' @param x A character vector of text.
#' @param words A vector of fragments. They are inserted into the pattern as
#'   regular-expression alternatives and are not escaped. `NA` and empty
#'   entries are ignored.
#' @return `x` with every run of ASCII letters that contains a fragment
#'   deleted. Surrounding whitespace is left in place. If no usable fragment
#'   is supplied, `x` is returned unchanged.
removeWordsContaining.character <- function(x, words) {
  # Coerce first so that a factor column (e.g. from read.table) is accepted.
  words <- as.character(words)
  # An empty alternative would match, and therefore delete, every word.
  words <- words[!is.na(words) & nzchar(words)]
  if (length(words) == 0L) {
    return(x)
  }
  alternatives <- paste(sort(words, decreasing = TRUE), collapse = "|")
  # The backslash must be doubled: "\b" in an R string is a backspace
  # character, whereas the regex engine needs the two characters `\b` to
  # mean "word boundary".
  pattern <- sprintf("(*UCP)\\b[a-zA-Z]*(%s)[a-zA-Z]*\\b", alternatives)
  gsub(pattern, "", x, perl = TRUE)
}
# PlainTextDocument method: wraps the character method with
# content_transformer() so that it can be applied to documents via tm_map().
removeWordsContaining.PlainTextDocument <- content_transformer(
  removeWordsContaining.character
)

# Clean the corpus step by step: lower-case, collapse whitespace, strip
# punctuation and numbers, drop English stop words, and finally apply
# removeWordsContaining() with the banned-word list.
blog_corpus <- Corpus(vs, readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removePunctuation) |>
  tm_map(removeNumbers) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(removeWordsContaining, bannedWords$V1)

您可以使用词干提取,把文档中的单词还原为基本形式,使其能够与禁用词匹配。请参阅下面的示例。

library(tm)

# One banned word and two sample sentences to filter.
banned <- "buck"
text <- c(
  "He is bucking the trend",
  "A buck is not worth a dollar anymore!"
)

# Stem the documents before removing words, so the banned word is compared
# against stemmed tokens rather than the original inflected forms.
corpus <- Corpus(VectorSource(text), readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(stemDocument) |>
  tm_map(removeWords, c(stopwords("english"), banned))

# Print the first cleaned document.
corpus[[1]] |>
  as.character() |>
  writeLines()
  trend

如果您不对文档进行词干提取,您将得到:

# The same pipeline without the stemming step: removeWords() is applied to
# the unstemmed text.
corpus <- Corpus(VectorSource(text), readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removeWords, c(stopwords("english"), banned))

# Print the first cleaned document.
corpus[[1]] |>
  as.character() |>
  writeLines()
  bucking  trend