tm_map 删除包含停用词的单词?
tm_map to removewords containing my stop words?
我正在使用 removeWords 像下面这样过滤语料库:
# Build the corpus, normalise it, then drop English stop words and the
# banned words. removeWords only removes exact whole-word matches.
corpus <- Corpus(vs, readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(removeWords, bannedWords$V1)
然而,这只会完全匹配整个单词,所以:
- f*ck 已删除
- f*cking 没有被删除
我如何删除包含我的停用词的词?
我查看了 tm 库中 removeWords 函数的源代码,并通过扩展其中的正则表达式找到了答案,即从:
gsub(sprintf("(*UCP)\\b(%s)\\b",
改为
gsub(sprintf("(*UCP)\\b[a-zA-Z]*(%s)[a-zA-Z]*\\b",
完整函数如下:
#' Remove every word that contains one of the given words
#'
#' S3 generic that dispatches on the class of `x`.
#'
#' @param x Object to clean.
#' @param words Words to look for inside the words of `x`.
removeWordsContaining <- function(x, words) {
  UseMethod("removeWordsContaining", x)
}
#' Remove words containing any of `words` from a character vector
#'
#' Each entry of `words` is inserted into a PCRE pattern as-is, so regex
#' metacharacters in `words` keep their regex meaning. A match is extended
#' to the surrounding ASCII letters (`[a-zA-Z]*`), so the whole word that
#' contains a banned word is replaced by the empty string.
#'
#' @param x A character vector.
#' @param words Character vector (or factor) of words to look for. `NA` and
#'   empty entries are ignored.
#' @return `x` with every matching word removed; `x` unchanged when `words`
#'   has no usable entries.
removeWordsContaining.character <- function(x, words) {
  words <- as.character(words)
  # An empty alternative would match (and delete) every alphabetic word.
  words <- words[!is.na(words) & nzchar(words)]
  if (length(words) == 0L) {
    return(x)
  }
  # "\\b" is the regex word boundary; a single "\b" in an R string literal
  # is a backspace character and would never match.
  pattern <- sprintf(
    "(*UCP)\\b[a-zA-Z]*(%s)[a-zA-Z]*\\b",
    paste(sort(words, decreasing = TRUE), collapse = "|")
  )
  gsub(pattern, "", x, perl = TRUE)
}
#' PlainTextDocument method for `removeWordsContaining`
#'
#' Built by wrapping the character method with `content_transformer()`.
removeWordsContaining.PlainTextDocument <- content_transformer(
  removeWordsContaining.character
)
# Clean the blog corpus, then drop every word that contains a banned word.
blog_corpus <- Corpus(vs, readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removePunctuation) |>
  tm_map(removeNumbers) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(removeWordsContaining, bannedWords$V1)
您可以使用词干提取将禁用词恢复为基本形式。请参阅下面的示例。
library(tm)
# Banned word, given in its base form.
banned <- "buck"
text <- c(
  "He is bucking the trend",
  "A buck is not worth a dollar anymore!"
)

# Stem the documents before removing words, so removeWords is applied to
# the stemmed text.
corpus <- Corpus(VectorSource(text), readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(stemDocument) |>
  tm_map(removeWords, c(stopwords("english"), banned))

writeLines(as.character(corpus[[1]]))
trend
如果不对文档进行词干提取,您将得到:
# Same pipeline without the stemming step.
corpus <- Corpus(VectorSource(text), readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removeWords, c(stopwords("english"), banned))

writeLines(as.character(corpus[[1]]))
bucking trend
我正在使用 removeWords 像下面这样过滤语料库:
# Build the corpus, normalise it, then drop English stop words and the
# banned words. removeWords only removes exact whole-word matches.
corpus <- Corpus(vs, readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(removeWords, bannedWords$V1)
然而,这只会完全匹配整个单词,所以:
- f*ck 已删除
- f*cking 没有被删除
我如何删除包含我的停用词的词?
我查看了 tm 库中 removeWords 函数的源代码,并通过扩展其中的正则表达式找到了答案,即从:
gsub(sprintf("(*UCP)\\b(%s)\\b",
改为
gsub(sprintf("(*UCP)\\b[a-zA-Z]*(%s)[a-zA-Z]*\\b",
完整函数如下:
#' Remove every word that contains one of the given words
#'
#' S3 generic that dispatches on the class of `x`.
#'
#' @param x Object to clean.
#' @param words Words to look for inside the words of `x`.
removeWordsContaining <- function(x, words) {
  UseMethod("removeWordsContaining", x)
}
#' Remove words containing any of `words` from a character vector
#'
#' Each entry of `words` is inserted into a PCRE pattern as-is, so regex
#' metacharacters in `words` keep their regex meaning. A match is extended
#' to the surrounding ASCII letters (`[a-zA-Z]*`), so the whole word that
#' contains a banned word is replaced by the empty string.
#'
#' @param x A character vector.
#' @param words Character vector (or factor) of words to look for. `NA` and
#'   empty entries are ignored.
#' @return `x` with every matching word removed; `x` unchanged when `words`
#'   has no usable entries.
removeWordsContaining.character <- function(x, words) {
  words <- as.character(words)
  # An empty alternative would match (and delete) every alphabetic word.
  words <- words[!is.na(words) & nzchar(words)]
  if (length(words) == 0L) {
    return(x)
  }
  # "\\b" is the regex word boundary; a single "\b" in an R string literal
  # is a backspace character and would never match.
  pattern <- sprintf(
    "(*UCP)\\b[a-zA-Z]*(%s)[a-zA-Z]*\\b",
    paste(sort(words, decreasing = TRUE), collapse = "|")
  )
  gsub(pattern, "", x, perl = TRUE)
}
#' PlainTextDocument method for `removeWordsContaining`
#'
#' Built by wrapping the character method with `content_transformer()`.
removeWordsContaining.PlainTextDocument <- content_transformer(
  removeWordsContaining.character
)
# Clean the blog corpus, then drop every word that contains a banned word.
blog_corpus <- Corpus(vs, readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removePunctuation) |>
  tm_map(removeNumbers) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(removeWordsContaining, bannedWords$V1)
您可以使用词干提取将禁用词恢复为基本形式。请参阅下面的示例。
library(tm)
# Banned word, given in its base form.
banned <- "buck"
text <- c(
  "He is bucking the trend",
  "A buck is not worth a dollar anymore!"
)

# Stem the documents before removing words, so removeWords is applied to
# the stemmed text.
corpus <- Corpus(VectorSource(text), readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(stemDocument) |>
  tm_map(removeWords, c(stopwords("english"), banned))

writeLines(as.character(corpus[[1]]))
trend
如果不对文档进行词干提取,您将得到:
# Same pipeline without the stemming step.
corpus <- Corpus(VectorSource(text), readerControl = list(language = "en")) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(stripWhitespace) |>
  tm_map(removeWords, c(stopwords("english"), banned))

writeLines(as.character(corpus[[1]]))
bucking trend