R tm removeWords function not removing words
I am trying to remove some words from a corpus I have built, but it doesn't seem to be working. I first run through everything and create a data frame that lists my words in order of their frequency. I use this list to identify the words I am not interested in, and then try to create a new corpus with those words removed. However, the words remain in my dataset. I am wondering what I am doing wrong, and why these words are not being removed? I have included the full code below:
install.packages("rvest")
install.packages("tm")
install.packages("SnowballC")
install.packages("stringr")
library(stringr)
library(tm)
library(SnowballC)
library(rvest)
# Pull in the data I have been using.
paperList <- read_html("http://journals.plos.org/plosone/search?q=nutrigenomics&sortOrder=RELEVANCE&filterJournals=PLoSONE&resultsPerPage=192")
paperURLs <- paperList %>%
html_nodes(xpath="//*[@class='search-results-title']/a") %>%
html_attr("href")
paperURLs <- paste("http://journals.plos.org", paperURLs, sep = "")
paper_html <- sapply(1:length(paperURLs), function(x) read_html(paperURLs[x]))
paperText <- sapply(1:length(paper_html), function(x) paper_html[[x]] %>%
html_nodes(xpath="//*[@class='article-content']") %>%
html_text() %>%
str_trim(.))
# Create corpus
paperCorp <- Corpus(VectorSource(paperText))
for(j in seq(paperCorp))
{
paperCorp[[j]] <- gsub(":", " ", paperCorp[[j]])
paperCorp[[j]] <- gsub("\n", " ", paperCorp[[j]])
paperCorp[[j]] <- gsub("-", " ", paperCorp[[j]])
}
paperCorp <- tm_map(paperCorp, removePunctuation)
paperCorp <- tm_map(paperCorp, removeNumbers)
paperCorp <- tm_map(paperCorp, removeWords, stopwords("english"))
paperCorp <- tm_map(paperCorp, stemDocument)
paperCorp <- tm_map(paperCorp, stripWhitespace)
paperCorpPTD <- tm_map(paperCorp, PlainTextDocument)
dtm <- DocumentTermMatrix(paperCorpPTD)
termFreq <- colSums(as.matrix(dtm))
head(termFreq)
tf <- data.frame(term = names(termFreq), freq = termFreq)
tf <- tf[order(-tf[,2]),]
head(tf)
# After having identified words I am not interested in
# create new corpus with these words removed.
paperCorp1 <- tm_map(paperCorp, removeWords, c("also", "article", "Article",
"download", "google", "figure",
"fig", "groups","Google", "however",
"high", "human", "levels",
"larger", "may", "number",
"shown", "study", "studies", "this",
"using", "two", "the", "Scholar",
"pubmedncbi", "PubMedNCBI",
"view", "View", "the", "biol",
"via", "image", "doi", "one",
"analysis"))
paperCorp1 <- tm_map(paperCorp1, stripWhitespace)
paperCorpPTD1 <- tm_map(paperCorp1, PlainTextDocument)
dtm1 <- DocumentTermMatrix(paperCorpPTD1)
termFreq1 <- colSums(as.matrix(dtm1))
tf1 <- data.frame(term = names(termFreq1), freq = termFreq1)
tf1 <- tf1[order(-tf1[,2]),]
head(tf1, 100)
If you look through tf1, you will see that many of the words specified for removal have not actually been removed.
I am just wondering what I am doing wrong, and how I can remove these words from my data?
NOTE: removeWords is doing something, because the output of head(tf, 100) and head(tf1, 100) is not identical. So removeWords appears to remove some instances of the words I am trying to get rid of, but not all instances.
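A quick check on a plain string (assuming the tm package is loaded; this throwaway example is not part of the pipeline above) suggests that removeWords matches whole words case-sensitively, which would explain the partial removal:
# removeWords also works on a plain character vector
removeWords("Study study STUDY", "study")
# [1] "Study  STUDY"   <- only the exact lowercase match is dropped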
I switched some of the code around and added to it below. The stopwords are all lowercase, so you need to lowercase everything before you remove the stopwords.
paperCorp <- tm_map(paperCorp, removePunctuation)
paperCorp <- tm_map(paperCorp, removeNumbers)
# added tolower
paperCorp <- tm_map(paperCorp, tolower)
paperCorp <- tm_map(paperCorp, removeWords, stopwords("english"))
# moved stripWhitespace
paperCorp <- tm_map(paperCorp, stripWhitespace)
paperCorp <- tm_map(paperCorp, stemDocument)
The capitalised words are no longer needed, since we have set everything to lowercase. You could drop them from the word list.
paperCorp <- tm_map(paperCorp, removeWords, c("also", "article", "Article",
"download", "google", "figure",
"fig", "groups","Google", "however",
"high", "human", "levels",
"larger", "may", "number",
"shown", "study", "studies", "this",
"using", "two", "the", "Scholar",
"pubmedncbi", "PubMedNCBI",
"view", "View", "the", "biol",
"via", "image", "doi", "one",
"analysis"))
paperCorpPTD <- tm_map(paperCorp, PlainTextDocument)
dtm <- DocumentTermMatrix(paperCorpPTD)
termFreq <- colSums(as.matrix(dtm))
head(termFreq)
tf <- data.frame(term = names(termFreq), freq = termFreq)
tf <- tf[order(-tf[,2]),]
head(tf)
           term  freq
fatty     fatty 29568
pparα     ppara 23232
acids     acids 22848
gene       gene 15360
dietary dietary 12864
scholar scholar 11904
tf[tf$term == "study", ]
[1] term freq
<0 rows> (or 0-length row.names)
As you can see from the result, study is no longer in the corpus. The rest of the words on the list are gone as well.
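To verify the whole removal list in one pass rather than term by term, something along these lines should work (removeList is just a hypothetical name here for the character vector that was passed to removeWords above):
# which of the removed words still appear as terms? An empty result means all are gone.
removeList <- c("also", "article", "download", "google", "figure")  # abbreviated
tf[tf$term %in% removeList, ]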
If anyone encounters the same error I did and the above solution still doesn't work, try using
paperCorp <- tm_map(paperCorp, content_transformer(tolower))
instead of paperCorp <- tm_map(paperCorp, tolower)
because tolower() is a function from the base package and returns a different structure (it changes something about the type of the result), so afterwards you cannot use, for example, paperCorp[[j]]$content, but only paperCorp[[j]]. A bit off-topic, but it might help someone.
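On the same note, the gsub() calls in the question's for loop can also be wrapped in content_transformer(), which keeps the corpus documents intact. A minimal sketch (toSpace is just an illustrative name):
# wrap gsub() so that tm_map() preserves the document structure
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
paperCorp <- tm_map(paperCorp, toSpace, ":")
paperCorp <- tm_map(paperCorp, toSpace, "\n")
paperCorp <- tm_map(paperCorp, toSpace, "-")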