R - 维基百科文章的自动分类
R - Automatic categorization of Wikipedia articles
我一直在尝试关注 Norbert Ryciak 的 example,但我一直无法联系到他。
自从这篇文章写于 2014 年以来,R 中的一些东西发生了变化,所以我能够更新代码中的一些东西,但我卡在了最后一部分。
到目前为止,这是我的工作代码:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- TermDocumentMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
但是我没能通过这部分:
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward.D")
plot(h, labels = titles, sub = "")
我尝试直接 运行 "hclust",然后我能够绘制,但没有任何可读的结果。
这是我遇到的错误:
rownames(docsdissim2) <- titles
Error in `rownames<-`(`*tmp*`, value = c("Integral", "Riemann_integral", :
length of 'dimnames' [1] not equal to array extent
另一个:
plot(h, labels = titles, sub = "")
Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
invalid dendrogram input
有没有人可以帮我完成这个例子?
此致,
感谢 Norbert Ryciak(本教程的作者),我得以解决这个问题。
因为他使用的是旧版本的"tm"(可能是当时最新的),所以与我使用的版本不兼容。
解决方案是将 "docsTDM <- TermDocumentMatrix(docs8)" 替换为 "docsTDM <- DocumentTermMatrix(docs8)"。
所以最终代码:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- DocumentTermMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward")
plot(h, labels = titles, sub = "")
我一直在尝试关注 Norbert Ryciak 的 example,但我一直无法联系到他。
自从这篇文章写于 2014 年以来,R 中的一些东西发生了变化,所以我能够更新代码中的一些东西,但我卡在了最后一部分。
到目前为止,这是我的工作代码:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- TermDocumentMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
但是我没能通过这部分:
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward.D")
plot(h, labels = titles, sub = "")
我尝试直接 运行 "hclust",然后我能够绘制,但没有任何可读的结果。
这是我遇到的错误:
rownames(docsdissim2) <- titles
Error in `rownames<-`(`*tmp*`, value = c("Integral", "Riemann_integral", :
length of 'dimnames' [1] not equal to array extent
另一个:
plot(h, labels = titles, sub = "")
Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
invalid dendrogram input
有没有人可以帮我完成这个例子?
此致,
感谢 Norbert Ryciak(本教程的作者),我得以解决这个问题。
因为他使用的是旧版本的"tm"(可能是当时最新的),所以与我使用的版本不兼容。
解决方案是将 "docsTDM <- TermDocumentMatrix(docs8)" 替换为 "docsTDM <- DocumentTermMatrix(docs8)"。
所以最终代码:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- DocumentTermMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward")
plot(h, labels = titles, sub = "")