R - 维基百科文章的自动分类

R - Automatic categorization of Wikipedia articles

我一直在尝试按照 Norbert Ryciak 的 example(示例)一步步操作,但我一直无法联系到他本人。

这篇文章写于 2014 年,此后 R 中的一些东西发生了变化,所以我更新了代码中的部分内容,但卡在了最后一部分。

到目前为止,这是我的工作代码:

 library(tm)
 library(stringi)
 library(proxy)

 # Base URL that every article title below is appended to.
 wiki <- "https://en.wikipedia.org/wiki/"

 # Wikipedia page titles to download: 5 mathematics articles, 3 painters,
 # 3 writers (11 documents in total).
 titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral",  "Derivative",
  "Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
  "Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")

 # Pre-allocate one string slot per article.
 articles <- character(length(titles))

 # Read each page line by line and join the lines into one string separated
 # by spaces (`col` presumably partial-matches stri_flatten's `collapse`
 # argument -- TODO confirm).
 for (i in 1:length(titles)) {
   articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
  }

 # One corpus document per downloaded article.
 docs <- Corpus(VectorSource(articles))

 # Print the first raw document for inspection.
 docs[[1]]
 # Replace every "<...>" span (HTML-tag-like text) with a space.
 docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
 # Replace tab characters with spaces.
 docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
 # Re-wrap the results as PlainTextDocument objects; presumably needed because
 # the two anonymous functions above are not content_transformer()s -- verify
 # against the installed tm version.
 docs4 <- tm_map(docs3, PlainTextDocument)
 docs5 <- tm_map(docs4, stripWhitespace)
 # NOTE(review): stop words are removed before lower-casing (docs8), so
 # capitalised stop words presumably survive this step.
 docs6 <- tm_map(docs5, removeWords, stopwords("english"))
 docs7 <- tm_map(docs6, removePunctuation)
 docs8 <- tm_map(docs7, content_transformer(tolower))
 # Print the first cleaned document for inspection.
 docs8[[1]]

 # A TermDocumentMatrix has terms as rows and documents as columns, so the
 # row-wise dist() below measures dissimilarity between terms, not between
 # the 11 articles.
 docsTDM <- TermDocumentMatrix(docs8)
 docsTDM2 <- as.matrix(docsTDM)
 docsdissim <- dist(docsTDM2, method = "cosine")

但是我没能通过这部分:

 # Expand the dist object into a full square matrix.
 docsdissim2 <- as.matrix(docsdissim)
 # Label rows/columns with the article titles; this only works when the
 # matrix has exactly length(titles) rows and columns.
 rownames(docsdissim2) <- titles
 colnames(docsdissim2) <- titles
 # Print the labelled dissimilarity matrix.
 docsdissim2
 # Hierarchical clustering on the dist object using the "ward.D" method.
 h <- hclust(docsdissim, method = "ward.D")
 # Draw the dendrogram with the article titles as leaf labels and no subtitle.
 plot(h, labels = titles, sub = "")

我也尝试过直接运行 "hclust",这样可以画出图,但结果完全无法辨读。

这是我遇到的错误:

 rownames(docsdissim2) <- titles
 Error in `rownames<-`(`*tmp*`, value = c("Integral", "Riemann_integral",  : 
   length of 'dimnames' [1] not equal to array extent

另一个:

 plot(h, labels = titles, sub = "")
 Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang,  : 
   invalid dendrogram input

有没有人可以帮我完成这个例子?

此致,

感谢 Norbert Ryciak(本教程的作者),我得以解决这个问题。

因为他使用的是旧版本的"tm"(可能是当时最新的),所以与我使用的版本不兼容。

解决方案是将 "docsTDM <- TermDocumentMatrix(docs8)" 替换为 "docsTDM <- DocumentTermMatrix(docs8)"。

所以最终代码:

 library(tm)
 library(stringi)
 library(proxy)

 # Cluster a handful of Wikipedia articles by the cosine dissimilarity of
 # their word counts and draw the resulting dendrogram.
 #
 # Notes on this version:
 #   * custom cleaners are wrapped in content_transformer(), so the documents
 #     do not have to be re-wrapped with PlainTextDocument afterwards;
 #   * text is lower-cased BEFORE stop words are removed, so capitalised stop
 #     words ("The", "And", ...) are dropped as well;
 #   * hclust() uses "ward.D", the current name of the former "ward" method.

 # Base URL that every article title is appended to.
 wiki <- "https://en.wikipedia.org/wiki/"

 # Wikipedia page titles: 5 mathematics articles, 3 painters, 3 writers.
 titles <- c(
   "Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
   "Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
   "Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien"
 )

 # Download one page and collapse its lines into a single string.
 fetch_article <- function(title) {
   page_lines <- readLines(
     stri_paste(wiki, title),
     warn = FALSE,        # pages often lack a trailing newline
     encoding = "UTF-8"   # Wikipedia serves UTF-8
   )
   stri_flatten(page_lines, collapse = " ")
 }

 # vapply guarantees exactly one string per title (and copes with an empty
 # `titles` vector, unlike 1:length(titles)).
 articles <- vapply(titles, fetch_article, character(1), USE.NAMES = FALSE)

 docs <- Corpus(VectorSource(articles))

 # Print the first raw document for inspection.
 docs[[1]]

 # Content transformers: they modify the text but keep the document class.
 strip_tags <- content_transformer(
   function(x) stri_replace_all_regex(x, "<.+?>", " ")
 )
 tabs_to_spaces <- content_transformer(
   function(x) stri_replace_all_fixed(x, "\t", " ")
 )

 # Cleaning pipeline. The result keeps the name `docs8` used below.
 docs8 <- tm_map(docs, strip_tags)
 docs8 <- tm_map(docs8, tabs_to_spaces)
 docs8 <- tm_map(docs8, content_transformer(tolower))
 # Stop words go before punctuation so entries such as "don't" still match.
 docs8 <- tm_map(docs8, removeWords, stopwords("english"))
 docs8 <- tm_map(docs8, removePunctuation)
 # Collapse the runs of blanks left behind by the removals above.
 docs8 <- tm_map(docs8, stripWhitespace)

 # Print the first cleaned document for inspection.
 docs8[[1]]

 # Documents must be the ROWS, because dist() compares rows; a
 # TermDocumentMatrix here would yield term-by-term distances instead.
 docsTDM <- DocumentTermMatrix(docs8)
 docsTDM2 <- as.matrix(docsTDM)
 stopifnot(nrow(docsTDM2) == length(titles))

 # proxy::dist (not stats::dist) provides the "cosine" method.
 docsdissim <- proxy::dist(docsTDM2, method = "cosine")

 # Labelled square matrix, printed for inspection.
 docsdissim2 <- as.matrix(docsdissim)
 rownames(docsdissim2) <- titles
 colnames(docsdissim2) <- titles
 docsdissim2

 # Ward clustering on the dissimilarities, then the dendrogram.
 h <- hclust(docsdissim, method = "ward.D")
 plot(h, labels = titles, sub = "")