R - 维基百科文章的自动分类

Question

我一直在尝试按照 Norbert Ryciak 的示例（example）进行操作，但一直无法联系到他本人。

这篇文章写于 2014 年，此后 R 中的一些东西发生了变化；我已经据此更新了代码中的部分内容，但卡在了最后一部分。

到目前为止，这是我能够正常运行的代码：

 # Load the text-mining (tm), string-handling (stringi) and distance (proxy)
 # packages used below.
 library(tm)
 library(stringi)
 library(proxy)

 # Base URL to which each article title is appended.
 wiki <- "https://en.wikipedia.org/wiki/"

 # Titles of the 11 Wikipedia articles to download.
 titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral",  "Derivative",
  "Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
  "Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")

 # One slot per article for the downloaded page text.
 articles <- character(length(titles))

 # Read every page and join its lines into a single space-separated string.
 for (i in 1:length(titles)) {
   articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
  }

 # Wrap the page strings in a tm corpus.
 docs <- Corpus(VectorSource(articles))

 # Inspect the first raw document.
 docs[[1]]
 # Cleaning pipeline: replace matches of "<.+?>" and tab characters with
 # spaces, convert back to PlainTextDocument, collapse whitespace, remove
 # English stop words, remove punctuation, and finally lower-case the text.
 docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
 docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
 docs4 <- tm_map(docs3, PlainTextDocument)
 docs5 <- tm_map(docs4, stripWhitespace)
 docs6 <- tm_map(docs5, removeWords, stopwords("english"))
 docs7 <- tm_map(docs6, removePunctuation)
 docs8 <- tm_map(docs7, content_transformer(tolower))
 # Inspect the first cleaned document.
 docs8[[1]]

 # Build a term-document matrix, convert it to a dense matrix and compute
 # cosine dissimilarities.
 # NOTE(review): dist() compares the rows of docsTDM2, and a
 # TermDocumentMatrix has terms (not documents) as rows -- confirm that
 # row-wise distances between terms are really what is wanted here.
 docsTDM <- TermDocumentMatrix(docs8)
 docsTDM2 <- as.matrix(docsTDM)
 docsdissim <- dist(docsTDM2, method = "cosine")

但是我没能通过这部分：

 # Expand the dist object into a full square matrix and label its rows and
 # columns with the article titles, then print it.
 docsdissim2 <- as.matrix(docsdissim)
 rownames(docsdissim2) <- titles
 colnames(docsdissim2) <- titles
 docsdissim2
 # Hierarchical clustering on the dissimilarities, followed by a dendrogram
 # that uses the article titles as leaf labels and has no subtitle.
 h <- hclust(docsdissim, method = "ward.D")
 plot(h, labels = titles, sub = "")

我尝试直接运行 "hclust"，虽然可以绘图，但得到的图完全无法阅读。

这是我遇到的错误：

 rownames(docsdissim2) <- titles
 Error in `rownames<-`(`*tmp*`, value = c("Integral", "Riemann_integral",  : 
   length of 'dimnames' [1] not equal to array extent

另一个：

 plot(h, labels = titles, sub = "")
 Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang,  : 
   invalid dendrogram input

有没有人可以帮我完成这个例子？

此致，

Answer 1

感谢 Norbert Ryciak（本教程的作者），我得以解决这个问题。

他当时使用的是旧版本的 "tm"（在当时可能是最新版），因此教程中的代码与我现在使用的版本不兼容。

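解决方案是将 "docsTDM <- TermDocumentMatrix(docs8)" 替换为 "docsTDM <- DocumentTermMatrix(docs8)"。原因是 dist() 按行计算距离：TermDocumentMatrix 的行是词项，而 DocumentTermMatrix 的行才是文档（文章），这样得到的距离矩阵才是 11×11，与 titles 的长度一致。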

所以最终代码：

 # Cluster a handful of Wikipedia articles by the cosine dissimilarity of
 # their term frequencies and draw the resulting dendrogram.
 library(tm)
 library(stringi)
 library(proxy)  # its dist() supports method = "cosine"

 # Base URL to which each article title is appended.
 wiki <- "https://en.wikipedia.org/wiki/"

 # Titles of the articles to download; also used as labels further down.
 titles <- c(
   "Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
   "Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
   "Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien"
 )

 # Download every page and join its lines into one space-separated string.
 articles <- character(length(titles))

 for (i in seq_along(titles)) {
   page_lines <- readLines(stri_paste(wiki, titles[i]))
   articles[i] <- stri_flatten(page_lines, collapse = " ")
 }

 # VCorpus stores each article as its own PlainTextDocument, so no manual
 # conversion back to PlainTextDocument is needed after the custom steps.
 docs <- VCorpus(VectorSource(articles))

 # Inspect the first raw document.
 docs[[1]]

 # Custom functions are wrapped in content_transformer() so that tm_map()
 # returns documents instead of bare character vectors.
 strip_tags <- content_transformer(
   function(x) stri_replace_all_regex(x, "<.+?>", " ")
 )
 tabs_to_spaces <- content_transformer(
   function(x) stri_replace_all_fixed(x, "\t", " ")
 )

 docs8 <- tm_map(docs, strip_tags)
 docs8 <- tm_map(docs8, tabs_to_spaces)
 # Lower-case BEFORE removing stop words: stopwords("english") is all lower
 # case, so capitalised stop words ("The", "And") would otherwise survive.
 docs8 <- tm_map(docs8, content_transformer(tolower))
 # Stop words go before punctuation so that entries such as "don't" still
 # match while their apostrophe is intact.
 docs8 <- tm_map(docs8, removeWords, stopwords("english"))
 docs8 <- tm_map(docs8, removePunctuation)
 # Collapse whitespace last, because the removals above leave gaps behind.
 docs8 <- tm_map(docs8, stripWhitespace)

 # Inspect the first cleaned document.
 docs8[[1]]

 # Documents must be the rows: dist() compares rows, and the dissimilarity
 # matrix needs exactly one row/column per article.
 docsTDM <- DocumentTermMatrix(docs8)
 docsTDM2 <- as.matrix(docsTDM)
 docsdissim <- dist(docsTDM2, method = "cosine")

 # Full square matrix labelled with the article titles, printed for review.
 docsdissim2 <- as.matrix(docsdissim)
 rownames(docsdissim2) <- titles
 colnames(docsdissim2) <- titles
 docsdissim2

 # "ward" is the legacy name of this method; current R calls it "ward.D"
 # and only accepts the old spelling with a message.
 h <- hclust(docsdissim, method = "ward.D")
 plot(h, labels = titles, sub = "")

R - 维基百科文章的自动分类

R - Automatic categorization of Wikipedia articles

r

text-classification