如何将 tm_map() 输出保存到 csv 文件?
How to save tm_map() output to csv file?
我正在分析来自 mashable.com 的新闻文章。我创建的数据如下所示(目前有 14 篇文章,因子为 popular 或 not_popular)
id内容因子
1 一些文字数据流行
我想使用 Jonathan Chang 的 LDA 包对此数据进行监督主题建模。我尝试对数据进行一些预处理,这里是相同的脚本
# --- Asker's original preprocessing script, reproduced verbatim ---
# NOTE(review): prefer library() over require() for loading — require()
# returns FALSE instead of erroring when a package is missing.
require("ggplot2")
require("grid")
require("plyr")
library(reshape)
library(ScottKnott)
# NOTE(review): hard-coded working directory — not portable across machines.
setwd("~/Desktop")
library(lda)
library(tm)
dataValues<- read.csv('Business.csv')
dim(dataValues)
## Text Pre-processing.
## Creating a Corpus from the Orginal Function
## interprets each element of the vector x as a document
CorpusObj<- VectorSource(dataValues$content);
CorpusObj<-Corpus(CorpusObj);
# remove \r and \n
remove.carrigae <- function(x) gsub("[\r\n]", "", x)
# NOTE(review): since tm >= 0.6, custom functions passed to tm_map() must be
# wrapped in content_transformer(); calling them bare (here and below for
# removeURL / removeSPE) is a likely source of the corpus corruption the
# question describes — TODO confirm the installed tm version.
CorpusObj = tm_map(CorpusObj,remove.carrigae)
#remove Hyperlinks
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
CorpusObj <- tm_map(CorpusObj, removeURL)
#remove special char
removeSPE <- function(x) gsub("[^a-zA-Z0-9]", " ", x)
CorpusObj <- tm_map(CorpusObj, removeSPE)
CorpusObj <- tm_map(CorpusObj, removePunctuation)
CorpusObj <- tm_map(CorpusObj, removeNumbers)
#CorpusObj <- tm_map(CorpusObj, removeWords, stopwords("english"))
CorpusObj <- tm_map(CorpusObj, stemDocument, language = "english") #Stemming the words
CorpusObj<-tm_map(CorpusObj,stripWhitespace)
#CorpusObj <- tm_map(CorpusObj, tolower) # convert all text to lower case
inspect(CorpusObj[14])
CorpusObj <- tm_map(CorpusObj, PlainTextDocument)
#save in indiv text file
writeCorpus(CorpusObj, path = "~/Desktop/untitled_folder")
#write 1 file
# NOTE(review): as.character() applied to a whole Corpus flattens its internal
# list structure — it does not produce one line per document, which is why the
# resulting text file is "human-unfriendly".
writeLines(as.character(CorpusObj), con="mycorpus.txt")
inspect(CorpusObj[14])
我想保存
的输出
CorpusObj <- tm_map(CorpusObj, PlainTextDocument)
到一个 .csv 文件并希望每一行(单元格)是 1 个文档
函数 writeCorpus(CorpusObj, path = "~/Desktop/untitled_folder")
正在将最后一个文档写入文本文件。
当我尝试使用函数时 corpusLDA <- lexicalize(CorpusObj )
在 PlainTextDocument 之后,我得到如下输出:所有文档都在 [1:2, 1:6007] 中,而另外两个列表为空。
请指导我哪里出错了。谢谢。
当我检查此脚本创建的 .txt
文件时,我看到了所有不同的文档。然而,它们采用人类不友好的格式。
这是我认为你想要的:
# --- Corrected preprocessing pipeline ---
# Load all required packages in one call (pacman installs any that are missing).
pacman::p_load("ggplot2", grid, plyr, reshape, ScottKnott, lda, tm)

dataValues <- read.csv("business.csv")
dim(dataValues)

## Text pre-processing: each element of the `content` column becomes
## one document in the corpus.
CorpusObj <- VectorSource(dataValues$content)
CorpusObj <- Corpus(CorpusObj)

# Since tm >= 0.6, custom transformations must be wrapped in
# content_transformer(); otherwise tm_map() errors or corrupts the documents.
# Remove carriage returns and newlines.
remove.carrigae <- function(x) gsub("[\r\n]", "", x)
CorpusObj <- tm_map(CorpusObj, content_transformer(remove.carrigae))
# Remove hyperlinks.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
CorpusObj <- tm_map(CorpusObj, content_transformer(removeURL))
# Replace special characters with spaces.
removeSPE <- function(x) gsub("[^a-zA-Z0-9]", " ", x)
CorpusObj <- tm_map(CorpusObj, content_transformer(removeSPE))
CorpusObj <- tm_map(CorpusObj, removePunctuation)
CorpusObj <- tm_map(CorpusObj, removeNumbers)
#CorpusObj <- tm_map(CorpusObj, removeWords, stopwords("english"))
CorpusObj <- tm_map(CorpusObj, stemDocument, language = "english") # stem the words
CorpusObj <- tm_map(CorpusObj, stripWhitespace)
#CorpusObj <- tm_map(CorpusObj, content_transformer(tolower)) # lower-case all text
inspect(CorpusObj[14])
CorpusObj <- tm_map(CorpusObj, PlainTextDocument)

# Save each document as an individual text file.
writeCorpus(CorpusObj)

# Write the whole corpus to a single CSV, one row per document: pull each
# document's content out of the corpus into a character column.
dataframe <- data.frame(
  text = unlist(sapply(CorpusObj, `[`, "content")),
  stringsAsFactors = FALSE
)
write.csv(dataframe, "output.csv")
我正在分析来自 mashable.com 的新闻文章。我创建的数据如下所示(目前有 14 篇文章,因子为 popular 或 not_popular)
id内容因子
1 一些文字数据流行
我想使用 Jonathan Chang 的 LDA 包对此数据进行监督主题建模。我尝试对数据进行一些预处理,这里是相同的脚本
# --- Asker's original preprocessing script, reproduced verbatim ---
# NOTE(review): prefer library() over require() for loading — require()
# returns FALSE instead of erroring when a package is missing.
require("ggplot2")
require("grid")
require("plyr")
library(reshape)
library(ScottKnott)
# NOTE(review): hard-coded working directory — not portable across machines.
setwd("~/Desktop")
library(lda)
library(tm)
dataValues<- read.csv('Business.csv')
dim(dataValues)
## Text Pre-processing.
## Creating a Corpus from the Orginal Function
## interprets each element of the vector x as a document
CorpusObj<- VectorSource(dataValues$content);
CorpusObj<-Corpus(CorpusObj);
# remove \r and \n
remove.carrigae <- function(x) gsub("[\r\n]", "", x)
# NOTE(review): since tm >= 0.6, custom functions passed to tm_map() must be
# wrapped in content_transformer(); calling them bare (here and below for
# removeURL / removeSPE) is a likely source of the corpus corruption the
# question describes — TODO confirm the installed tm version.
CorpusObj = tm_map(CorpusObj,remove.carrigae)
#remove Hyperlinks
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
CorpusObj <- tm_map(CorpusObj, removeURL)
#remove special char
removeSPE <- function(x) gsub("[^a-zA-Z0-9]", " ", x)
CorpusObj <- tm_map(CorpusObj, removeSPE)
CorpusObj <- tm_map(CorpusObj, removePunctuation)
CorpusObj <- tm_map(CorpusObj, removeNumbers)
#CorpusObj <- tm_map(CorpusObj, removeWords, stopwords("english"))
CorpusObj <- tm_map(CorpusObj, stemDocument, language = "english") #Stemming the words
CorpusObj<-tm_map(CorpusObj,stripWhitespace)
#CorpusObj <- tm_map(CorpusObj, tolower) # convert all text to lower case
inspect(CorpusObj[14])
CorpusObj <- tm_map(CorpusObj, PlainTextDocument)
#save in indiv text file
writeCorpus(CorpusObj, path = "~/Desktop/untitled_folder")
#write 1 file
# NOTE(review): as.character() applied to a whole Corpus flattens its internal
# list structure — it does not produce one line per document, which is why the
# resulting text file is "human-unfriendly".
writeLines(as.character(CorpusObj), con="mycorpus.txt")
inspect(CorpusObj[14])
我想保存
的输出CorpusObj <- tm_map(CorpusObj, PlainTextDocument)
到一个 .csv 文件并希望每一行(单元格)是 1 个文档
函数 writeCorpus(CorpusObj, path = "~/Desktop/untitled_folder")
正在将最后一个文档写入文本文件。
当我尝试使用函数时 corpusLDA <- lexicalize(CorpusObj )
在 PlainTextDocument 之后,我得到如下输出:所有文档都在 [1:2, 1:6007] 中,而另外两个列表为空。
请指导我哪里出错了。谢谢。
当我检查此脚本创建的 .txt
文件时,我看到了所有不同的文档。然而,它们采用人类不友好的格式。
这是我认为你想要的:
# --- Corrected preprocessing pipeline ---
# Load all required packages in one call (pacman installs any that are missing).
pacman::p_load("ggplot2", grid, plyr, reshape, ScottKnott, lda, tm)

dataValues <- read.csv("business.csv")
dim(dataValues)

## Text pre-processing: each element of the `content` column becomes
## one document in the corpus.
CorpusObj <- VectorSource(dataValues$content)
CorpusObj <- Corpus(CorpusObj)

# Since tm >= 0.6, custom transformations must be wrapped in
# content_transformer(); otherwise tm_map() errors or corrupts the documents.
# Remove carriage returns and newlines.
remove.carrigae <- function(x) gsub("[\r\n]", "", x)
CorpusObj <- tm_map(CorpusObj, content_transformer(remove.carrigae))
# Remove hyperlinks.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
CorpusObj <- tm_map(CorpusObj, content_transformer(removeURL))
# Replace special characters with spaces.
removeSPE <- function(x) gsub("[^a-zA-Z0-9]", " ", x)
CorpusObj <- tm_map(CorpusObj, content_transformer(removeSPE))
CorpusObj <- tm_map(CorpusObj, removePunctuation)
CorpusObj <- tm_map(CorpusObj, removeNumbers)
#CorpusObj <- tm_map(CorpusObj, removeWords, stopwords("english"))
CorpusObj <- tm_map(CorpusObj, stemDocument, language = "english") # stem the words
CorpusObj <- tm_map(CorpusObj, stripWhitespace)
#CorpusObj <- tm_map(CorpusObj, content_transformer(tolower)) # lower-case all text
inspect(CorpusObj[14])
CorpusObj <- tm_map(CorpusObj, PlainTextDocument)

# Save each document as an individual text file.
writeCorpus(CorpusObj)

# Write the whole corpus to a single CSV, one row per document: pull each
# document's content out of the corpus into a character column.
dataframe <- data.frame(
  text = unlist(sapply(CorpusObj, `[`, "content")),
  stringsAsFactors = FALSE
)
write.csv(dataframe, "output.csv")