绘制 LDA 主题随时间的演变
Plot the evolution of an LDA topic across time
我想绘制特定主题的比例随时间变化的情况,但在隔离单个主题并按时间绘图时遇到了一些麻烦,尤其是需要分别绘制多组文档时(假设要比较两组文档:期刊 A 和期刊 B)。与这些期刊相关的日期保存在一个名为 dateConverter 的函数中。
这是我目前所拥有的(非常感谢@scoa):
# Load the text-mining and topic-modelling packages.
library(tm); library(topicmodels);
# Build a corpus from the plain-text files found in a folder (placeholder path).
txtfolder <- "~/path/to/documents/"
# NOTE(review): this variable shares its name with base::source().
source <- DirSource(txtfolder)
myCorpus <- Corpus(source, readerControl=list(reader=readPlain))
# Tag the first 10 documents with origin "A" ...
for (i in 1:10){
meta(myCorpus[[i]], tag = "origin") <- "A"
}
# ... and every remaining document with origin "B".
# NOTE(review): 11:length(myCorpus) counts downwards if the corpus holds
# fewer than 11 documents.
for (i in 11:length(myCorpus)){
meta(myCorpus[[i]], tag = "origin") <- "B"
}
# dateConverter is not defined in this snippet; its elements are combined
# with c() into a single vector that is indexed per document below.
dates <- do.call("c", dateConverter)
# Store the i-th date as the "datetimestamp" metadata of the i-th document.
for (i in 1:length(myCorpus)){
meta(myCorpus[[i]], tag = "datetimestamp") <- dates[i]
}
# Build the document-term matrix.
# NOTE(review): confirm that the installed tm version honours the
# `minWordLength` control name; current tm documents `wordLengths` instead.
dtm <- DocumentTermMatrix(myCorpus, control = list(minWordLength=3))
# Fit an LDA model with 10 topics and list the top 10 terms of each topic.
n.topics <- 10
lda.model <- LDA(dtm, n.topics)
terms(lda.model,10)
# Assemble document id, topic posterior, date and origin into one data frame.
# NOTE(review): posterior(lda.model) is passed whole here; the per-document
# topic proportions are in its $topics element, which is presumably why this
# data frame is hard to plot - verify.
df <- data.frame(id=names(topics(lda.model)),
topic=posterior(lda.model),
date=as.POSIXct(unlist(lapply(meta(myCorpus,type="local",tag="datetimestamp"),as.character))),
origin=unlist(meta(myCorpus,type="local",tag="origin")) )
如何绘制这些?
这只是我之前回答的改编:
## Load the data
library(tm)
library(topicmodels)

## Use the built-in `acq` corpus that ships with tm
data(acq)
myCorpus <- acq
n.docs <- length(myCorpus)

## Prepare the data
# Fix the RNG seed so the simulated dates are the same on every run.
set.seed(1234)

# The first 25 documents play the role of journal "A", the rest journal "B".
# Computing the labels from the document index also works for corpora that
# hold fewer than 26 documents (where 26:length(myCorpus) would run backwards).
origins <- ifelse(seq_len(n.docs) <= 25, "A", "B")

# Simulate one publication date per document, drawn with replacement from
# 8 evenly spaced dates covering 2013-01-01 to 2014-01-01.
date.pool <- seq.Date(
  as.Date("2013-01-01"),
  as.Date("2014-01-01"),
  length.out = 8
)
dates <- sample(date.pool, n.docs, replace = TRUE)

# Attach both pieces of metadata to each document in a single pass.
for (i in seq_along(myCorpus)) {
  meta(myCorpus[[i]], tag = "origin") <- origins[i]
  meta(myCorpus[[i]], tag = "datetimestamp") <- dates[i]
}

## Document-term matrix
# Keep terms of at least 3 characters. `wordLengths` is the control option
# documented by current tm; the older `minWordLength` name is not listed there.
dtm <- DocumentTermMatrix(
  myCorpus,
  control = list(wordLengths = c(3, Inf))
)

## Fit the topic model
n.topics <- 5
# Seed the LDA fit as well, so topic numbering is stable between runs.
lda.model <- LDA(dtm, n.topics, control = list(seed = 1234))

# Show the 10 most probable terms of every topic.
terms(lda.model, 10)
重塑数据以便绘图:对每个主题、日期和来源的组合,取后验概率的平均值。
library(dplyr)
library(tidyr)

# Read the per-document dates back from the corpus metadata as strings;
# vapply() guarantees exactly one character value per document.
doc.dates <- vapply(
  meta(myCorpus, type = "local", tag = "datetimestamp"),
  as.character,
  character(1)
)

# One row per document: id, date and journal of origin.
df <- data.frame(
  id = names(topics(lda.model)),
  date = as.POSIXct(doc.dates),
  origin = unlist(meta(myCorpus, type = "local", tag = "origin"))
)

# Append the document-by-topic posterior matrix, one column per topic.
dft <- cbind(df, posterior(lda.model)$topics)

# Long format: one row per (document, topic), then the mean posterior for
# every (topic, date, origin) combination. pivot_longer() replaces the
# superseded gather(); ungroup() keeps leftover grouping from leaking into
# later operations on M.
M <- dft %>%
  pivot_longer(
    cols = -c(id, date, origin),
    names_to = "topic",
    values_to = "value"
  ) %>%
  group_by(topic, date, origin) %>%
  summarize(value = mean(value)) %>%
  ungroup()
绘图
library(ggplot2)
# Mean topic posterior over time, coloured by journal of origin.
# The facet grid puts topics in rows and origins in columns.
ggplot(
  data = M,
  mapping = aes(x = date, y = value, colour = origin)
) +
  geom_point() +
  geom_line() +
  facet_grid(topic ~ origin)
我想绘制特定主题的比例随时间变化的情况,但在隔离单个主题并按时间绘图时遇到了一些麻烦,尤其是需要分别绘制多组文档时(假设要比较两组文档:期刊 A 和期刊 B)。与这些期刊相关的日期保存在一个名为 dateConverter 的函数中。
这是我目前所拥有的(非常感谢@scoa):
# Load the text-mining and topic-modelling packages.
library(tm); library(topicmodels);
# Build a corpus from the plain-text files found in a folder (placeholder path).
txtfolder <- "~/path/to/documents/"
# NOTE(review): this variable shares its name with base::source().
source <- DirSource(txtfolder)
myCorpus <- Corpus(source, readerControl=list(reader=readPlain))
# Tag the first 10 documents with origin "A" ...
for (i in 1:10){
meta(myCorpus[[i]], tag = "origin") <- "A"
}
# ... and every remaining document with origin "B".
# NOTE(review): 11:length(myCorpus) counts downwards if the corpus holds
# fewer than 11 documents.
for (i in 11:length(myCorpus)){
meta(myCorpus[[i]], tag = "origin") <- "B"
}
# dateConverter is not defined in this snippet; its elements are combined
# with c() into a single vector that is indexed per document below.
dates <- do.call("c", dateConverter)
# Store the i-th date as the "datetimestamp" metadata of the i-th document.
for (i in 1:length(myCorpus)){
meta(myCorpus[[i]], tag = "datetimestamp") <- dates[i]
}
# Build the document-term matrix.
# NOTE(review): confirm that the installed tm version honours the
# `minWordLength` control name; current tm documents `wordLengths` instead.
dtm <- DocumentTermMatrix(myCorpus, control = list(minWordLength=3))
# Fit an LDA model with 10 topics and list the top 10 terms of each topic.
n.topics <- 10
lda.model <- LDA(dtm, n.topics)
terms(lda.model,10)
# Assemble document id, topic posterior, date and origin into one data frame.
# NOTE(review): posterior(lda.model) is passed whole here; the per-document
# topic proportions are in its $topics element, which is presumably why this
# data frame is hard to plot - verify.
df <- data.frame(id=names(topics(lda.model)),
topic=posterior(lda.model),
date=as.POSIXct(unlist(lapply(meta(myCorpus,type="local",tag="datetimestamp"),as.character))),
origin=unlist(meta(myCorpus,type="local",tag="origin")) )
如何绘制这些?
这只是我之前回答的改编:
## Load the data
library(tm)
library(topicmodels)

## Use the built-in `acq` corpus that ships with tm
data(acq)
myCorpus <- acq
n.docs <- length(myCorpus)

## Prepare the data
# Fix the RNG seed so the simulated dates are the same on every run.
set.seed(1234)

# The first 25 documents play the role of journal "A", the rest journal "B".
# Computing the labels from the document index also works for corpora that
# hold fewer than 26 documents (where 26:length(myCorpus) would run backwards).
origins <- ifelse(seq_len(n.docs) <= 25, "A", "B")

# Simulate one publication date per document, drawn with replacement from
# 8 evenly spaced dates covering 2013-01-01 to 2014-01-01.
date.pool <- seq.Date(
  as.Date("2013-01-01"),
  as.Date("2014-01-01"),
  length.out = 8
)
dates <- sample(date.pool, n.docs, replace = TRUE)

# Attach both pieces of metadata to each document in a single pass.
for (i in seq_along(myCorpus)) {
  meta(myCorpus[[i]], tag = "origin") <- origins[i]
  meta(myCorpus[[i]], tag = "datetimestamp") <- dates[i]
}

## Document-term matrix
# Keep terms of at least 3 characters. `wordLengths` is the control option
# documented by current tm; the older `minWordLength` name is not listed there.
dtm <- DocumentTermMatrix(
  myCorpus,
  control = list(wordLengths = c(3, Inf))
)

## Fit the topic model
n.topics <- 5
# Seed the LDA fit as well, so topic numbering is stable between runs.
lda.model <- LDA(dtm, n.topics, control = list(seed = 1234))

# Show the 10 most probable terms of every topic.
terms(lda.model, 10)
重塑数据以便绘图:对每个主题、日期和来源的组合,取后验概率的平均值。
library(dplyr)
library(tidyr)

# Read the per-document dates back from the corpus metadata as strings;
# vapply() guarantees exactly one character value per document.
doc.dates <- vapply(
  meta(myCorpus, type = "local", tag = "datetimestamp"),
  as.character,
  character(1)
)

# One row per document: id, date and journal of origin.
df <- data.frame(
  id = names(topics(lda.model)),
  date = as.POSIXct(doc.dates),
  origin = unlist(meta(myCorpus, type = "local", tag = "origin"))
)

# Append the document-by-topic posterior matrix, one column per topic.
dft <- cbind(df, posterior(lda.model)$topics)

# Long format: one row per (document, topic), then the mean posterior for
# every (topic, date, origin) combination. pivot_longer() replaces the
# superseded gather(); ungroup() keeps leftover grouping from leaking into
# later operations on M.
M <- dft %>%
  pivot_longer(
    cols = -c(id, date, origin),
    names_to = "topic",
    values_to = "value"
  ) %>%
  group_by(topic, date, origin) %>%
  summarize(value = mean(value)) %>%
  ungroup()
绘图
library(ggplot2)
# Mean topic posterior over time, coloured by journal of origin.
# The facet grid puts topics in rows and origins in columns.
ggplot(
  data = M,
  mapping = aes(x = date, y = value, colour = origin)
) +
  geom_point() +
  geom_line() +
  facet_grid(topic ~ origin)