无法使用 LDA 训练模型对主题进行分类
Unable to classify topics using LDA trained model
我使用 Gensim 创建了一个 LDA 模型,我首先在 3 到 10 的范围内从 num_topics 迭代,并基于 pyLDAvis 图,在最终的 lda 模型中选择 n = 3。
import glob
import sys
sys.path.append('/Users/tcssig/Documents/NLP_code_base/Doc_Similarity')
import normalization
from gensim.models.coherencemodel import CoherenceModel
datalist = []
for filename in glob.iglob('/Users/tcssig/Documents/Speech_text_files/*.*'):
text = open(filename).readlines()
text = normalization.normalize_corpus(text, only_text_chars=True, tokenize=True)
datalist.append(text)
datalist = [datalist[i][0] for i in range(len(datalist))]
from gensim import models,corpora
import spacy
dictionary = corpora.Dictionary(datalist)
num_topics = 3
Lda = models.LdaMulticore
#lda= Lda(doc_term_matrix, num_topics=num_topics,id2word = dictionary, passes=20,chunksize=2000,random_state=3)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]
dictionary = corpora.Dictionary(datalist)
import numpy as np
import pandas as pd
import spacy
import re
from tqdm._tqdm_notebook import tqdm_notebook,tnrange,tqdm
from collections import Counter,OrderedDict
from gensim import models,corpora
from gensim.summarization import summarize,keywords
import warnings
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
Lda = models.LdaMulticore
coherenceList_umass = []
coherenceList_cv = []
num_topics_list = np.arange(3,10)
for num_topics in tqdm(num_topics_list):
lda= Lda(doc_term_matrix, num_topics=num_topics,id2word = dictionary, passes=20,chunksize=4000,random_state=43)
cm = CoherenceModel(model=lda, corpus=doc_term_matrix, dictionary=dictionary, coherence='u_mass')
coherenceList_umass.append(cm.get_coherence())
cm_cv = CoherenceModel(model=lda, corpus=doc_term_matrix, texts=datalist, dictionary=dictionary, coherence='c_v')
coherenceList_cv.append(cm_cv.get_coherence())
vis = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)
pyLDAvis.save_html(vis,'pyLDAvis_%d.html' %num_topics)
plotData = pd.DataFrame({'Number of topics':num_topics_list,'CoherenceScore':coherenceList_umass})
f,ax = plt.subplots(figsize=(10,6))
sns.set_style("darkgrid")
sns.pointplot(x='Number of topics',y= 'CoherenceScore',data=plotData)
plt.axhline(y=-3.9)
plt.title('Topic coherence')
plt.savefig('Topic coherence plot.png')
#################################################################
#################################################################
lda_final= Lda(doc_term_matrix, num_topics=3,id2word = dictionary, passes=20,chunksize=4000,random_state=43)
lda_final.save('lda_final')
dictionary.save('dictionary')
corpora.MmCorpus.serialize('doc_term_matrix.mm', doc_term_matrix)
a = lda_final.show_topics(num_topics=3,formatted=False,num_words=10)
b = lda_final.top_topics(doc_term_matrix,dictionary=dictionary,topn=10)
topic2wordb = {}
topic2csb = {}
topic2worda = {}
topic2csa = {}
num_topics =lda_final.num_topics
cnt =1
for ws in b:
wset = set(w[1] for w in ws[0])
topic2wordb[cnt] = wset
topic2csb[cnt] = ws[1]
cnt +=1
for ws in a:
wset = set(w[0]for w in ws[1])
topic2worda[ws[0]+1] = wset
for i in range(1,num_topics+1):
for j in range(1,num_topics+1):
if topic2worda[i].intersection(topic2wordb[j])==topic2worda[i]:
topic2csa[i] = topic2csb[j]
print('the final data block')
finalData = pd.DataFrame([],columns=['Topic','words'])
finalData['Topic']=topic2worda.keys()
finalData['Topic'] = finalData['Topic'].apply(lambda x: 'Topic'+str(x))
finalData['words']=topic2worda.values()
finalData['cs'] = topic2csa.values()
finalData.sort_values(by='cs',ascending=False,inplace=True)
finalData.to_csv('CoherenceScore.csv')
print(finalData)
现在我有了经过训练的模型,但我想知道如何在用于训练的文档以及新的未见过的文档上使用该模型来分配主题
我正在使用下面的代码来执行此操作,但出现如下错误:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
text = normalization.normalize_corpus(unseen_document, only_text_chars=True, tokenize=True)
bow_vector = dictionary.doc2bow(text)
corpora.MmCorpus.serialize('x.bow_vector', bow_vector)
corpus = [dictionary.doc2bow(text)]
x = lda_final[corpus]
错误信息:
Topic words cs
2 Topic3 {senator, people, power, home, year, believe, ... -0.175486
1 Topic2 {friend, place, love, play, general, house, ye... -0.318839
0 Topic1 {money, doe, fucking, play, love, people, worl... -1.360688
Traceback (most recent call last):
File "LDA_test.py", line 141, in <module>
corpus = [dictionary.doc2bow(text)]
File "/Users/tcssig/anaconda/lib/python3.5/site-packages/gensim/corpora/dictionary.py", line 250, in doc2bow
counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
TypeError: coercing to str: need a bytes-like object, list found
在这一行
corpus = [dictionary.doc2bow(text)]
您正在创建 BOW 向量列表。您需要查找这些向量,而不是列表,例如
for v in corpus:
print(lda_final[v])
将显示文档的主题概率分布。
我使用 Gensim 创建了一个 LDA 模型,我首先在 3 到 10 的范围内从 num_topics 迭代,并基于 pyLDAvis 图,在最终的 lda 模型中选择 n = 3。
import glob
import sys
sys.path.append('/Users/tcssig/Documents/NLP_code_base/Doc_Similarity')
import normalization
from gensim.models.coherencemodel import CoherenceModel
datalist = []
for filename in glob.iglob('/Users/tcssig/Documents/Speech_text_files/*.*'):
text = open(filename).readlines()
text = normalization.normalize_corpus(text, only_text_chars=True, tokenize=True)
datalist.append(text)
datalist = [datalist[i][0] for i in range(len(datalist))]
from gensim import models,corpora
import spacy
dictionary = corpora.Dictionary(datalist)
num_topics = 3
Lda = models.LdaMulticore
#lda= Lda(doc_term_matrix, num_topics=num_topics,id2word = dictionary, passes=20,chunksize=2000,random_state=3)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]
dictionary = corpora.Dictionary(datalist)
import numpy as np
import pandas as pd
import spacy
import re
from tqdm._tqdm_notebook import tqdm_notebook,tnrange,tqdm
from collections import Counter,OrderedDict
from gensim import models,corpora
from gensim.summarization import summarize,keywords
import warnings
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
Lda = models.LdaMulticore
coherenceList_umass = []
coherenceList_cv = []
num_topics_list = np.arange(3,10)
for num_topics in tqdm(num_topics_list):
lda= Lda(doc_term_matrix, num_topics=num_topics,id2word = dictionary, passes=20,chunksize=4000,random_state=43)
cm = CoherenceModel(model=lda, corpus=doc_term_matrix, dictionary=dictionary, coherence='u_mass')
coherenceList_umass.append(cm.get_coherence())
cm_cv = CoherenceModel(model=lda, corpus=doc_term_matrix, texts=datalist, dictionary=dictionary, coherence='c_v')
coherenceList_cv.append(cm_cv.get_coherence())
vis = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)
pyLDAvis.save_html(vis,'pyLDAvis_%d.html' %num_topics)
plotData = pd.DataFrame({'Number of topics':num_topics_list,'CoherenceScore':coherenceList_umass})
f,ax = plt.subplots(figsize=(10,6))
sns.set_style("darkgrid")
sns.pointplot(x='Number of topics',y= 'CoherenceScore',data=plotData)
plt.axhline(y=-3.9)
plt.title('Topic coherence')
plt.savefig('Topic coherence plot.png')
#################################################################
#################################################################
lda_final= Lda(doc_term_matrix, num_topics=3,id2word = dictionary, passes=20,chunksize=4000,random_state=43)
lda_final.save('lda_final')
dictionary.save('dictionary')
corpora.MmCorpus.serialize('doc_term_matrix.mm', doc_term_matrix)
a = lda_final.show_topics(num_topics=3,formatted=False,num_words=10)
b = lda_final.top_topics(doc_term_matrix,dictionary=dictionary,topn=10)
topic2wordb = {}
topic2csb = {}
topic2worda = {}
topic2csa = {}
num_topics =lda_final.num_topics
cnt =1
for ws in b:
wset = set(w[1] for w in ws[0])
topic2wordb[cnt] = wset
topic2csb[cnt] = ws[1]
cnt +=1
for ws in a:
wset = set(w[0]for w in ws[1])
topic2worda[ws[0]+1] = wset
for i in range(1,num_topics+1):
for j in range(1,num_topics+1):
if topic2worda[i].intersection(topic2wordb[j])==topic2worda[i]:
topic2csa[i] = topic2csb[j]
print('the final data block')
finalData = pd.DataFrame([],columns=['Topic','words'])
finalData['Topic']=topic2worda.keys()
finalData['Topic'] = finalData['Topic'].apply(lambda x: 'Topic'+str(x))
finalData['words']=topic2worda.values()
finalData['cs'] = topic2csa.values()
finalData.sort_values(by='cs',ascending=False,inplace=True)
finalData.to_csv('CoherenceScore.csv')
print(finalData)
现在我有了经过训练的模型,但我想知道如何在用于训练的文档以及新的未见过的文档上使用该模型来分配主题
我正在使用下面的代码来执行此操作,但出现如下错误:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
text = normalization.normalize_corpus(unseen_document, only_text_chars=True, tokenize=True)
bow_vector = dictionary.doc2bow(text)
corpora.MmCorpus.serialize('x.bow_vector', bow_vector)
corpus = [dictionary.doc2bow(text)]
x = lda_final[corpus]
错误信息:
Topic words cs
2 Topic3 {senator, people, power, home, year, believe, ... -0.175486
1 Topic2 {friend, place, love, play, general, house, ye... -0.318839
0 Topic1 {money, doe, fucking, play, love, people, worl... -1.360688
Traceback (most recent call last):
File "LDA_test.py", line 141, in <module>
corpus = [dictionary.doc2bow(text)]
File "/Users/tcssig/anaconda/lib/python3.5/site-packages/gensim/corpora/dictionary.py", line 250, in doc2bow
counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
TypeError: coercing to str: need a bytes-like object, list found
在这一行
corpus = [dictionary.doc2bow(text)]
您正在创建 BOW 向量列表。您需要查找这些向量,而不是列表,例如
for v in corpus:
print(lda_final[v])
将显示文档的主题概率分布。