无法使用 LDA 训练模型对主题进行分类

Unable to classify topics using LDA trained model

我使用 Gensim 创建了一个 LDA 模型,我首先在 3 到 10 的范围内从 num_topics 迭代,并基于 pyLDAvis 图,在最终的 lda 模型中选择 n = 3。

import glob
import sys
sys.path.append('/Users/tcssig/Documents/NLP_code_base/Doc_Similarity')
import normalization
from gensim.models.coherencemodel import CoherenceModel
datalist = []

for filename in glob.iglob('/Users/tcssig/Documents/Speech_text_files/*.*'):
    text = open(filename).readlines()
    text = normalization.normalize_corpus(text, only_text_chars=True, tokenize=True)
    datalist.append(text)

datalist = [datalist[i][0] for i in range(len(datalist))]

from gensim import models,corpora
import spacy
dictionary = corpora.Dictionary(datalist)
num_topics = 3
Lda = models.LdaMulticore

#lda= Lda(doc_term_matrix, num_topics=num_topics,id2word = dictionary, passes=20,chunksize=2000,random_state=3)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]

dictionary = corpora.Dictionary(datalist)
import numpy as np 
import pandas as pd
import spacy
import re
from tqdm._tqdm_notebook import tqdm_notebook,tnrange,tqdm
from collections import Counter,OrderedDict
from gensim import models,corpora
from gensim.summarization import summarize,keywords
import warnings
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns

Lda = models.LdaMulticore
coherenceList_umass = []
coherenceList_cv = []
num_topics_list = np.arange(3,10)
for num_topics in tqdm(num_topics_list):
    lda= Lda(doc_term_matrix, num_topics=num_topics,id2word = dictionary, passes=20,chunksize=4000,random_state=43)
    cm = CoherenceModel(model=lda, corpus=doc_term_matrix, dictionary=dictionary, coherence='u_mass')
    coherenceList_umass.append(cm.get_coherence())
    cm_cv = CoherenceModel(model=lda, corpus=doc_term_matrix, texts=datalist, dictionary=dictionary, coherence='c_v')
    coherenceList_cv.append(cm_cv.get_coherence())
    vis = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)
    pyLDAvis.save_html(vis,'pyLDAvis_%d.html' %num_topics)


plotData = pd.DataFrame({'Number of topics':num_topics_list,'CoherenceScore':coherenceList_umass})
f,ax = plt.subplots(figsize=(10,6))
sns.set_style("darkgrid")
sns.pointplot(x='Number of topics',y= 'CoherenceScore',data=plotData)
plt.axhline(y=-3.9)
plt.title('Topic coherence')
plt.savefig('Topic coherence plot.png')

#################################################################
#################################################################

lda_final= Lda(doc_term_matrix, num_topics=3,id2word = dictionary, passes=20,chunksize=4000,random_state=43)

lda_final.save('lda_final')

dictionary.save('dictionary')

corpora.MmCorpus.serialize('doc_term_matrix.mm', doc_term_matrix)


a = lda_final.show_topics(num_topics=3,formatted=False,num_words=10)
b = lda_final.top_topics(doc_term_matrix,dictionary=dictionary,topn=10)


topic2wordb = {}
topic2csb = {}
topic2worda = {}
topic2csa = {}
num_topics =lda_final.num_topics
cnt =1

for ws in b:
    wset = set(w[1] for w in ws[0])
    topic2wordb[cnt] = wset
    topic2csb[cnt] = ws[1]
    cnt +=1

for ws in a:
    wset = set(w[0]for w in ws[1])
    topic2worda[ws[0]+1] = wset

for i in range(1,num_topics+1):
    for j in range(1,num_topics+1):  
        if topic2worda[i].intersection(topic2wordb[j])==topic2worda[i]:
            topic2csa[i] = topic2csb[j]

print('the final data block')
finalData = pd.DataFrame([],columns=['Topic','words'])
finalData['Topic']=topic2worda.keys()
finalData['Topic'] = finalData['Topic'].apply(lambda x: 'Topic'+str(x))
finalData['words']=topic2worda.values()
finalData['cs'] = topic2csa.values()
finalData.sort_values(by='cs',ascending=False,inplace=True)
finalData.to_csv('CoherenceScore.csv')
print(finalData)

现在我有了经过训练的模型,但我想知道如何在用于训练的文档以及新的未见过的文档上使用该模型来分配主题

我正在使用下面的代码来执行此操作,但出现如下错误:

unseen_document = 'How a Pentagon deal became an identity crisis for Google'

text = normalization.normalize_corpus(unseen_document, only_text_chars=True, tokenize=True)

bow_vector = dictionary.doc2bow(text)

corpora.MmCorpus.serialize('x.bow_vector', bow_vector)

corpus = [dictionary.doc2bow(text)]

x = lda_final[corpus]

错误信息:

    Topic                                              words        cs
2  Topic3  {senator, people, power, home, year, believe, ... -0.175486
1  Topic2  {friend, place, love, play, general, house, ye... -0.318839
0  Topic1  {money, doe, fucking, play, love, people, worl... -1.360688

Traceback (most recent call last):
  File "LDA_test.py", line 141, in <module>
    corpus = [dictionary.doc2bow(text)]
  File "/Users/tcssig/anaconda/lib/python3.5/site-packages/gensim/corpora/dictionary.py", line 250, in doc2bow
    counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
TypeError: coercing to str: need a bytes-like object, list found

在这一行

corpus = [dictionary.doc2bow(text)]

您正在创建 BOW 向量列表。您需要查找这些向量,而不是列表,例如

for v in corpus:
    print(lda_final[v])

将显示文档的主题概率分布。

gensim docs