Extract Topic Scores for Documents with LDA (Gensim, Python)
I am trying to extract topic scores for the documents in my dataset after running an LDA model. Specifically, I followed most of the code from here: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
I have finished the topic model and got the results I wanted, but the code provided only gives the single most dominant topic for each document. Is there a simple way to modify the following code so that it gives me the scores of the 5 most dominant topics instead?
## dominant topic for each document
def format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)
Right, this is a crude example because you didn't provide any data to reproduce with, but we can do it using gensim's test corpus, texts and dictionary:
from gensim.test.utils import common_texts, common_corpus, common_dictionary
from gensim.models import LdaModel
# train a quick lda model using the common _corpus, _dictionary and _texts from gensim
optimal_model = LdaModel(common_corpus, id2word=common_dictionary, num_topics=10)
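Before rewriting the function, it helps to see what the model returns for a single document, since that is exactly what the loop below sorts and slices. As a rough sketch (the probabilities will vary with random initialisation), indexing the model with a bag-of-words vector yields a list of (topic_id, probability) pairs:

# Per-document topic distribution for the first test document: a list of
# (topic_id, probability) tuples; topics below the model's minimum_probability
# threshold are dropped from this view.
print(optimal_model[common_corpus[0]])

# get_document_topics is equivalent, but minimum_probability=0 forces every
# topic to appear, which is convenient when a fixed number of scores is needed.
print(optimal_model.get_document_topics(common_corpus[0], minimum_probability=0))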
Then we can rewrite the function slightly so that it becomes:
import pandas as pd

## dominant topics for each document
def format_topics_sentences(ldamodel=optimal_model,
                            corpus=common_corpus,
                            texts=common_texts,
                            n=1):
    """
    A function for extracting a number of dominant topics for a given document
    using an existing LDA model
    """
    # Collect one output row per (document, topic) pair
    sent_topics = []
    # Get the main topics in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the dominant topic, percentage contribution and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            # we use range here to iterate over the n parameter
            if j in range(n):  # => dominant topic(s)
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                # and also use the i value here to keep the document label
                sent_topics.append([int(i), int(topic_num), round(prop_topic, 4), topic_keywords])
            else:
                break
    # Build the DataFrame in one go (DataFrame.append was removed in pandas 2.0)
    sent_topics_df = pd.DataFrame(
        sent_topics,
        columns=["Document", "Dominant_Topic", "Perc_Contribution", "Topic_Keywords"],
    )
    # Add the original text to the end of the output
    text_col = [texts[int(i)] for i in sent_topics_df.Document.tolist()]
    contents = pd.Series(text_col, name='original_texts')
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
We can then use the function like this:
format_topics_sentences(ldamodel=optimal_model, corpus=common_corpus, texts=common_texts, n=2)
where the n parameter specifies the number of dominant topics to extract.
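If you only need the raw scores rather than the keyword strings, a shorter variant works too. The following is a minimal sketch that addresses the original "top 5 scores per document" question; the helper top_n_topic_scores is hypothetical (not part of gensim) and relies on get_document_topics with minimum_probability=0 so that every topic is returned before slicing:

# Hypothetical helper: for every document in the corpus, return the
# (topic_id, probability) pairs of its top_n topics, highest probability first.
def top_n_topic_scores(ldamodel, corpus, top_n=5):
    scores = []
    for bow in corpus:
        # minimum_probability=0 makes gensim return every topic, so the list
        # always has num_topics entries before we slice it.
        doc_topics = ldamodel.get_document_topics(bow, minimum_probability=0)
        doc_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
        scores.append(doc_topics[:top_n])
    return scores

top_scores = top_n_topic_scores(optimal_model, common_corpus, top_n=5)
print(top_scores[0])  # e.g. [(3, 0.41), (7, 0.22), ...] for the first document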