Python 中的 LDA,我得到的是字符而不是主题
LDA in Python, I get characters not topics
我对在Python中执行LDA有点困惑。
我有一个文档文件,我想 运行 LDA 并获取主题。
import docx
import nltk
import gensim
from gensim.models import hdpmodel, ldamodel
from gensim import corpora
def getText(filename):
doc = docx.Document(filename)
fullText = []
for para in doc.paragraphs:
fullText.append(para.text)
return '\n'.join(fullText)
fullText=getText('ElizabethII.docx')
#create lda object
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in fullText]
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word)
== 1)
texts = [[word for word in text if word not in tokens_once]
for text in texts]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=5, passes=15)
topics = lda.show_topics(num_words=4)
for topic in topics:
print(topic)
corpus_lda = lda[corpus]
print(lda.show_topics())
结果我得到了这个:
(0, '0.723*"r" + 0.211*"f" + 0.025*"5" + 0.013*"-"')
(1, '0.410*"e" + 0.258*"t" + 0.206*"h" + 0.068*"m"')
(2, '0.319*"n" + 0.162*"l" + 0.113*"c" + 0.101*"u"')
(3, '0.503*"i" + 0.324*"d" + 0.113*"b" + 0.041*"9"')
(4, '0.355*"o" + 0.307*"s" + 0.106*"w" + 0.052*"v"')
这让我很困惑。为什么我得到的是字符而不是主题?是因为我的 docx 文件(包含 1900 个单词?)还是代码错误?或者我应该为句子(段落)提供主题? (怎么样?)
使用word_tokenize如下所示准备由单词组成的标记:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import ldamodel
from gensim import corpora
stop_words = set(stopwords.words('english'))
Doc1 = 'Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from data in various forms, both structured and unstructured,[1][2] similar to data mining. '
Doc2 = 'Biology is the natural science that studies life and living organisms, including their physical structure, chemical processes, molecular interactions, physiological mechanisms, development and evolution.'
docs = [Doc1, Doc2]
tokens = [word_tokenize(doc.lower()) for doc in docs]
tokens = [[t for t in token_list if t not in stop_words] for token_list in tokens]
word_l = WordNetLemmatizer()
tokens = [[word_l.lemmatize(t) for t in token_list if t.isalpha()] for token_list in tokens]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(t) for t in tokens]
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=5, passes=15)
topics = lda.show_topics(num_words=4)
for topic in topics:
print(topic)
(0, '0.029*"science" + 0.029*"process" + 0.029*"living" + 0.029*"organism"')
(1, '0.114*"data" + 0.043*"scientific" + 0.043*"similar" + 0.043*"mining"')
(2, '0.029*"science" + 0.029*"process" + 0.029*"living" + 0.029*"biology"')
(3, '0.029*"process" + 0.029*"science" + 0.029*"living" + 0.029*"biology"')
(4, '0.048*"process" + 0.048*"science" + 0.048*"evolution" + 0.048*"physical"')
我对在Python中执行LDA有点困惑。 我有一个文档文件,我想 运行 LDA 并获取主题。
import docx
import nltk
import gensim
from gensim.models import hdpmodel, ldamodel
from gensim import corpora
def getText(filename):
doc = docx.Document(filename)
fullText = []
for para in doc.paragraphs:
fullText.append(para.text)
return '\n'.join(fullText)
fullText=getText('ElizabethII.docx')
#create lda object
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in fullText]
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word)
== 1)
texts = [[word for word in text if word not in tokens_once]
for text in texts]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=5, passes=15)
topics = lda.show_topics(num_words=4)
for topic in topics:
print(topic)
corpus_lda = lda[corpus]
print(lda.show_topics())
结果我得到了这个:
(0, '0.723*"r" + 0.211*"f" + 0.025*"5" + 0.013*"-"')
(1, '0.410*"e" + 0.258*"t" + 0.206*"h" + 0.068*"m"')
(2, '0.319*"n" + 0.162*"l" + 0.113*"c" + 0.101*"u"')
(3, '0.503*"i" + 0.324*"d" + 0.113*"b" + 0.041*"9"')
(4, '0.355*"o" + 0.307*"s" + 0.106*"w" + 0.052*"v"')
这让我很困惑。为什么我得到的是字符而不是主题?是因为我的 docx 文件(包含 1900 个单词?)还是代码错误?或者我应该为句子(段落)提供主题? (怎么样?)
使用word_tokenize如下所示准备由单词组成的标记:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import ldamodel
from gensim import corpora
stop_words = set(stopwords.words('english'))
Doc1 = 'Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from data in various forms, both structured and unstructured,[1][2] similar to data mining. '
Doc2 = 'Biology is the natural science that studies life and living organisms, including their physical structure, chemical processes, molecular interactions, physiological mechanisms, development and evolution.'
docs = [Doc1, Doc2]
tokens = [word_tokenize(doc.lower()) for doc in docs]
tokens = [[t for t in token_list if t not in stop_words] for token_list in tokens]
word_l = WordNetLemmatizer()
tokens = [[word_l.lemmatize(t) for t in token_list if t.isalpha()] for token_list in tokens]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(t) for t in tokens]
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=5, passes=15)
topics = lda.show_topics(num_words=4)
for topic in topics:
print(topic)
(0, '0.029*"science" + 0.029*"process" + 0.029*"living" + 0.029*"organism"')
(1, '0.114*"data" + 0.043*"scientific" + 0.043*"similar" + 0.043*"mining"')
(2, '0.029*"science" + 0.029*"process" + 0.029*"living" + 0.029*"biology"')
(3, '0.029*"process" + 0.029*"science" + 0.029*"living" + 0.029*"biology"')
(4, '0.048*"process" + 0.048*"science" + 0.048*"evolution" + 0.048*"physical"')