gensim.models.Phrases 的问题
Issue with gensim.models.Phrases
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Phrases
class SentenceClass(object):
    """Stream stemmed, stop-word-filtered sentences from the files in a directory.

    ``__iter__`` is a generator function, so every ``iter()`` call on an
    instance lists the directory again and starts a fresh pass over the files.
    """

    def __init__(self, dirname):
        # Directory whose entries are opened and read as text documents.
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of stemmed tokens per sentence found in each file."""
        for entry in os.listdir(self.dirname):
            path = os.path.join(self.dirname, entry)
            with open(path, 'r') as handle:
                # Join the document into a single line before sentence splitting.
                text = handle.read().replace('\n', ' ')
                for sentence in tokenize.sent_tokenize(text.lower()):
                    # Replace every character that is not an ASCII letter
                    # with a space before word tokenisation.
                    letters_only = re.sub("[^A-Za-z]", " ", sentence)
                    tokens = []
                    for token in word_tokenize(letters_only):
                        # The stop-word test is applied to the unstemmed token.
                        if token in stopwords:
                            continue
                        tokens.append(Stemming.stem(token))
                    yield tokens
现在的两种方法:
1)
# Approach 1: train Word2Vec directly on the SentenceClass stream.
# NOTE(review): `size` is the keyword used by older gensim releases; newer
# releases presumably expect `vector_size` -- confirm against the installed version.
model = Word2Vec(SentenceClass(data_dir_path), size=100, window=5, min_count=1, workers=4)
上面的运行非常好,没有警告
2)
# Approach 2: fit a Phrases model on the corpus, then train Word2Vec on the
# phrase-transformed sentence stream.
bigram_transformer = Phrases(SentenceClass(data_dir_path), min_count=1)
# NOTE(review): the indexing expression is evaluated once and its result is
# handed to Word2Vec; if that result can only be consumed a single time, any
# later pass over it is empty (this matches the warning quoted in this file)
# -- confirm for the gensim version in use.
model = Word2Vec(bigram_transformer[SentenceClass(data_dir_path)], size=100, window=5, min_count=1, workers=4)
产生警告:
WARNING:gensim.models.word2vec:train() called with an empty iterator (if not intended, be sure to provide a corpus that offers restartable iteration = an iterable).
WARNING:gensim.models.word2vec:supplied example count (0) did not equal expected count (30)
现在我明白了生成器(只能遍历一次)和可迭代对象(可以重复遍历)之间的区别,我认为自己传递的是可迭代对象,这是通过多次执行以下打印命令来验证的:
# Materialise each stream twice and print it.
# NOTE: every line constructs a brand-new SentenceClass / transformed stream,
# so these prints show that a *fresh* object can be read once; they do not
# show that one and the same object can be iterated a second time.
print(list(SentenceClass(data_dir_path)))
print(list(SentenceClass(data_dir_path)))
print(list(bigram_transformer[SentenceClass(data_dir_path)]))
print(list(bigram_transformer[SentenceClass(data_dir_path)]))
它打印的东西很好,但我仍然不确定为什么第二种情况会出现 "empty iterator" 警告,我在这里遗漏了什么吗?
我意识到 Phrases 和 phrases.Phraser 对语料进行转换后得到的结果都只是一个生成器(只能遍历一次),因此需要用下面的 class 把它包装成可以重复迭代的对象:
from gensim.models import Word2Vec, Phrases, phrases
class PhraseItertor(object):
    """Restartable corpus that applies a phrase model to a sentence stream.

    The transformation ``my_phraser[data]`` is re-evaluated on every
    ``__iter__`` call, so consumers that iterate the corpus several times
    get a fresh pass each time, provided ``data`` itself can be re-iterated.

    Args:
        my_phraser: object supporting ``my_phraser[data]`` that returns the
            transformed sentences as an iterator or any other iterable.
        data: the sentence stream passed to ``my_phraser`` on each pass.
    """

    def __init__(self, my_phraser, data):
        self.my_phraser, self.data = my_phraser, data

    def __iter__(self):
        """Return a fresh iterator over the transformed sentences."""
        # __iter__ must return an iterator. iter() leaves a generator/iterator
        # untouched and converts a plain iterable (list, corpus wrapper) into
        # one, avoiding "TypeError: iter() returned non-iterator".
        return iter(self.my_phraser[self.data])
# Build the sentence stream once; the same object is reused for fitting the
# phrase model and for producing the training corpus.
my_sentences = SentenceClass(data_dir_path)
# Fit the phrase model on the sentences.
my_phrases = Phrases(my_sentences, min_count=1)
# Wrap the fitted Phrases model in a Phraser used for the transformation.
bigram = phrases.Phraser(my_phrases)
# PhraseItertor evaluates bigram[my_sentences] on every __iter__ call, so
# Word2Vec receives a new transformed stream each time it iterates the corpus.
my_corpus = PhraseItertor(bigram,my_sentences)
model = Word2Vec(my_corpus, size=100, window=5, min_count=1, workers=4)
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec, Phrases
class SentenceClass(object):
    """Stream stemmed, stop-word-filtered sentences from the files in a directory.

    ``__iter__`` is a generator function, so every ``iter()`` call on an
    instance lists the directory again and starts a fresh pass over the files.
    """

    def __init__(self, dirname):
        # Directory whose entries are opened and read as text documents.
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of stemmed tokens per sentence found in each file."""
        for entry in os.listdir(self.dirname):
            path = os.path.join(self.dirname, entry)
            with open(path, 'r') as handle:
                # Join the document into a single line before sentence splitting.
                text = handle.read().replace('\n', ' ')
                for sentence in tokenize.sent_tokenize(text.lower()):
                    # Replace every character that is not an ASCII letter
                    # with a space before word tokenisation.
                    letters_only = re.sub("[^A-Za-z]", " ", sentence)
                    tokens = []
                    for token in word_tokenize(letters_only):
                        # The stop-word test is applied to the unstemmed token.
                        if token in stopwords:
                            continue
                        tokens.append(Stemming.stem(token))
                    yield tokens
现在的两种方法:
1)
# Approach 1: train Word2Vec directly on the SentenceClass stream.
# NOTE(review): `size` is the keyword used by older gensim releases; newer
# releases presumably expect `vector_size` -- confirm against the installed version.
model = Word2Vec(SentenceClass(data_dir_path), size=100, window=5, min_count=1, workers=4)
上面的运行非常好,没有警告
2)
# Approach 2: fit a Phrases model on the corpus, then train Word2Vec on the
# phrase-transformed sentence stream.
bigram_transformer = Phrases(SentenceClass(data_dir_path), min_count=1)
# NOTE(review): the indexing expression is evaluated once and its result is
# handed to Word2Vec; if that result can only be consumed a single time, any
# later pass over it is empty (this matches the warning quoted in this file)
# -- confirm for the gensim version in use.
model = Word2Vec(bigram_transformer[SentenceClass(data_dir_path)], size=100, window=5, min_count=1, workers=4)
产生警告:
WARNING:gensim.models.word2vec:train() called with an empty iterator (if not intended, be sure to provide a corpus that offers restartable iteration = an iterable).
WARNING:gensim.models.word2vec:supplied example count (0) did not equal expected count (30)
现在我明白了生成器(只能遍历一次)和可迭代对象(可以重复遍历)之间的区别,我认为自己传递的是可迭代对象,这是通过多次执行以下打印命令来验证的:
# Materialise each stream twice and print it.
# NOTE: every line constructs a brand-new SentenceClass / transformed stream,
# so these prints show that a *fresh* object can be read once; they do not
# show that one and the same object can be iterated a second time.
print(list(SentenceClass(data_dir_path)))
print(list(SentenceClass(data_dir_path)))
print(list(bigram_transformer[SentenceClass(data_dir_path)]))
print(list(bigram_transformer[SentenceClass(data_dir_path)]))
它打印的东西很好,但我仍然不确定为什么第二种情况会出现 "empty iterator" 警告,我在这里遗漏了什么吗?
我意识到 Phrases 和 phrases.Phraser 对语料进行转换后得到的结果都只是一个生成器(只能遍历一次),因此需要用下面的 class 把它包装成可以重复迭代的对象:
from gensim.models import Word2Vec, Phrases, phrases
class PhraseItertor(object):
    """Restartable corpus that applies a phrase model to a sentence stream.

    The transformation ``my_phraser[data]`` is re-evaluated on every
    ``__iter__`` call, so consumers that iterate the corpus several times
    get a fresh pass each time, provided ``data`` itself can be re-iterated.

    Args:
        my_phraser: object supporting ``my_phraser[data]`` that returns the
            transformed sentences as an iterator or any other iterable.
        data: the sentence stream passed to ``my_phraser`` on each pass.
    """

    def __init__(self, my_phraser, data):
        self.my_phraser, self.data = my_phraser, data

    def __iter__(self):
        """Return a fresh iterator over the transformed sentences."""
        # __iter__ must return an iterator. iter() leaves a generator/iterator
        # untouched and converts a plain iterable (list, corpus wrapper) into
        # one, avoiding "TypeError: iter() returned non-iterator".
        return iter(self.my_phraser[self.data])
# Build the sentence stream once; the same object is reused for fitting the
# phrase model and for producing the training corpus.
my_sentences = SentenceClass(data_dir_path)
# Fit the phrase model on the sentences.
my_phrases = Phrases(my_sentences, min_count=1)
# Wrap the fitted Phrases model in a Phraser used for the transformation.
bigram = phrases.Phraser(my_phrases)
# PhraseItertor evaluates bigram[my_sentences] on every __iter__ call, so
# Word2Vec receives a new transformed stream each time it iterates the corpus.
my_corpus = PhraseItertor(bigram,my_sentences)
model = Word2Vec(my_corpus, size=100, window=5, min_count=1, workers=4)