NLTK, Naive Bayes: why are some features NONE?
I am trying to implement Naive Bayes with NLTK.
When I print out the most informative features, some of them are assigned "NONE". Why is that?
I am using a bag-of-words model: when I print my features, every single one of them is assigned the value True.
Where does the None come from?
I have read
The feature value 'None' is reserved for unseen feature values;
此处:http://www.nltk.org/_modules/nltk/classify/naivebayes.html
What does that mean?
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import nltk.data
from nltk.corpus import stopwords
import collections
from nltk.classify.util import accuracy
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import nltk.metrics

def bag_of_words(words):
    # Every word becomes a feature with the value True.
    return dict([(word, True) for word in words])

def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words) - set(badwords))

def bag_of_words_without_stopwords(words):
    badwords = stopwords.words("german")
    return bag_of_words_not_in_set(words, badwords)

def label_feats_from_corpus(corp, feature_detector=bag_of_words_without_stopwords):
    # One featureset per file, grouped by category label.
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats

def split_label_feats(lfeats, split=0.75):
    # 75/25 train/test split within each label.
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats

reader = CategorizedPlaintextCorpusReader('D:/corpus/', r'.*\.txt', cat_pattern=r'(\w+)/*')
all_words = nltk.FreqDist(w.lower() for w in reader.words())

def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

bigrams = bigram_word_feats(reader.words())  # note: computed but not used below

lfeats = label_feats_from_corpus(reader)
train_feats, test_feats = split_label_feats(lfeats, split=0.75)
len(train_feats)

nb_classifier = NaiveBayesClassifier.train(train_feats)

print("------------------------")
acc = accuracy(nb_classifier, test_feats)
print(acc)
print("------------------------")

feats = nb_classifier.most_informative_features(n=25)
for feat in feats:
    print(feat)  # some are None

print("------------------------")
nb_classifier.show_most_informative_features(n=25)  # some are None
I think the full documentation of the NaiveBayesClassifier class explains it:
If the classifier encounters an input with a feature that has never been seen with any label, then rather than assigning a probability of 0 to all labels, it will ignore that feature.
The feature value 'None' is reserved for unseen feature values; you generally should not use 'None' as a feature value for one of your own features.
If your data contains a feature that was never associated with any label, that feature's value will be None. Suppose you train a classifier with features W and X, and then classify something that has features W, X, and Z. The value None will be used for feature Z, because Z was never seen in training.
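Here is a minimal sketch of that behaviour; the toy featuresets are mine, purely for illustration:

from nltk.classify import NaiveBayesClassifier

# Two training documents; the word 'great' occurs only in the first one.
train = [({'great': True, 'movie': True}, 'pos'),
         ({'movie': True, 'awful': True}, 'neg')]
clf = NaiveBayesClassifier.train(train)

# During training, NLTK fills in the implicit value None for any feature
# missing from a featureset, so 'great' is counted as None for the 'neg'
# document. Such (feature, None) pairs can then show up here:
clf.show_most_informative_features()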
A further note:
Seeing None does not surprise me, because linguistic data is sparse. In a corpus of movie reviews, there will be words that appear in only 1 or 2 documents. For example, an actor's name, or a word from the title, may appear in only one review.
It is common to remove both frequent (stop) words and infrequent words from a corpus before analysis. For their topic model of the journal Science, Blei and Lafferty (2007) write: "The total vocabulary size in this collection is 375,144 terms. We trim the 356,195 terms that occurred fewer than 70 times as well as 296 stop words."
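As a sketch of that kind of trimming, here is one way to filter the vocabulary before building the featuresets, reusing the all_words frequency distribution from the question. The min_count cutoff of 5 and the helper name bag_of_words_in_vocab are my own choices, not from the question or the paper:

from nltk.corpus import stopwords

min_count = 5  # arbitrary threshold; Blei and Lafferty used 70 on a far larger corpus
badwords = set(stopwords.words("german"))
vocab = set(w for w, c in all_words.items() if c >= min_count) - badwords

def bag_of_words_in_vocab(words):
    # Keep only words that pass both the frequency and the stop-word filter.
    return dict((word, True) for word in words if word.lower() in vocab)

lfeats = label_feats_from_corpus(reader, feature_detector=bag_of_words_in_vocab)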