如何改进 NB 分类器的特征选择?

How to improve my feature selection for a NB classifier?

我读到改进特征选择可以减少分类器的训练时间并提高准确性,但我不确定该如何减少特征的数量。我是否应该先统计每个特征出现的次数,然后只保留(例如)出现次数最多的前 3000 个?

这是我的代码:

def save_object(obj, filename):
    """Pickle ``obj`` to ``filename`` using the highest available protocol.

    Args:
        obj: Any picklable object.
        filename: Path of the file to create or overwrite.

    Prints ``saved`` once the data has been written.
    """
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    # Report success only after the ``with`` block has flushed and closed the
    # file.  The parenthesised form prints the same text on Python 2 and 3.
    print("saved")


# Read both labelled corpora.  ``csv.reader`` yields one list of columns per
# row; the comprehension below unpacks every row as (text, sentiment).
with open('neg5000.csv', 'rb') as neg_file:
    neg_tweets = list(csv.reader(neg_file))
    print("list ready")

with open('pos5000.csv', 'rb') as pos_file:
    pos_tweets = list(csv.reader(pos_file))
    print("list ready")

# Split each text on whitespace, lower-case the tokens and drop tokens shorter
# than three characters.  Positive rows come first, then negative rows.
tweets = [
    ([token.lower() for token in text.split() if len(token) >= 3], label)
    for (text, label) in pos_tweets + neg_tweets
]




def get_words_in_tweets(tweets):
    """Flatten the token lists of all tweets into one list.

    ``tweets`` is an iterable of ``(words, sentiment)`` pairs.  The sentiment
    is ignored; the words are returned in their original order with
    duplicates kept.
    """
    return [token for (tokens, _label) in tweets for token in tokens]

def get_word_features(wordlist, max_features=3000):
    """Return the most frequent distinct words in ``wordlist``.

    Args:
        wordlist: Iterable of word tokens, duplicates included.
        max_features: Maximum number of words to keep (default 3000).

    Returns:
        A list of at most ``max_features`` words, ordered from most to least
        frequent.
    """
    from collections import Counter

    # Rank explicitly by count: slicing the keys of a frequency table only
    # yields the top words if the table happens to iterate in frequency
    # order, which a Counter-style mapping does not guarantee.
    counts = Counter(wordlist)
    return [word for word, _count in counts.most_common(max_features)]

def extract_features(document):
    """Map a tokenised document to boolean bag-of-words features.

    For every word in the module-level ``word_features`` list the result
    holds a ``'contains(<word>)'`` key whose value tells whether that word
    occurs in ``document``.
    """
    present = set(document)  # set membership keeps each lookup cheap
    return {'contains(%s)' % word: (word in present) for word in word_features}
#def extract_features(words):
 #   return dict([(word, True) for word in words])


# Build the vocabulary from every token in the corpus.  ``extract_features``
# reads this module-level ``word_features`` name, so it must be assigned
# before any features are extracted.
word_features = get_word_features(get_words_in_tweets(tweets))
# NOTE(review): nltk's apply_features is documented as returning a lazy
# sequence, so the features are presumably computed on demand during training
# rather than on this line -- confirm against the installed NLTK version.
training_set = nltk.classify.apply_features(extract_features, tweets)


# Pickle the vocabulary to 'wordf.save', then log a timestamp before training.
save_object(word_features, 'wordf.save')
print 'features done'
print datetime.datetime.now()
# Train the Naive Bayes classifier; the timestamps printed before and after
# this call bracket the training step.
classifier = nltk.NaiveBayesClassifier.train(training_set)
print 'training done'
print datetime.datetime.now()

# Pickle the trained classifier to 'classifier.save'.
save_object(classifier, 'classifier.save')

# Smoke test: classify one whitespace-tokenised sentence.  Unlike the training
# data, these tokens are not lower-cased or length-filtered.
tweet = 'I love this car'
print classifier.classify(extract_features(tweet.split()))

针对监督分类问题(朴素贝叶斯解决的正是这类问题),有多种特征选择方法。我建议查阅 scikit-learn 用户手册,并尝试其中列出的各种方法,因为具体选用哪种方法取决于您的数据。

最简单的方法是改用 scikit-learn 的朴素贝叶斯实现,然后使用 Pipeline 将特征选择和分类器训练串联起来。代码示例请参见这篇教程。

这是使用 scikit-learn 的 SelectPercentile 进行特征选择的代码版本:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def read_input(path):
    """Return the tweet texts stored in the CSV file at ``path``.

    Each non-blank line is expected to look like ``<tweet text>,<label>``.
    The line is split on its *last* comma, so commas inside the tweet text
    are preserved; the label is discarded.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A list with one tweet text per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    # ``io.open`` accepts ``encoding`` on both Python 2 and 3; decoding
    # explicitly as UTF-8 avoids depending on the platform's default locale.
    import io

    with io.open(path, encoding="utf-8") as handle:
        # Skip blank lines (e.g. an empty line at the end of the file), which
        # would otherwise fail to unpack into (text, label).
        lines = (line.rsplit(",", 1) for line in handle if line.strip())
        return [text for text, label in lines]


# Each line of ``neg5000.csv`` / ``pos5000.csv`` is assumed to hold one
# UTF-8-encoded tweet followed by a trailing comma-separated column, which
# ``read_input`` drops.
neg_tweets = read_input("neg5000.csv")
pos_tweets = read_input("pos5000.csv")

# Documents: negatives first, then positives.
X = np.append(neg_tweets, pos_tweets)

# Targets in the same order: -1 per negative tweet, 1 per positive tweet.
neg_labels = np.full(len(neg_tweets), -1, dtype=int)
pos_labels = np.full(len(pos_tweets), 1, dtype=int)
y = np.append(neg_labels, pos_labels)


# Token counts -> percentile-based feature selection -> multinomial NB.
steps = [
    ("vectorizer", CountVectorizer()),
    ("selector", SelectPercentile(percentile=20)),
    ("nb", MultinomialNB()),
]
p = Pipeline(steps)

p.fit(X, y)
prediction = p.predict(["I love this car"])
print(prediction)