Python - 将 GridSearchCV 与 NLTK 结合使用
Python - Using GridSearchCV with NLTK
我不太确定如何将 scikit-learn 的 GridSearchCV 应用到我通过 NLTK 使用的随机森林分类器上。这里讨论了 GridSearchCV 的常规用法,但我的数据格式与标准的 X、y 拆分不同。以下是我的代码:
import nltk
import numpy as np
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Load every .txt file under the corpus root; each file's category is captured
# from its fileid by cat_pattern.
reader_train = CategorizedPlaintextCorpusReader(
    'C:/Users/User/Documents/Sentiment/machine_learning/amazon/amazon/',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='latin1',
)

# One (token_list, category) pair per file in the corpus.
documents_train = [
    (list(reader_train.words(fileid)), category)
    for category in reader_train.categories()
    for fileid in reader_train.fileids(category)
]

# Frequency distribution over every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in reader_train.words())

# Use the 3500 most frequent words as the feature vocabulary.
# FreqDist.keys() is not ordered by frequency in NLTK 3, so slicing it would
# keep an arbitrary 3500 words; most_common() ranks them explicitly.
word_features = [word for word, _count in all_words.most_common(3500)]
def find_features(documents):
    """Return a {word: bool} presence map for one tokenized document.

    ``documents`` is an iterable of tokens; every entry of the module-level
    ``word_features`` becomes a key whose value says whether that word
    occurs among the tokens.
    """
    # NOTE(review): word_features is built from lower-cased tokens while the
    # tokens here are compared as-is -- confirm the corpus is already
    # lower-case, otherwise capitalized tokens never match.
    present = set(documents)  # set gives O(1) membership tests below
    return {word: (word in present) for word in word_features}
# Turn every (tokens, category) document into a (feature_dict, category) pair.
featuresets_train = [
    (find_features(rev), category) for (rev, category) in documents_train
]
np.random.shuffle(featuresets_train)

# Train on rows 0-1599 and evaluate on rows 1600-1999 so the 400 test rows
# never overlap the training rows (testing on a slice of the training data
# would inflate the reported accuracy).
training_set = featuresets_train[:1600]
testing_set = featuresets_train[1600:2000]

# Wrap the scikit-learn estimator so it accepts NLTK-style featuresets.
RandFor = SklearnClassifier(RandomForestClassifier())
RandFor.train(training_set)
print("RandFor accuracy:", (nltk.classify.accuracy(RandFor, testing_set)) * 100)
此代码不是生成传统的 x 和 y 拆分,而是生成一个元组列表,其中每个元组采用以下格式:
({'i': True, 'am': False, 'conflicted': False ... 'about': False}, neg)
有没有办法将 GridSearchCV 应用于这种格式的数据?
要将 training_set 转换为 scikit-learn 可用的形式,只需执行以下代码:
from sklearn.feature_extraction import DictVectorizer
# DictVectorizer maps each feature dict to one row of a feature matrix.
vectorizer = DictVectorizer()

# Split the (feature_dict, label) pairs into two parallel tuples.
train_dicts, y_train = zip(*training_set)
test_dicts, y_test = zip(*testing_set)

# Learn the feature-to-column mapping from the training dicts only, then
# reuse that same mapping for the test dicts.
X_train = vectorizer.fit_transform(train_dicts)
X_test = vectorizer.transform(test_dicts)
之后你可以轻松调用
# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier().fit(X_train, y_train)

# score() gives the mean accuracy on the test data as a fraction; print it as a percentage.
accuracy = clf.score(X_test, y_test)
print("RandFor accuracy:", accuracy * 100)
我不太确定如何将 scikit-learn 的 GridSearchCV 应用到我通过 NLTK 使用的随机森林分类器上。这里讨论了 GridSearchCV 的常规用法,但我的数据格式与标准的 X、y 拆分不同。以下是我的代码:
import nltk
import numpy as np
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Load every .txt file under the corpus root; each file's category is captured
# from its fileid by cat_pattern.
reader_train = CategorizedPlaintextCorpusReader(
    'C:/Users/User/Documents/Sentiment/machine_learning/amazon/amazon/',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='latin1',
)

# One (token_list, category) pair per file in the corpus.
documents_train = [
    (list(reader_train.words(fileid)), category)
    for category in reader_train.categories()
    for fileid in reader_train.fileids(category)
]

# Frequency distribution over every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in reader_train.words())

# Use the 3500 most frequent words as the feature vocabulary.
# FreqDist.keys() is not ordered by frequency in NLTK 3, so slicing it would
# keep an arbitrary 3500 words; most_common() ranks them explicitly.
word_features = [word for word, _count in all_words.most_common(3500)]
def find_features(documents):
    """Return a {word: bool} presence map for one tokenized document.

    ``documents`` is an iterable of tokens; every entry of the module-level
    ``word_features`` becomes a key whose value says whether that word
    occurs among the tokens.
    """
    # NOTE(review): word_features is built from lower-cased tokens while the
    # tokens here are compared as-is -- confirm the corpus is already
    # lower-case, otherwise capitalized tokens never match.
    present = set(documents)  # set gives O(1) membership tests below
    return {word: (word in present) for word in word_features}
# Turn every (tokens, category) document into a (feature_dict, category) pair.
featuresets_train = [
    (find_features(rev), category) for (rev, category) in documents_train
]
np.random.shuffle(featuresets_train)

# Train on rows 0-1599 and evaluate on rows 1600-1999 so the 400 test rows
# never overlap the training rows (testing on a slice of the training data
# would inflate the reported accuracy).
training_set = featuresets_train[:1600]
testing_set = featuresets_train[1600:2000]

# Wrap the scikit-learn estimator so it accepts NLTK-style featuresets.
RandFor = SklearnClassifier(RandomForestClassifier())
RandFor.train(training_set)
print("RandFor accuracy:", (nltk.classify.accuracy(RandFor, testing_set)) * 100)
此代码不是生成传统的 x 和 y 拆分,而是生成一个元组列表,其中每个元组采用以下格式:
({'i': True, 'am': False, 'conflicted': False ... 'about': False}, neg)
有没有办法将 GridSearchCV 应用于这种格式的数据?
要将 training_set 转换为 scikit-learn 可用的形式,只需执行以下代码:
from sklearn.feature_extraction import DictVectorizer
# DictVectorizer maps each feature dict to one row of a feature matrix.
vectorizer = DictVectorizer()

# Split the (feature_dict, label) pairs into two parallel tuples.
train_dicts, y_train = zip(*training_set)
test_dicts, y_test = zip(*testing_set)

# Learn the feature-to-column mapping from the training dicts only, then
# reuse that same mapping for the test dicts.
X_train = vectorizer.fit_transform(train_dicts)
X_test = vectorizer.transform(test_dicts)
之后你可以轻松调用
# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier().fit(X_train, y_train)

# score() gives the mean accuracy on the test data as a fraction; print it as a percentage.
accuracy = clf.score(X_test, y_test)
print("RandFor accuracy:", accuracy * 100)