使用 NLTK 的 SklearnClassifier 和 ClassifierBasedPOSTagger 构建自己的基于分类器的词性标注器

Building own classifier based POS tagger using NLTK's SklearnClassifier and ClassifierBasedPOSTagger

我正在尝试使用 SklearnClassifier 和 ClassifierBasedPOSTagger 构建我自己的基于分类器的词性标注器。我试过的代码如下。

# Snippet from the question: it trains a classifier-based POS tagger on the
# treebank sample and fails with the TypeError shown in the traceback below.
# NOTE(review): `nltk` itself is never imported in this snippet even though
# `nltk.download` and `nltk.word_tokenize` are called — presumably it was
# imported in an earlier notebook cell; confirm before running standalone.
from nltk.corpus import treebank
nltk.download('treebank')

# The first 3500 tagged sentences are used for training; the rest is held out
# for evaluation.
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Wrap scikit-learn's BernoulliNB in NLTK's classifier adapter and hand its
# `train` method to the tagger as the classifier builder.
# This constructor call is the statement at which the TypeError is raised.
bnb = SklearnClassifier(BernoulliNB())
bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
                                      classifier_builder=bnb.train)

# evaluate tagger on test data and sample sentence
print(bnb_tagger.evaluate(test_data))

# see results on our previously defined sentence
# NOTE(review): `sentence` is not defined anywhere in this snippet —
# presumably it comes from an earlier notebook cell.
print(bnb_tagger.tag(nltk.word_tokenize(sentence)))

此代码产生以下错误:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Users\ABDULL~1.IMR\AppData\Local\Temp/ipykernel_6580/266992580.py in <module>
      4 
      5 bnb = SklearnClassifier(BernoulliNB())
----> 6 bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
      7                                       classifier_builder=bnb.train)
      8 

~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in __init__(self, feature_detector, train, classifier_builder, classifier, backoff, cutoff_prob, verbose)
    637 
    638         if train:
--> 639             self._train(train, classifier_builder, verbose)
    640 
    641     def choose_tag(self, tokens, index, history):

~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in _train(self, tagged_corpus, classifier_builder, verbose)
    673         if verbose:
    674             print("Training classifier ({} instances)".format(len(classifier_corpus)))
--> 675         self._classifier = classifier_builder(classifier_corpus)
    676 
    677     def __repr__(self):

~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\classify\scikitlearn.py in train(self, labeled_featuresets)
    110 
    111         X, y = list(zip(*labeled_featuresets))
--> 112         X = self._vectorizer.fit_transform(X)
    113         y = self._encoder.fit_transform(y)
    114         self._clf.fit(X, y)

~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in fit_transform(self, X, y)
    288             Feature vectors; always 2-d.
    289         
--> 290         return self._transform(X, fitting=True)
    291 
    292     def inverse_transform(self, X, dict_type=dict):

~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in _transform(self, X, fitting)
    233                     if feature_name in vocab:
    234                         indices.append(vocab[feature_name])
--> 235                         values.append(self.dtype(v))
    236 
    237             indptr.append(len(indices))

TypeError: float() argument must be a string or a number, not 'NoneType'

如何正确操作?

根据此 issue,这是 scikit-learn 中的一个 bug 所导致的。当输入参数 X 中包含映射到 None 的值时,scikit-learn 在 sklearn/feature_extraction/_dict_vectorizer.py 中的 DictVectorizer 的 _transform 方法会失败。根据 Tom Aarsen 的评论,我们现在可以使用以下示例来完成工作:

import nltk
from nltk.corpus import treebank

from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Download the treebank corpus sample used below.
nltk.download('treebank')

# Split the tagged sentences into a training slice and a held-out test slice.
# Only the first 3 sentences are used for training here (the logged run
# reports 58 training instances) — presumably to keep the example quick;
# use a larger slice for a tagger that is meant to be accurate.
data = treebank.tagged_sents()
train_data = data[:3]
test_data = data[3:]

class CustomClassifierBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger variant whose feature values are always strings.

    The stock feature detector can map a feature name to ``None``, which
    scikit-learn's DictVectorizer cannot convert to a float (it raises
    ``TypeError``). Stringifying every value works around that.
    """

    def feature_detector(self, tokens, index, history):
        """Return the parent's feature dict with every value passed through ``str``.

        The arguments are forwarded unchanged to the parent class's
        ``feature_detector``; only the values of the returned mapping are
        converted, the keys are kept as they are.
        """
        base_features = super().feature_detector(tokens, index, history)
        stringified = {}
        for name, raw_value in base_features.items():
            # Ensure the feature value is a string; None becomes 'None'.
            stringified[name] = str(raw_value)
        return stringified

# Wrap scikit-learn's BernoulliNB in NLTK's classifier adapter; the tagger
# calls `bnb.train` on the classifier corpus it builds from `train_data`.
# verbose=True makes the tagger print its progress messages while training.
bnb = SklearnClassifier(BernoulliNB())
bnb_tagger = CustomClassifierBasedPOSTagger(train=train_data,
                                            classifier_builder=bnb.train,
                                            verbose=True)

# Sample sentence that is tokenized and tagged at the end of the script.
sentence = "This is a sample sentence which I just made for fun."
# Print the tagger's score on the held-out test sentences.
# NOTE(review): newer NLTK releases appear to offer `accuracy()` in place of
# `evaluate()` — confirm against the installed NLTK version.
print(bnb_tagger.evaluate(test_data))

# Tokenize the sample sentence defined above and print its (word, tag) pairs.
print(bnb_tagger.tag(nltk.word_tokenize(sentence)))

输出如下:

[nltk_data] Downloading package treebank to C:\Users\Tom/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
Constructing training corpus for classifier.
Training classifier (58 instances)
0.09338289371682999
[('This', 'NNP'), ('is', 'NNP'), ('a', 'NNP'), ('sample', 'NNP'), ('sentence', 'NNP'), ('which', 'NNP'), ('I', 'NNP'), ('just', 'NNP'), ('made', 'NNP'), ('for', 'NNP'), ('fun', 'NNP'), ('.', 'NNP')]