spaCy 和 scikit-learn 向量化器
spaCy and scikit-learn vectorizer
我参照他们的示例,用 spaCy 为 scikit-learn 写了一个词形还原分词器(lemma tokenizer),它单独运行时没有问题:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
class LemmaTokenizer(object):
    """Callable tokenizer that turns a text into a list of spaCy lemmas.

    Meant to be passed as the ``tokenizer`` argument of ``TfidfVectorizer``.
    The spaCy pipeline is loaded in the constructor and stored on the
    instance as ``spacynlp``.
    """

    def __init__(self):
        # Load the English spaCy pipeline once per tokenizer instance.
        self.spacynlp = spacy.load('en')

    def __call__(self, doc):
        """Return the lemmas of ``doc`` as a list of strings.

        A lemma is kept when it is longer than one character or when it is
        alphanumeric; this drops empty lemmas and single-character
        punctuation such as ``.`` while keeping single letters and digits.
        """
        return [
            token.lemma_
            for token in self.spacynlp(doc)
            if len(token.lemma_) > 1 or token.lemma_.isalnum()
        ]
# Fit the vectorizer on a one-sentence corpus and show the learned vocabulary.
corpus = ['Apples and oranges are tasty.']
vect = TfidfVectorizer(tokenizer=LemmaTokenizer())
vect.fit(corpus)
print(vect.vocabulary_)
### prints {'apple': 1, 'and': 0, 'tasty': 4, 'be': 2, 'orange': 3}
但是,在 GridSearchCV
中使用它会出错,下面是一个独立的示例:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
# Reproduction: grid-search a TF-IDF + linear SVM pipeline that tokenizes
# with LemmaTokenizer, using 7 parallel jobs.
wordvect = TfidfVectorizer(
    analyzer='word',
    strip_accents='ascii',
    tokenizer=LemmaTokenizer(),
)
classifier = OneVsRestClassifier(SVC(kernel='linear'))
pipeline = Pipeline([
    ('vect', wordvect),
    ('classifier', classifier),
])
parameters = {
    'vect__min_df': [1, 2],
    'vect__max_df': [0.7, 0.8],
    'classifier__estimator__C': [0.1, 1, 10],
}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=7, verbose=1)

# Two-category subset of the 20 newsgroups corpus, stripped of metadata.
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.graphics', 'rec.sport.baseball']
newsgroups = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    categories=categories,
)
X, y = newsgroups.data, newsgroups.target
gs_clf = gs_clf.fit(X, y)
### AttributeError: 'spacy.tokenizer.Tokenizer' object has no attribute '_prefix_re'
当我在 tokenizer 的构造函数之外加载 spaCy 时,错误不再出现,GridSearchCV 可以正常运行:
spacynlp = spacy.load('en')
class LemmaTokenizer(object):
    """Callable tokenizer that lemmatizes text with the module-level ``spacynlp``.

    Unlike a tokenizer that loads spaCy in its constructor, this class holds
    no state of its own; every instance uses the same global pipeline object.
    """

    def __call__(self, doc):
        """Return the lemmas of ``doc`` as a list of strings.

        A lemma is kept when it is longer than one character or when it is
        alphanumeric; this drops empty lemmas and single-character
        punctuation such as ``.`` while keeping single letters and digits.
        """
        return [
            token.lemma_
            for token in spacynlp(doc)
            if len(token.lemma_) > 1 or token.lemma_.isalnum()
        ]
但这意味着 GridSearchCV 的 n_jobs 个作业都会访问并调用同一个 spacynlp 对象,该对象在这些作业之间共享,这就留下了以下问题:
- 由 spacy.load('en') 得到的 spacynlp 对象可以安全地被 GridSearchCV 中的多个作业使用吗?
- 这是在 scikit-learn 的分词器中调用 spaCy 的正确方法吗?
为网格中的每一组参数设置都运行一次 spaCy 是在浪费时间,内存开销也很大。您应该让所有数据只通过 spaCy 处理一次并将结果保存到磁盘,然后使用一个简化的向量化器读取预先词形还原好的数据。请查看 TfidfVectorizer 的 tokenizer、analyzer 和 preprocessor 参数。Stack Overflow 上有很多示例展示了如何构建自定义向量化器。
根据 mbatchkarov 帖子中的意见,我尝试先让 pandas Series 中的所有文档一次性通过 spaCy 完成分词和词形还原,并将结果保存到磁盘。
然后,我加载了词形化的 spacy Doc
对象,为每个文档提取了一个标记列表,并将其作为输入提供给由简化的 TfidfVectorizer
和 DecisionTreeClassifier
组成的管道。
我用 GridSearchCV 运行该 pipeline,并提取最佳估计器和相应的参数。
看例子:
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import spacy
from spacy.tokens import DocBin
# These two names are used below but were missing from the snippet's imports.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("de_core_news_sm")  # define your language model

# Run every document through spaCy exactly once and collect the results.
# NOTE: `df` is not created in this snippet; it is assumed to be an existing
# pandas DataFrame with a text column named 'articleDocument'.
# adjust attributes to your liking:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
for doc in nlp.pipe(df['articleDocument'].str.lower()):
    doc_bin.add(doc)

# either save DocBin to a bytes object, or...
# bytes_data = doc_bin.to_bytes()
# save DocBin to a file on disk
# NOTE(review): presumably the 'output' directory must already exist — confirm.
file_name_spacy = 'output/preprocessed_documents.spacy'
doc_bin.to_disk(file_name_spacy)

# Load DocBin at a later time or on a different system, from disk or a bytes object
# doc_bin = DocBin().from_bytes(bytes_data)
doc_bin = DocBin().from_disk(file_name_spacy)
docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))

# One list of lemmas per document, skipping stop words, punctuation,
# whitespace, URLs and e-mail addresses.
tokenized_lemmatized_texts = [
    [
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.is_space
        and not token.like_url
        and not token.like_email
    ]
    for doc in docs
]

# classifier to use
clf = tree.DecisionTreeClassifier()

# just some random target response
y = np.random.randint(2, size=len(docs))

# The documents are already token lists, so the tokenizer is the identity and
# lowercasing is disabled. token_pattern=None makes explicit that the default
# regex is not used when a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    lowercase=False,
    tokenizer=lambda x: x,
    token_pattern=None,
    max_features=3000,
)
pipeline = Pipeline([('vect', vectorizer), ('dectree', clf)])
parameters = {'dectree__max_depth': [4, 10]}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
gs_clf.fit(tokenized_lemmatized_texts, y)
print(gs_clf.best_estimator_.get_params()['dectree'])
一些有用的资源:
我参照他们的示例,用 spaCy 为 scikit-learn 写了一个词形还原分词器(lemma tokenizer),它单独运行时没有问题:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
class LemmaTokenizer(object):
    """Callable tokenizer that turns a text into a list of spaCy lemmas.

    Meant to be passed as the ``tokenizer`` argument of ``TfidfVectorizer``.
    The spaCy pipeline is loaded in the constructor and stored on the
    instance as ``spacynlp``.
    """

    def __init__(self):
        # Load the English spaCy pipeline once per tokenizer instance.
        self.spacynlp = spacy.load('en')

    def __call__(self, doc):
        """Return the lemmas of ``doc`` as a list of strings.

        A lemma is kept when it is longer than one character or when it is
        alphanumeric; this drops empty lemmas and single-character
        punctuation such as ``.`` while keeping single letters and digits.
        """
        return [
            token.lemma_
            for token in self.spacynlp(doc)
            if len(token.lemma_) > 1 or token.lemma_.isalnum()
        ]
# Fit the vectorizer on a one-sentence corpus and show the learned vocabulary.
corpus = ['Apples and oranges are tasty.']
vect = TfidfVectorizer(tokenizer=LemmaTokenizer())
vect.fit(corpus)
print(vect.vocabulary_)
### prints {'apple': 1, 'and': 0, 'tasty': 4, 'be': 2, 'orange': 3}
但是,在 GridSearchCV
中使用它会出错,下面是一个独立的示例:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
# Reproduction: grid-search a TF-IDF + linear SVM pipeline that tokenizes
# with LemmaTokenizer, using 7 parallel jobs.
wordvect = TfidfVectorizer(
    analyzer='word',
    strip_accents='ascii',
    tokenizer=LemmaTokenizer(),
)
classifier = OneVsRestClassifier(SVC(kernel='linear'))
pipeline = Pipeline([
    ('vect', wordvect),
    ('classifier', classifier),
])
parameters = {
    'vect__min_df': [1, 2],
    'vect__max_df': [0.7, 0.8],
    'classifier__estimator__C': [0.1, 1, 10],
}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=7, verbose=1)

# Two-category subset of the 20 newsgroups corpus, stripped of metadata.
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.graphics', 'rec.sport.baseball']
newsgroups = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    categories=categories,
)
X, y = newsgroups.data, newsgroups.target
gs_clf = gs_clf.fit(X, y)
### AttributeError: 'spacy.tokenizer.Tokenizer' object has no attribute '_prefix_re'
当我在 tokenizer 的构造函数之外加载 spaCy 时,错误不再出现,GridSearchCV 可以正常运行:
spacynlp = spacy.load('en')
class LemmaTokenizer(object):
    """Callable tokenizer that lemmatizes text with the module-level ``spacynlp``.

    Unlike a tokenizer that loads spaCy in its constructor, this class holds
    no state of its own; every instance uses the same global pipeline object.
    """

    def __call__(self, doc):
        """Return the lemmas of ``doc`` as a list of strings.

        A lemma is kept when it is longer than one character or when it is
        alphanumeric; this drops empty lemmas and single-character
        punctuation such as ``.`` while keeping single letters and digits.
        """
        return [
            token.lemma_
            for token in spacynlp(doc)
            if len(token.lemma_) > 1 or token.lemma_.isalnum()
        ]
但这意味着 GridSearchCV 的 n_jobs 个作业都会访问并调用同一个 spacynlp 对象,该对象在这些作业之间共享,这就留下了以下问题:
- 由 spacy.load('en') 得到的 spacynlp 对象可以安全地被 GridSearchCV 中的多个作业使用吗?
- 这是在 scikit-learn 的分词器中调用 spaCy 的正确方法吗?
为网格中的每一组参数设置都运行一次 spaCy 是在浪费时间,内存开销也很大。您应该让所有数据只通过 spaCy 处理一次并将结果保存到磁盘,然后使用一个简化的向量化器读取预先词形还原好的数据。请查看 TfidfVectorizer 的 tokenizer、analyzer 和 preprocessor 参数。Stack Overflow 上有很多示例展示了如何构建自定义向量化器。
根据 mbatchkarov 帖子中的意见,我尝试先让 pandas Series 中的所有文档一次性通过 spaCy 完成分词和词形还原,并将结果保存到磁盘。
然后,我加载了词形化的 spacy Doc
对象,为每个文档提取了一个标记列表,并将其作为输入提供给由简化的 TfidfVectorizer
和 DecisionTreeClassifier
组成的管道。
我用 GridSearchCV 运行该 pipeline,并提取最佳估计器和相应的参数。
看例子:
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import spacy
from spacy.tokens import DocBin
# These two names are used below but were missing from the snippet's imports.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("de_core_news_sm")  # define your language model

# Run every document through spaCy exactly once and collect the results.
# NOTE: `df` is not created in this snippet; it is assumed to be an existing
# pandas DataFrame with a text column named 'articleDocument'.
# adjust attributes to your liking:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
for doc in nlp.pipe(df['articleDocument'].str.lower()):
    doc_bin.add(doc)

# either save DocBin to a bytes object, or...
# bytes_data = doc_bin.to_bytes()
# save DocBin to a file on disk
# NOTE(review): presumably the 'output' directory must already exist — confirm.
file_name_spacy = 'output/preprocessed_documents.spacy'
doc_bin.to_disk(file_name_spacy)

# Load DocBin at a later time or on a different system, from disk or a bytes object
# doc_bin = DocBin().from_bytes(bytes_data)
doc_bin = DocBin().from_disk(file_name_spacy)
docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))

# One list of lemmas per document, skipping stop words, punctuation,
# whitespace, URLs and e-mail addresses.
tokenized_lemmatized_texts = [
    [
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.is_space
        and not token.like_url
        and not token.like_email
    ]
    for doc in docs
]

# classifier to use
clf = tree.DecisionTreeClassifier()

# just some random target response
y = np.random.randint(2, size=len(docs))

# The documents are already token lists, so the tokenizer is the identity and
# lowercasing is disabled. token_pattern=None makes explicit that the default
# regex is not used when a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    lowercase=False,
    tokenizer=lambda x: x,
    token_pattern=None,
    max_features=3000,
)
pipeline = Pipeline([('vect', vectorizer), ('dectree', clf)])
parameters = {'dectree__max_depth': [4, 10]}
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
gs_clf.fit(tokenized_lemmatized_texts, y)
print(gs_clf.best_estimator_.get_params()['dectree'])
一些有用的资源: