How to do text pre-processing using spaCy?
How do I perform pre-processing steps such as stop-word removal, punctuation removal, stemming and lemmatization in spaCy, using Python?
I have text data in a CSV file, e.g. paragraphs and sentences, and I want to do text cleaning.
Please give an example that loads the CSV into a pandas dataframe.
Please read their documentation; here is an example:
It can easily be done with just a few commands. Also note that spaCy does not support stemming; you can refer to this thread (a stemming sketch with NLTK follows the example below).
import spacy
nlp = spacy.load('en_core_web_sm')  # newer spaCy versions use the full model name instead of the 'en' shortcut
# sample text
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry. \
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown \
printer took a galley of type and scrambled it to make a type specimen book. It has survived not \
only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. \
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, \
and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. \
There are many variations of passages of Lorem Ipsum available, but the majority have suffered alteration \
in some form, by injected humour, or randomised words which don't look even slightly believable. If you are \
going to use a passage of Lorem Ipsum, you need to be sure there isn't anything embarrassing hidden in the \
middle of text. All the Lorem Ipsum generators on the Internet tend to repeat predefined chunks as necessary, \
making this the first true generator on the Internet. It uses a dictionary of over 200 Latin words, combined \
with a handful of model sentence structures, to generate Lorem Ipsum which looks reasonable. The generated \
Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc."""
# convert the text to a spacy document
document = nlp(text) # all spacy documents are tokenized. You can access them using document[i]
document[0:10] # = Lorem Ipsum is simply dummy text of the printing and
# the good thing about spaCy is that a lot of things like lemmatization are already done
# when you convert the text to a spaCy document using nlp(text).
# You can access sentences using document.sents
list(document.sents)[0]
# lemmatized words can be accessed using document[i].lemma_ and you can check
# if a word is a stopword by checking the `.is_stop` attribute of the word.
# here I am extracting the lemmatized form of each word provided they are not a stop word
lemmas = [token.lemma_ for token in document if not token.is_stop]
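Since spaCy itself does not ship a stemmer, here is a minimal sketch (assuming NLTK is installed) that stems the non-stop-word tokens of the same Doc with NLTK's PorterStemmer:
from nltk.stem import PorterStemmer  # assumes `pip install nltk`
stemmer = PorterStemmer()
# stem every token that is neither a stop word nor punctuation
stems = [stemmer.stem(token.text) for token in document
         if not token.is_stop and not token.is_punct]
print(stems[:10])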
This may help:
import spacy  # load spaCy
from nltk.corpus import stopwords  # NLTK's English stop-word list

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])  # keep the tagger: the lemmatizer in newer spaCy models needs POS tags
stops = stopwords.words("english")

def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)

Data['Text_After_Clean'] = Data['Text'].apply(normalize, lowercase=True, remove_stopwords=True)
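The snippet above assumes a dataframe named Data already exists. A minimal sketch of loading it from the CSV mentioned in the question (the file name and the 'Text' column are assumptions, adjust them to your data):
import pandas as pd
Data = pd.read_csv("your_file.csv")  # hypothetical path; expects a 'Text' column
print(Data['Text'].head())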
The best pipeline I have come across so far is from Maksym Balatsko's Medium article Text preprocessing steps and universal reusable pipeline. The best part is that it can be used as part of a scikit-learn transformer pipeline and supports multiprocessing.
I modified Maksym's version, kept the required packages to a minimum, and used generators instead of lists to avoid loading all of the data into memory:
import numpy as np
import pandas as pd
import multiprocessing as mp
import string
import spacy
from sklearn.base import TransformerMixin, BaseEstimator

nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, nlp=nlp, n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Punctuation removal
            2. Stop words removal
            3. Lemmatization

        nlp    - spaCy model
        n_jobs - parallel jobs to run
        """
        self.nlp = nlp
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = self.nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _remove_punct(self, doc):
        return (t for t in doc if t.text not in string.punctuation)

    def _remove_stop_words(self, doc):
        return (t for t in doc if not t.is_stop)

    def _lemmatize(self, doc):
        return ' '.join(t.lemma_ for t in doc)
You can use it as:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline

# ... assuming data split X_train, X_test ...
clf = Pipeline(steps=[
    ('normalize', TextPreprocessor(n_jobs=-1)),
    ('features', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
    ('classifier', LogisticRegressionCV(cv=5, solver='saga', scoring='accuracy', n_jobs=-1, verbose=1))
])

clf.fit(X_train, y_train)
clf.predict(X_test)
X_train is the data that goes through TextPreprocessor; we then extract features from it and pass them to the classifier.
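To tie this back to the question, a minimal sketch of producing X_train/X_test from the CSV (the file name and the 'Text'/'label' column names are assumptions):
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("your_file.csv")  # hypothetical path and column names
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'], df['label'], test_size=0.2, random_state=42)

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))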