Stripping proper nouns from text
I have a df with a few thousand rows of text data. I'm using spaCy to do some NLP on a single column of that df, and am trying to remove proper nouns, stop words, and punctuation from my text data with the following:
tokens = []
lemma = []
pos = []

for doc in nlp.pipe(df['TIP_all_txt'].astype('unicode').values, batch_size=9845,
                    n_threads=3):
    if doc.is_parsed:
        tokens.append([n.text for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
        lemma.append([n.lemma_ for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
        pos.append([n.pos_ for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
    else:
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

df['s_tokens_all_txt'] = tokens
df['s_lemmas_all_txt'] = lemma
df['s_pos_all_txt'] = pos
df.head()
But I get this error and I'm not sure why:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-34-73578fd46847> in <module>()
6 n_threads=3):
7 if doc.is_parsed:
----> 8 tokens.append([n.text for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
9 lemma.append([n.lemma_ for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
10 pos.append([n.pos_ for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
<ipython-input-34-73578fd46847> in <listcomp>(.0)
6 n_threads=3):
7 if doc.is_parsed:
----> 8 tokens.append([n.text for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
9 lemma.append([n.lemma_ for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
10 pos.append([n.pos_ for n in doc if not n.is_punct and not n.is_stop and not n.is_space and not n.is_propn])
AttributeError: 'spacy.tokens.token.Token' object has no attribute 'is_propn'
If I take out not n.is_propn the code runs as expected. I've googled and read through the spaCy documentation, but haven't been able to find an answer so far.
I don't see an is_propn attribute available on the Token object.
I think you should instead check that the part-of-speech type is PROPN (reference):
from spacy.parts_of_speech import PROPN

def is_proper_noun(token):
    if token.doc.is_tagged is False:  # check if the document was POS-tagged
        raise ValueError('token is not POS-tagged')
    return token.pos == PROPN
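As a quick sanity check, here is a minimal sketch of how that check slots into the kind of filter you wrote (assuming an English model such as en_core_web_sm is installed; exactly which tokens survive depends on the model's tagger and stop list):

import spacy
from spacy.parts_of_speech import PROPN

nlp = spacy.load('en_core_web_sm')  # assumption: any installed English model works
doc = nlp(u'Alice went to Paris and bought a book.')

# Drop punctuation, stop words, whitespace, and proper nouns.
kept = [t.text for t in doc
        if not (t.is_punct or t.is_stop or t.is_space or t.pos == PROPN)]
print(kept)  # e.g. ['went', 'bought', 'book']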
Adding on to @alecxe's answer. There is no need to
- populate all rows of the dataframe at once, or
- keep separate token, lemma, and pos lists while populating the dataframe.
You can try:
annotated_df = pd.DataFrame(columns=['tokens', 'lemmas', 'pos'])
annotated_docs = nlp.pipe(df['TIP_all_txt'].astype('unicode').values,
                          batch_size=9845, n_threads=3)

for doc in annotated_docs:
    if doc.is_parsed:
        # Remove the tokens that you don't want.
        tokens, lemmas, pos = zip(*[(tok.text, tok.lemma_, tok.pos_)
                                    for tok in doc if not
                                    (tok.is_punct or tok.is_stop
                                     or tok.is_space or is_proper_noun(tok))
                                    ])
        # Populate the new DataFrame.
        annotated_df = annotated_df.append({'tokens': tokens, 'lemmas': lemmas,
                                            'pos': pos}, ignore_index=True)
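Note two design points here: pandas' DataFrame.append does not modify the frame in place but returns a new frame, which is why the result is reassigned; and the annotations go into a separate annotated_df so that the source df, which nlp.pipe is still reading from, isn't overwritten mid-loop.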
And here's a slicker pandas trick, but the dataframe will take up more memory:
tokens_df = pd.DataFrame(columns=['Tokens'])
annotated_docs = nlp.pipe(df['TIP_all_txt'].astype('unicode').values,
                          batch_size=9845, n_threads=3)

for doc in annotated_docs:
    if doc.is_parsed:
        # Remove the tokens that you don't want.
        kept = [(tok.text, tok.lemma_, tok.pos_)
                for tok in doc if not
                (tok.is_punct or tok.is_stop
                 or tok.is_space or is_proper_noun(tok))]
        tokens_df = tokens_df.append({'Tokens': kept}, ignore_index=True)

# Unzip each row's (text, lemma, pos) tuples into three columns.
tokens_df[['tokens', 'lemmas', 'pos']] = tokens_df['Tokens'].apply(
    lambda row: pd.Series(list(zip(*row))))
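If the repeated append calls feel heavy (each one copies the whole frame), here is a sketch of the same idea that collects plain dicts first and builds the frame once at the end, reusing df['TIP_all_txt'] and the is_proper_noun helper from above:

rows = []
for doc in nlp.pipe(df['TIP_all_txt'].astype('unicode').values,
                    batch_size=9845, n_threads=3):
    if doc.is_parsed:
        kept = [(tok.text, tok.lemma_, tok.pos_)
                for tok in doc
                if not (tok.is_punct or tok.is_stop
                        or tok.is_space or is_proper_noun(tok))]
        # Unzip into three parallel tuples; guard against fully-filtered docs.
        tokens, lemmas, pos = zip(*kept) if kept else ((), (), ())
        rows.append({'tokens': tokens, 'lemmas': lemmas, 'pos': pos})
    else:
        rows.append({'tokens': None, 'lemmas': None, 'pos': None})

annotated_df = pd.DataFrame(rows, columns=['tokens', 'lemmas', 'pos'])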
import pandas as pd
from nltk.tag import pos_tag

def proper_nouns(speech):
    # Tag each word, keep those tagged NNP (proper noun), lowercase, and dedupe.
    tagged_sent = pos_tag(speech.split())
    pn = [word for word, pos in tagged_sent if pos == 'NNP']
    pn = [x.lower() for x in pn]
    prn = list(set(pn))
    prn = pd.DataFrame({'b_words': prn, 'bucket_name': 'proper noun'})
    return prn

df = proper_nouns(speech)
where speech is your text!
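For instance (a hypothetical snippet; which words receive the NNP tag depends on NLTK's default tagger, and the row order varies because of the intermediate set):

speech = "Barack Obama visited Paris last Friday."
df = proper_nouns(speech)
print(df)
#   b_words  bucket_name
# 0  barack  proper noun
# 1   obama  proper noun
# 2   paris  proper noun
# 3  friday  proper noun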