如何从数据框中提取名词
How to extract nouns from dataframe
我想从数据框中提取名词。只有名词。
我的做法如下:
import pandas as pd
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
df = pd.DataFrame({'noun': ['good day', 'good night']})
我想得到
noun
0 day
1 night
我的代码
df['noun'] = df.apply(lambda row: nltk.word_tokenize(row['noun']), axis=1)
noun=[]
for index, row in df.iterrows():
noun.append([word for word,pos in pos_tag(row) if pos == 'NN'])
df['noun'] = noun
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-194-688cfbb21ec5> in <module>()
1 noun=[]
2 for index, row in df.iterrows():
----> 3 noun.append([word for word,pos in pos_tag(row) if pos == 'NN'])
4 df['noun'] = noun
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\__init__.py in pos_tag(tokens, tagset)
109 """
110 tagger = PerceptronTagger()
--> 111 return _pos_tag(tokens, tagset, tagger)
112
113
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\__init__.py in _pos_tag(tokens, tagset, tagger)
80
81 def _pos_tag(tokens, tagset, tagger):
---> 82 tagged_tokens = tagger.tag(tokens)
83 if tagset:
84 tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\perceptron.py in tag(self, tokens)
150 output = []
151
--> 152 context = self.START + [self.normalize(w) for w in tokens] + self.END
153 for i, word in enumerate(tokens):
154 tag = self.tagdict.get(word)
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\perceptron.py in <listcomp>(.0)
150 output = []
151
--> 152 context = self.START + [self.normalize(w) for w in tokens] + self.END
153 for i, word in enumerate(tokens):
154 tag = self.tagdict.get(word)
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\perceptron.py in normalize(self, word)
222 if '-' in word and word[0] != '-':
223 return '!HYPHEN'
--> 224 elif word.isdigit() and len(word) == 4:
225 return '!YEAR'
226 elif word[0].isdigit():
AttributeError: 'list' object has no attribute 'isdigit'
求助,如何改进?
* 抱歉,我多写了一些文字,以便能够贴出完整的回溯信息。
我猜问题在于我没能把列表转换成所需的格式?
问题在于,在你的循环中,row 是一个 pandas Series,而不是一个列表。你可以通过 row[0](或者按列名写成 row['noun'])来访问其中的单词列表:
>>> for index, row in df.iterrows():
>>> noun.append([word for word,pos in pos_tag(row[0]) if pos == 'NN'])
>>> print(noun)
[['day'], ['night']]
这样你会得到一个嵌套列表(列表的列表),其中每个子列表包含一个句子中的名词。如果你确实想要一个扁平的简单列表(如你问题中的示例结果),请使用 noun.extend(...) 而不是 noun.append(...)。
我想从数据框中提取名词,而且只要名词。我的做法如下:
import pandas as pd
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
df = pd.DataFrame({'noun': ['good day', 'good night']})
我想得到
noun
0 day
1 night
我的代码
df['noun'] = df.apply(lambda row: nltk.word_tokenize(row['noun']), axis=1)
noun=[]
for index, row in df.iterrows():
noun.append([word for word,pos in pos_tag(row) if pos == 'NN'])
df['noun'] = noun
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-194-688cfbb21ec5> in <module>()
1 noun=[]
2 for index, row in df.iterrows():
----> 3 noun.append([word for word,pos in pos_tag(row) if pos == 'NN'])
4 df['noun'] = noun
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\__init__.py in pos_tag(tokens, tagset)
109 """
110 tagger = PerceptronTagger()
--> 111 return _pos_tag(tokens, tagset, tagger)
112
113
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\__init__.py in _pos_tag(tokens, tagset, tagger)
80
81 def _pos_tag(tokens, tagset, tagger):
---> 82 tagged_tokens = tagger.tag(tokens)
83 if tagset:
84 tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\perceptron.py in tag(self, tokens)
150 output = []
151
--> 152 context = self.START + [self.normalize(w) for w in tokens] + self.END
153 for i, word in enumerate(tokens):
154 tag = self.tagdict.get(word)
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\perceptron.py in <listcomp>(.0)
150 output = []
151
--> 152 context = self.START + [self.normalize(w) for w in tokens] + self.END
153 for i, word in enumerate(tokens):
154 tag = self.tagdict.get(word)
C:\Users\Edward\Anaconda3\lib\site-packages\nltk\tag\perceptron.py in normalize(self, word)
222 if '-' in word and word[0] != '-':
223 return '!HYPHEN'
--> 224 elif word.isdigit() and len(word) == 4:
225 return '!YEAR'
226 elif word[0].isdigit():
AttributeError: 'list' object has no attribute 'isdigit'
求助,如何改进? * 抱歉,我多写了一些文字,以便能够贴出完整的回溯信息。我猜问题在于我没能把列表转换成所需的格式?
问题在于,在你的循环中,row 是一个 pandas Series,而不是一个列表。你可以通过 row[0](或者按列名写成 row['noun'])来访问其中的单词列表:
>>> for index, row in df.iterrows():
>>> noun.append([word for word,pos in pos_tag(row[0]) if pos == 'NN'])
>>> print(noun)
[['day'], ['night']]
这样你会得到一个嵌套列表(列表的列表),其中每个子列表包含一个句子中的名词。如果你确实想要一个扁平的简单列表(如你问题中的示例结果),请使用 noun.extend(...) 而不是 noun.append(...)。