如何使用 NLTK ne_chunk 提取 GPE(位置)?
How can I extract GPE(location) using NLTK ne_chunk?
我正在尝试结合 OpenWeatherMap API 和 NLTK 的命名实体识别,编写代码来查询特定地区的天气状况。但是我找不到办法把 GPE 实体(即位置,本例中为芝加哥)传给我的 API 请求。请帮我看看语法,代码如下。
感谢您的协助
import nltk
from nltk import load_parser
import requests
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
# Example query; the trailing space is part of the input string.
sentence = "What is the weather in Chicago today? "

# Build the English stop-word set once, then tokenize the query.
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(sentence)

# Drop every token found in the stop-word set (exact, case-sensitive match).
clean_tokens = list(filter(lambda w: w not in stop_words, tokens))

# POS-tag the remaining tokens, chunk named entities, and print the tree.
tagged = nltk.pos_tag(clean_tokens)
print(nltk.ne_chunk(tagged))
`GPE` 是预训练 `ne_chunk` 模型返回的 `Tree` 对象的标签。
>>> from nltk import word_tokenize, pos_tag, ne_chunk
>>> sent = "What is the weather in Chicago today?"
>>> ne_chunk(pos_tag(word_tokenize(sent)))
Tree('S', [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('weather', 'NN'), ('in', 'IN'), Tree('GPE', [('Chicago', 'NNP')]), ('today', 'NN'), ('?', '.')])
要取出其中的实体,需要遍历这棵树。
也许,您正在寻找的是下面这个稍作修改的实现:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import Tree
def get_continuous_chunks(text, label):
    """Return the distinct named entities in *text* that carry *label*.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk``. Every
    top-level subtree whose label equals *label* (e.g. ``'GPE'``) becomes
    one string made of its tokens joined by single spaces.

    Args:
        text: The sentence to analyse.
        label: The chunk label to keep; compared by equality.

    Returns:
        A list of entity strings in order of first appearance, without
        duplicates. Empty when no subtree carries *label*.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    entities = []
    for subtree in chunked:
        # Children that are not Tree instances (or carry another label)
        # are not entities of the requested kind.
        if not isinstance(subtree, Tree) or subtree.label() != label:
            continue
        named_entity = " ".join(token for token, _pos in subtree.leaves())
        # Each subtree is handled independently, so a repeated entity can
        # never be carried over and glued onto the next one.
        if named_entity not in entities:
            entities.append(named_entity)
    return entities
[输出]:
>>> sent = "What is the weather in New York today?"
>>> get_continuous_chunks(sent, 'GPE')
['New York']
>>> sent = "What is the weather in New York and Chicago today?"
>>> get_continuous_chunks(sent, 'GPE')
['New York', 'Chicago']
>>> sent = "What is the weather in New York"
>>> get_continuous_chunks(sent, 'GPE')
['New York']
>>> sent = "What is the weather in New York and Chicago"
>>> get_continuous_chunks(sent, 'GPE')
['New York', 'Chicago']
这是我想针对您的情况提出的解决方案:
步骤 1. 分词(word_tokenize)、词性标注(pos_tag)和命名实体识别(ne_chunk),代码如下:
# Input sentence mentioning two locations.
Xstring = "What is the weather in New York and Chicago today?"

# Tokenize, then POS-tag the tokens.
tokenized_doc = word_tokenize(Xstring)
tagged_sentences = nltk.pos_tag(tokenized_doc)

# Chunk named entities; NE is the resulting tree used by the later steps.
NE = nltk.ne_chunk(tagged_sentences)

# Display the chunk tree through its draw() method.
NE.draw()
步骤 2. 在完成上述命名实体识别之后,提取所有命名实体:
# Collect (entity text, entity label) pairs from the children of NE.
named_entities = []
for tagged_tree in NE:
    print(tagged_tree)
    # Children without a `label` attribute are skipped.
    if not hasattr(tagged_tree, 'label'):
        continue
    entity_type = tagged_tree.label()  # NE category
    entity_name = ' '.join(leaf[0] for leaf in tagged_tree.leaves())
    named_entities.append((entity_name, entity_type))
print(named_entities)  # every collected entity; check on your side
步骤 3. 现在仅提取带有 GPE 标签的实体:
# Print only the (name, label) pairs whose label is 'GPE'.
for tag in named_entities:
    if tag[1] != 'GPE':  # put whichever label is required here
        continue
    print(tag)
这是我的输出:
('New York', 'GPE')
('Chicago', 'GPE')
我正在尝试结合 OpenWeatherMap API 和 NLTK 的命名实体识别,编写代码来查询特定地区的天气状况。但是我找不到办法把 GPE 实体(即位置,本例中为芝加哥)传给我的 API 请求。请帮我看看语法,代码如下。
感谢您的协助
import nltk
from nltk import load_parser
import requests
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
# Example query; the trailing space is part of the input string.
sentence = "What is the weather in Chicago today? "

# Build the English stop-word set once, then tokenize the query.
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(sentence)

# Drop every token found in the stop-word set (exact, case-sensitive match).
clean_tokens = list(filter(lambda w: w not in stop_words, tokens))

# POS-tag the remaining tokens, chunk named entities, and print the tree.
tagged = nltk.pos_tag(clean_tokens)
print(nltk.ne_chunk(tagged))
`GPE` 是预训练 `ne_chunk` 模型返回的 `Tree` 对象的标签。
>>> from nltk import word_tokenize, pos_tag, ne_chunk
>>> sent = "What is the weather in Chicago today?"
>>> ne_chunk(pos_tag(word_tokenize(sent)))
Tree('S', [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('weather', 'NN'), ('in', 'IN'), Tree('GPE', [('Chicago', 'NNP')]), ('today', 'NN'), ('?', '.')])
要取出其中的实体,需要遍历这棵树。
也许,您正在寻找的是下面这个稍作修改的实现:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import Tree
def get_continuous_chunks(text, label):
    """Return the distinct named entities in *text* that carry *label*.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk``. Every
    top-level subtree whose label equals *label* (e.g. ``'GPE'``) becomes
    one string made of its tokens joined by single spaces.

    Args:
        text: The sentence to analyse.
        label: The chunk label to keep; compared by equality.

    Returns:
        A list of entity strings in order of first appearance, without
        duplicates. Empty when no subtree carries *label*.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    entities = []
    for subtree in chunked:
        # Children that are not Tree instances (or carry another label)
        # are not entities of the requested kind.
        if not isinstance(subtree, Tree) or subtree.label() != label:
            continue
        named_entity = " ".join(token for token, _pos in subtree.leaves())
        # Each subtree is handled independently, so a repeated entity can
        # never be carried over and glued onto the next one.
        if named_entity not in entities:
            entities.append(named_entity)
    return entities
[输出]:
>>> sent = "What is the weather in New York today?"
>>> get_continuous_chunks(sent, 'GPE')
['New York']
>>> sent = "What is the weather in New York and Chicago today?"
>>> get_continuous_chunks(sent, 'GPE')
['New York', 'Chicago']
>>> sent = "What is the weather in New York"
>>> get_continuous_chunks(sent, 'GPE')
['New York']
>>> sent = "What is the weather in New York and Chicago"
>>> get_continuous_chunks(sent, 'GPE')
['New York', 'Chicago']
这是我想针对您的情况提出的解决方案:
步骤 1. 分词(word_tokenize)、词性标注(pos_tag)和命名实体识别(ne_chunk),代码如下:
# Input sentence mentioning two locations.
Xstring = "What is the weather in New York and Chicago today?"

# Tokenize, then POS-tag the tokens.
tokenized_doc = word_tokenize(Xstring)
tagged_sentences = nltk.pos_tag(tokenized_doc)

# Chunk named entities; NE is the resulting tree used by the later steps.
NE = nltk.ne_chunk(tagged_sentences)

# Display the chunk tree through its draw() method.
NE.draw()
步骤 2. 在完成上述命名实体识别之后,提取所有命名实体:
# Collect (entity text, entity label) pairs from the children of NE.
named_entities = []
for tagged_tree in NE:
    print(tagged_tree)
    # Children without a `label` attribute are skipped.
    if not hasattr(tagged_tree, 'label'):
        continue
    entity_type = tagged_tree.label()  # NE category
    entity_name = ' '.join(leaf[0] for leaf in tagged_tree.leaves())
    named_entities.append((entity_name, entity_type))
print(named_entities)  # every collected entity; check on your side
步骤 3. 现在仅提取带有 GPE 标签的实体:
# Print only the (name, label) pairs whose label is 'GPE'.
for tag in named_entities:
    if tag[1] != 'GPE':  # put whichever label is required here
        continue
    print(tag)
这是我的输出:
('New York', 'GPE')
('Chicago', 'GPE')