用户警告:您的 stop_words 可能与您的预处理不一致
UserWarning: Your stop_words may be inconsistent with your preprocessing
我正在按照 this 教程使用以下代码制作聊天机器人。
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re
# Creating the Corpus
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Tennis')
raw_html = raw_html.read()
article_html = bs.BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')
article_text = ''
for para in article_paragraphs:
article_text += para.text
article_text = article_text.lower()
# Text Preprocessing
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)
article_sentences = nltk.sent_tokenize(article_text)
article_words = nltk.word_tokenize(article_text)
wnlemmatizer = nltk.stem.WordNetLemmatizer()
# Helper Function
def perform_lemmatization(tokens):
return [wnlemmatizer.lemmatize(token) for token in tokens]
punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)
def get_processed_text(document):
return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))
# Responding to Greetings
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
greeting_responses = ["hey", "hey hows you?", "*nods*", "hello, how you doing", "hello", "Welcome, I am good and you"]
def generate_greeting_response(greeting):
for token in greeting.split():
if token.lower() in greeting_inputs:
return random.choice(greeting_responses)
else:
return 'Try again'
# Responding to User Queries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def generate_response(user_input):
tennisrobo_response = ''
article_sentences.append(user_input)
word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
all_word_vectors = word_vectorizer.fit_transform(article_sentences)
similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
similar_sentence_number = similar_vector_values.argsort()[0][-2]
matched_vector = similar_vector_values.flatten()
matched_vector.sort()
vector_matched = matched_vector[-2]
if vector_matched == 0:
tennisrobo_response = tennisrobo_response + "I am sorry, I could not understand you"
return tennisrobo_response
else:
tennisrobo_response = tennisrobo_response + article_sentences[similar_sentence_number]
return tennisrobo_response
print(generate_response('tennis'))
运行 代码,我得到以下错误:
UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.
warnings.warn('Your stop_words may be inconsistent with '
搜索 google 后,我得到了 答案的链接,说我的停用词和分词器之间可能存在不一致。但是,我是 python 和 NLTK 的新手,无法找到不一致的地方。
导致此错误的代码部分在哪里?
代码运行没有问题,请注意你得到的不是错误,而是警告。请注意,您可以使用
抑制所有警告
import warnings
warnings.filterwarnings("ignore")
出现警告是因为您使用的是自定义预处理器/分词器。请参阅调用 perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))
的 get_processed_text
方法。
如果删除词形还原,则不会看到警告:
def get_processed_text(document):
return nltk.word_tokenize(document.lower().translate(punctuation_removal))
我正在按照 this 教程使用以下代码制作聊天机器人。
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re
# Creating the Corpus
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Tennis')
raw_html = raw_html.read()
article_html = bs.BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')
article_text = ''
for para in article_paragraphs:
article_text += para.text
article_text = article_text.lower()
# Text Preprocessing
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)
article_sentences = nltk.sent_tokenize(article_text)
article_words = nltk.word_tokenize(article_text)
wnlemmatizer = nltk.stem.WordNetLemmatizer()
# Helper Function
def perform_lemmatization(tokens):
return [wnlemmatizer.lemmatize(token) for token in tokens]
punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)
def get_processed_text(document):
return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))
# Responding to Greetings
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
greeting_responses = ["hey", "hey hows you?", "*nods*", "hello, how you doing", "hello", "Welcome, I am good and you"]
def generate_greeting_response(greeting):
for token in greeting.split():
if token.lower() in greeting_inputs:
return random.choice(greeting_responses)
else:
return 'Try again'
# Responding to User Queries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def generate_response(user_input):
tennisrobo_response = ''
article_sentences.append(user_input)
word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
all_word_vectors = word_vectorizer.fit_transform(article_sentences)
similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
similar_sentence_number = similar_vector_values.argsort()[0][-2]
matched_vector = similar_vector_values.flatten()
matched_vector.sort()
vector_matched = matched_vector[-2]
if vector_matched == 0:
tennisrobo_response = tennisrobo_response + "I am sorry, I could not understand you"
return tennisrobo_response
else:
tennisrobo_response = tennisrobo_response + article_sentences[similar_sentence_number]
return tennisrobo_response
print(generate_response('tennis'))
运行 代码,我得到以下错误:
UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.
warnings.warn('Your stop_words may be inconsistent with '
搜索 google 后,我得到了
导致此错误的代码部分在哪里?
代码运行没有问题,请注意你得到的不是错误,而是警告。请注意,您可以使用
抑制所有警告import warnings
warnings.filterwarnings("ignore")
出现警告是因为您使用的是自定义预处理器/分词器。请参阅调用 perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))
的 get_processed_text
方法。
如果删除词形还原,则不会看到警告:
def get_processed_text(document):
return nltk.word_tokenize(document.lower().translate(punctuation_removal))