Issue with tokenizing words with NLTK in Python. Returning lists of single letters instead of words
I'm having some problems with my NLP Python program. I'm trying to build a dataset of positive and negative tweets, but when I run the code it only returns what appear to be tokenized single letters. I'm new to Python and NLP, so apologies if this is basic or if I've explained it poorly. My code is below:
import csv
import random
import re
import string

import mysql.connector
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize


def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|' \
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
            print(token)
    return cleaned_tokens


def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token


def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)


if __name__ == "__main__":
    with open('positive_tweets.csv') as csv_file:
        positive_tweets = csv.reader(csv_file, delimiter=',')

    with open('negative_tweets.csv') as csv_file:
        negative_tweets = csv.reader(csv_file, delimiter=',')

    stop_words = stopwords.words('english')

    positive_tweet_tokens = word_tokenize(positive_tweets)
    negative_tweet_tokens = word_tokenize(negative_tweets)

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    all_neg_words = get_all_words(negative_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)
    print(freq_dist_pos.most_common(10))
    print(freq_dist_neg.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, 'positive')
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, 'negative')
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
A snippet from the CSV file for reference:
"tweetid","username","created_at","tweet","location","place","classification"
"1285666943073161216","MeFixerr","2020-07-21 20:04:20+00:00","Overwhelmed by all the calls, msgs and tweets. I apologize for getting lost without prior notice. Did not expect to be missed with such fervor.
I am good & taking a break. Lots of love and dua's for everyone of you in #PTIFamily ❤","Pakistan, Quetta",,"positive"
Your tokens are coming from the file name ('positive_tweets.csv'), not from the data in the file. Add a print statement like the one below and you will see the problem.
positive_tweet_tokens = word_tokenize(positive_tweets)
negative_tweet_tokens = word_tokenize(negative_tweets)
print("tokens=", positive_tweet_tokens) # add this line
Output of the full script:
tokens= ['positive_tweets.csv']
v
v
[('e', 3), ('v', 2), ('p', 1), ('w', 1), ('c', 1)]
[('e', 4), ('v', 2), ('n', 1), ('g', 1), ('w', 1), ('c', 1)]
Accuracy is: 0
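That single token ('positive_tweets.csv') is then iterated character by character further down, which is where the single letters come from. A minimal sketch of the mechanism (my own illustration, not part of the original script): when pos_tag is handed a bare string instead of a list of tokens, it tags each character separately.

from nltk.tag import pos_tag

# A string is a sequence of characters, so pos_tag produces one
# (character, tag) pair per letter instead of one pair per word.
print(pos_tag("love"))              # four single-letter tokens
print(pos_tag(["love", "tweets"]))  # two word tokens, as intended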
Regarding the second bug, replace this:
with open('positive_tweets.csv') as csv_file:
    positive_tweets = csv.reader(csv_file, delimiter=',')

with open('negative_tweets.csv') as csv_file:
    negative_tweets = csv.reader(csv_file, delimiter=',')
with this:
positive_tweets = negative_tweets = ""

with open('positive_tweets.csv') as csv_file:
    positive_tweets_rdr = csv.reader(csv_file, delimiter=',')
    all = list(positive_tweets_rdr)
    for lst in all[1:]: positive_tweets += ' ' + lst[3]  # tweet column

with open('negative_tweets.csv') as csv_file:
    negative_tweets_rdr = csv.reader(csv_file, delimiter=',')
    all = list(negative_tweets_rdr)
    for lst in all[1:]: negative_tweets += ' ' + lst[3]  # tweet column
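If you would rather not hard-code the column index, a csv.DictReader variant of the same idea (my own sketch, assuming the header names shown in the CSV snippet above) looks the tweet text up by column name:

import csv

positive_tweets = ""
with open('positive_tweets.csv') as csv_file:
    # DictReader consumes the header row and exposes each row as a dict,
    # so the tweet text is fetched by column name rather than by position.
    for row in csv.DictReader(csv_file):
        positive_tweets += ' ' + row['tweet']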
There are a couple of problems with the sample code you provided:

- nltk's word_tokenize takes a string, but you are passing it a lazy csv reader object. You probably want to call word_tokenize on one of the fields of each row of the CSV.
- Your with statements close the csv files before you read any data from them (a short demonstration follows below).
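To make the second point concrete, here is a minimal sketch (my own illustration, assuming positive_tweets.csv exists): csv.reader is lazy, so consuming it after the with block has closed the file raises an error.

import csv

with open('positive_tweets.csv') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')

# The file is already closed at this point; iterating the lazy reader now
# fails with "ValueError: I/O operation on closed file".
try:
    rows = list(reader)
except ValueError as exc:
    print(exc)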
You want something like this (repeat for the negative tweets):
with open('positive_tweets.csv') as csv_file:
    positive_tweets = csv.reader(csv_file, delimiter=',')
    positive_tweet_tokens = [word_tokenize(t[3]) for t in positive_tweets]
PS: Also make sure your CSV file is well-formed. In the example above I naively slice out the 4th field of every row, which may not exist. You will want some error handling.
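One possible way to add that error handling (my own sketch, not the answer's code; it skips the header row and ignores rows without a usable tweet field):

import csv
from nltk.tokenize import word_tokenize

positive_tweet_tokens = []
with open('positive_tweets.csv') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    next(reader, None)  # skip the header row
    for row in reader:
        # Guard against short or malformed rows and empty tweet fields.
        if len(row) > 3 and row[3].strip():
            positive_tweet_tokens.append(word_tokenize(row[3]))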