Python NLTK Word_tokenize UnicodeDecodeError
I get an error when trying the following code. I am trying to read from a text file and tokenize the words using nltk. Any ideas? The text file can be found here
from nltk.tokenize import word_tokenize
short_pos = open("./positive.txt","r").read()
#short_pos = short_pos.decode('utf-8').lower()
short_pos_words = word_tokenize(short_pos)
Error:
Traceback (most recent call last):
File "sentimentAnalysis.py", line 19, in <module>
short_pos_words = word_tokenize(short_pos)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 106, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 91, in sent_tokenize
return tokenizer.tokenize(text)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1226, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1274, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1265, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1304, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 311, in _pair_iter
for el in it:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1280, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1325, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1460, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
prev = next(it)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 577, in _annotate_first_pass
for aug_tok in tokens:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xed in position 6: ordinal not in range(128)
Thanks for your support.
It looks like the text is encoded in Latin-1, so this works for me:
import codecs
with codecs.open("positive.txt", "r", "latin-1") as inputfile:
    text = inputfile.read()
short_pos_words = word_tokenize(text)
print len(short_pos_words)
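For what it's worth, on Python 3 the codecs module isn't needed, because the built-in open() accepts an encoding argument; a sketch of the equivalent read (assuming Python 3 and the same positive.txt):

from nltk.tokenize import word_tokenize

# Python 3: pass the encoding directly to open().
with open("positive.txt", "r", encoding="latin-1") as inputfile:
    text = inputfile.read()

short_pos_words = word_tokenize(text)
print(len(short_pos_words))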
You can test different encodings by, for example, looking at the file in a good editor like TextWrangler. You can
1) open the file with different encodings to see which one looks right, and
2) look at the character that caused the problem. In your case, that is the character at position 4645 - an accented word in a Spanish review, as it happens. It is not part of ASCII, so that won't work; it is also not a valid code point in UTF-8. (A small diagnostic sketch follows this list.)
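If you prefer to check from Python instead of an editor, here is a minimal diagnostic sketch (Python 2, matching the traceback above; the offset 4645 is just the position reported for this particular file):

# Read the raw, undecoded bytes of the file.
raw = open("positive.txt", "rb").read()

# Peek at the bytes around the reported problem position.
print repr(raw[4640:4660])

# Try a few candidate encodings and see which ones decode the whole file.
for enc in ("ascii", "utf-8", "latin-1", "cp1252"):
    try:
        raw.decode(enc)
        print enc, "decodes cleanly"
    except UnicodeDecodeError as e:
        print enc, "fails:", e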
Your file is encoded in "latin-1".
from nltk.tokenize import word_tokenize
import codecs

with codecs.open("positive.txt", "r", "latin-1") as inputfile:
    text = inputfile.read()

short_pos_words = word_tokenize(text)
print short_pos_words