Python 2.7 上的 UnicodeDecodeError
UnicodeDecodeError on Python 2.7
有一些问题。我正在对长度为 160 万的数据集进行 TwitterSentimentAnalysis。由于我的电脑无法完成工作(由于太多的计算),教授让我使用大学服务器。
我刚刚发现服务器上的 Python 版本是 2.7,因此在用 csv reader 读取文件时无法使用 encoding 参数。
每当遇到 UnicodeDecodeError
时,我都必须手动从数据集中删除对应的推文,否则无法完成其余的工作。我已经尝试了网站上所有相关问题中的做法,但都没有解决问题。
我只想跳过引发错误的行,因为集合足够大,可以让我进行很好的分析。
class UTF8Recoder:
    """Iterator that re-encodes a byte stream from ``encoding`` to UTF-8.

    The wrapped stream is decoded through the ``codecs`` stream reader for
    ``encoding``; each decoded line is handed back encoded as UTF-8 bytes.
    No error handler is passed to the stream reader, so undecodable input
    raises ``UnicodeDecodeError`` from ``next``.
    """

    def __init__(self, f, encoding):
        # Look up the StreamReader factory first, then wrap the open file.
        make_reader = codecs.getreader(encoding)
        self.reader = make_reader(f)

    def __iter__(self):
        return self

    def next(self):
        """Return the next line of the stream as UTF-8 encoded bytes."""
        decoded_line = self.reader.next()
        return decoded_line.encode("utf-8", errors='ignore')
class UnicodeReader:
    """CSV reader that yields each row as a list of ``unicode`` strings.

    The input ``f`` is wrapped in ``UTF8Recoder`` so that ``csv.reader``
    always sees UTF-8 bytes, whatever ``encoding`` the source uses.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        recoded_lines = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded_lines, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        """Return the next CSV row with every cell decoded to ``unicode``.

        Cells are decoded from UTF-8; undecodable bytes are replaced
        rather than raising.
        """
        cells = self.reader.next()
        decoded_cells = []
        for cell in cells:
            decoded_cells.append(unicode(cell, "utf-8", errors='replace'))
        return decoded_cells
def extraction(file, textCol, sentimentCol):
    """Read labelled tweets from a CSV file and return them filtered.

    Each row's sentiment cell is normalised to one of ``'positive'``,
    ``'negative'`` or ``'neutral'``; rows carrying any other value are
    skipped. The tweet text is passed through ``remove_stopwords`` and the
    collected ``[text, label]`` pairs are handed to ``filterWords``.

    Args:
        file: Path of the CSV file to read.
        textCol: Index of the column holding the tweet text.
        sentimentCol: Index of the column holding the sentiment label.

    Returns:
        The result of ``filterWords`` applied to the list of
        ``[text, label]`` pairs.
    """
    tweets = []
    # The builtin open() on Python 2.7 takes no ``encoding`` argument, so
    # decoding is delegated to UnicodeReader. The ``with`` block guarantees
    # the file is closed even when reading raises (e.g. UnicodeDecodeError).
    with open(file, "r") as fp:
        for row in UnicodeReader(fp):
            sentiment = row[sentimentCol]
            if sentiment in ('positive', '4'):
                label = 'positive'
            elif sentiment in ('negative', '0'):
                label = 'negative'
            # 'irrilevant' (sic) is matched exactly as the original code
            # spelled it.
            elif sentiment in ('irrilevant', '2', 'neutral'):
                label = 'neutral'
            else:
                continue
            tweets.append([remove_stopwords(row[textCol]), label])
    return filterWords(tweets)
错误:
Traceback (most recent call last):
File "sentimentAnalysis_v4.py", line 165, in <module>
newTweets = extraction("sentiment2.csv",5,0)
File "sentimentAnalysis_v4.py", line 47, in extraction
for row in tweetreader:
File "sentimentAnalysis_v4.py", line 29, in next
row = self.reader.next()
File "sentimentAnalysis_v4.py", line 19, in next
return self.reader.next().encode("utf-8", errors='ignore')
File "/usr/lib/python2.7/codecs.py", line 615, in next
line = self.readline()
File "/usr/lib/python2.7/codecs.py", line 530, in readline
data = self.read(readsize, firstline=True)
File "/usr/lib/python2.7/codecs.py", line 477, in read
newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xd9 in position 48: invalid continuation byte
如果您输入的数据格式不正确,我不会在此处使用 codecs
进行读取。
请改用较新的 io.open()
函数,并指定错误处理策略;使用 'replace'
应该就可以:
class ForgivingUTF8Recoder:
    """Iterator that reads a text file leniently and yields UTF-8 lines.

    The file named ``filename`` is opened with ``io.open`` using
    ``errors='replace'``, so bytes that cannot be decoded with ``encoding``
    become U+FFFD instead of raising ``UnicodeDecodeError``. Each line is
    returned re-encoded as UTF-8 bytes.
    """

    def __init__(self, filename, encoding):
        # newline='' disables newline translation so the csv module can
        # handle line breaks embedded in quoted values itself.
        self.reader = io.open(
            filename, newline='', encoding=encoding, errors='replace')

    def __iter__(self):
        return self

    def next(self):
        """Return the next line of the file as UTF-8 encoded bytes."""
        # The next() builtin advances the file object's iterator without
        # relying on a ``.next`` method being present on it.
        return next(self.reader).encode("utf-8", errors='ignore')
我将 newline
处理设置为 ''
以确保 CSV 模块能够正确处理值中的换行符。
让 UnicodeReader 改用 ForgivingUTF8Recoder,然后不再传递已打开的文件,只传递文件名:
tweetreader = UnicodeReader(file)
这不会让您跳过出错的行,而是把无法解码的字符替换为 U+FFFD REPLACEMENT CHARACTER 来处理这些行;如果您想跳过整行,仍然可以在各列中查找该字符。
有一些问题。我正在对长度为 160 万的数据集进行 TwitterSentimentAnalysis。由于我的电脑无法完成工作(由于太多的计算),教授让我使用大学服务器。
我刚刚发现服务器上的 Python 版本是 2.7,因此在用 csv reader 读取文件时无法使用 encoding 参数。
每当遇到 UnicodeDecodeError
时,我都必须手动从数据集中删除对应的推文,否则无法完成其余的工作。我已经尝试了网站上所有相关问题中的做法,但都没有解决问题。
我只想跳过引发错误的行,因为集合足够大,可以让我进行很好的分析。
class UTF8Recoder:
    """Iterator that re-encodes a byte stream from ``encoding`` to UTF-8.

    The wrapped stream is decoded through the ``codecs`` stream reader for
    ``encoding``; each decoded line is handed back encoded as UTF-8 bytes.
    No error handler is passed to the stream reader, so undecodable input
    raises ``UnicodeDecodeError`` from ``next``.
    """

    def __init__(self, f, encoding):
        # Look up the StreamReader factory first, then wrap the open file.
        make_reader = codecs.getreader(encoding)
        self.reader = make_reader(f)

    def __iter__(self):
        return self

    def next(self):
        """Return the next line of the stream as UTF-8 encoded bytes."""
        decoded_line = self.reader.next()
        return decoded_line.encode("utf-8", errors='ignore')
class UnicodeReader:
    """CSV reader that yields each row as a list of ``unicode`` strings.

    The input ``f`` is wrapped in ``UTF8Recoder`` so that ``csv.reader``
    always sees UTF-8 bytes, whatever ``encoding`` the source uses.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        recoded_lines = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded_lines, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        """Return the next CSV row with every cell decoded to ``unicode``.

        Cells are decoded from UTF-8; undecodable bytes are replaced
        rather than raising.
        """
        cells = self.reader.next()
        decoded_cells = []
        for cell in cells:
            decoded_cells.append(unicode(cell, "utf-8", errors='replace'))
        return decoded_cells
def extraction(file, textCol, sentimentCol):
    """Read labelled tweets from a CSV file and return them filtered.

    Each row's sentiment cell is normalised to one of ``'positive'``,
    ``'negative'`` or ``'neutral'``; rows carrying any other value are
    skipped. The tweet text is passed through ``remove_stopwords`` and the
    collected ``[text, label]`` pairs are handed to ``filterWords``.

    Args:
        file: Path of the CSV file to read.
        textCol: Index of the column holding the tweet text.
        sentimentCol: Index of the column holding the sentiment label.

    Returns:
        The result of ``filterWords`` applied to the list of
        ``[text, label]`` pairs.
    """
    tweets = []
    # The builtin open() on Python 2.7 takes no ``encoding`` argument, so
    # decoding is delegated to UnicodeReader. The ``with`` block guarantees
    # the file is closed even when reading raises (e.g. UnicodeDecodeError).
    with open(file, "r") as fp:
        for row in UnicodeReader(fp):
            sentiment = row[sentimentCol]
            if sentiment in ('positive', '4'):
                label = 'positive'
            elif sentiment in ('negative', '0'):
                label = 'negative'
            # 'irrilevant' (sic) is matched exactly as the original code
            # spelled it.
            elif sentiment in ('irrilevant', '2', 'neutral'):
                label = 'neutral'
            else:
                continue
            tweets.append([remove_stopwords(row[textCol]), label])
    return filterWords(tweets)
错误:
Traceback (most recent call last):
File "sentimentAnalysis_v4.py", line 165, in <module>
newTweets = extraction("sentiment2.csv",5,0)
File "sentimentAnalysis_v4.py", line 47, in extraction
for row in tweetreader:
File "sentimentAnalysis_v4.py", line 29, in next
row = self.reader.next()
File "sentimentAnalysis_v4.py", line 19, in next
return self.reader.next().encode("utf-8", errors='ignore')
File "/usr/lib/python2.7/codecs.py", line 615, in next
line = self.readline()
File "/usr/lib/python2.7/codecs.py", line 530, in readline
data = self.read(readsize, firstline=True)
File "/usr/lib/python2.7/codecs.py", line 477, in read
newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xd9 in position 48: invalid continuation byte
如果您输入的数据格式不正确,我不会在此处使用 codecs
进行读取。
请改用较新的 io.open()
函数,并指定错误处理策略;使用 'replace'
应该就可以:
class ForgivingUTF8Recoder:
    """Iterator that reads a text file leniently and yields UTF-8 lines.

    The file named ``filename`` is opened with ``io.open`` using
    ``errors='replace'``, so bytes that cannot be decoded with ``encoding``
    become U+FFFD instead of raising ``UnicodeDecodeError``. Each line is
    returned re-encoded as UTF-8 bytes.
    """

    def __init__(self, filename, encoding):
        # newline='' disables newline translation so the csv module can
        # handle line breaks embedded in quoted values itself.
        self.reader = io.open(
            filename, newline='', encoding=encoding, errors='replace')

    def __iter__(self):
        return self

    def next(self):
        """Return the next line of the file as UTF-8 encoded bytes."""
        # The next() builtin advances the file object's iterator without
        # relying on a ``.next`` method being present on it.
        return next(self.reader).encode("utf-8", errors='ignore')
我将 newline
处理设置为 ''
以确保 CSV 模块能够正确处理值中的换行符。
让 UnicodeReader 改用 ForgivingUTF8Recoder,然后不再传递已打开的文件,只传递文件名:
tweetreader = UnicodeReader(file)
这不会让您跳过出错的行,而是把无法解码的字符替换为 U+FFFD REPLACEMENT CHARACTER 来处理这些行;如果您想跳过整行,仍然可以在各列中查找该字符。