How to fix UnicodeDecodeError: 'ascii' codec can't decode byte?
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)
This is the error I get when trying to clean a list of names that I extracted from an HTML page using spaCy.
My code:
from __future__ import unicode_literals  # a __future__ import must come before all other imports

import urllib
import requests
from bs4 import BeautifulSoup
import spacy
from spacy.en import English

nlp_toolkit = English()
nlp = spacy.load('en')

def get_text(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    # delete unwanted tags:
    for s in soup(['figure', 'script', 'style']):
        s.decompose()
    # use separator to separate paragraphs and subtitles!
    article_soup = [s.get_text(separator="\n", strip=True) for s in soup.find_all('div', {'class': 'story-body__inner'})]
    text = ''.join(article_soup)
    return text
# using spacy
def get_names(all_tags):
    names = []
    for ent in all_tags.ents:
        if ent.label_ == "PERSON":
            names.append(str(ent))
    return names

def cleaning_names(names):
    new_names = [s.strip("'s") for s in names]  # remove 's' from names
    myset = list(set(new_names))  # remove duplicates
    return myset
def main():
    url = "http://www.bbc.co.uk/news/uk-politics-39784164"
    text = get_text(url)
    text = u"{}".format(text)
    all_tags = nlp(text)
    names = get_names(all_tags)
    print "names:"
    print names
    mynewlist = cleaning_names(names)
    print mynewlist

if __name__ == '__main__':
    main()
For this particular URL I get a list of names that contains characters such as £ or $:
['Nick Clegg', 'Brexit', '\xc2\xa359bn', 'Theresa May', 'Brexit',
'Brexit', 'Mr Clegg', 'Mr Clegg', 'Mr Clegg', 'Brexit', 'Mr Clegg',
'Theresa May']
and then I get the error:
Traceback (most recent call last) <ipython-input-19-8582e806c94a> in <module>()
47
48 if __name__ == '__main__':
---> 49 main()
<ipython-input-19-8582e806c94a> in main()
43 print "names:"
44 print names
---> 45 mynewlist = cleaning_names(names)
46 print mynewlist
47
<ipython-input-19-8582e806c94a> in cleaning_names(names)
31
32 def cleaning_names(names):
---> 33 new_names = [s.strip("'s") for s in names] # remove 's' from names
34 myset = list(set(new_names)) #remove duplicates
35 return myset
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)
I tried different ways of fixing the unicode problem (including sys.setdefaultencoding('utf8')), but nothing has worked. I hope somebody has run into the same problem before and can suggest a fix. Thank you!
A decode error with the 'ascii' codec usually indicates that a byte string was used in a context where a Unicode string is required (in Python 2; Python 3 doesn't allow this mixing at all).
Since you have imported from __future__ import unicode_literals, the string "'s" is Unicode. That means the strings you are trying to strip must be Unicode strings as well. Fix that and you will no longer get the error.
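As a minimal sketch of the mismatch (my own illustration, not part of the original answer): passing the Unicode "'s" to strip() on a byte string forces Python 2 to decode the bytes with the implicit 'ascii' codec, which fails on the first non-ASCII byte. Decoding the byte string first (assumed here to be UTF-8, which is what the BBC page serves) avoids it:
from __future__ import unicode_literals

name = '\xc2\xa359bn'  # byte string as produced by str(ent): UTF-8 encoded "£59bn"
# name.strip("'s")     # raises UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2

fixed = name.decode('utf-8').strip("'s")  # decode to Unicode first, then strip is safe
print repr(fixed)  # u'\xa359bn', i.e. £59bn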
I finally fixed my code. I am surprised how easy it looks, but it took me ages to get there, and I have seen many people puzzled by the same problem, so I decided to post my answer.
Adding this small function before passing the names on for further cleaning solved my problem:
def decode(names):
    decodednames = []
    for name in names:
        decodednames.append(unicode(name, errors='ignore'))
    return decodednames
SpaCy will still think that £59bn is a person, but that's OK with me; I can deal with that later in my code.
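One thing worth noting (my observation, not part of the original answer): unicode(name, errors='ignore') decodes with the default 'ascii' codec and silently drops every byte it cannot decode, which is why the pound sign vanishes from the output further down:
>>> unicode('\xc2\xa359bn', errors='ignore')  # the two UTF-8 bytes of '£' are discarded
u'59bn'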
The working code:
from __future__ import unicode_literals  # a __future__ import must come before all other imports

import urllib
import requests
from bs4 import BeautifulSoup
import spacy
from spacy.en import English

nlp_toolkit = English()
nlp = spacy.load('en')

def get_text(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    # delete unwanted tags:
    for s in soup(['figure', 'script', 'style']):
        s.decompose()
    # use separator to separate paragraphs and subtitles!
    article_soup = [s.get_text(separator="\n", strip=True) for s in soup.find_all('div', {'class': 'story-body__inner'})]
    text = ''.join(article_soup)
    return text
# using spacy
def get_names(all_tags):
    names = []
    for ent in all_tags.ents:
        if ent.label_ == "PERSON":
            names.append(str(ent))
    return names

def decode(names):
    decodednames = []
    for name in names:
        decodednames.append(unicode(name, errors='ignore'))
    return decodednames

def cleaning_names(names):
    new_names = [s.strip("'s") for s in names]  # remove 's' from names
    myset = list(set(new_names))  # remove duplicates
    return myset
def main():
    url = "http://www.bbc.co.uk/news/uk-politics-39784164"
    text = get_text(url)
    text = u"{}".format(text)
    all_tags = nlp(text)
    names = get_names(all_tags)
    print "names:"
    print names
    decodednames = decode(names)
    mynewlist = cleaning_names(decodednames)
    print mynewlist

if __name__ == '__main__':
    main()
This gives me no errors:
names: ['Nick Clegg', 'Brexit', '\xc2\xa359bn', 'Theresa May',
'Brexit', 'Brexit', 'Mr Clegg', 'Mr Clegg', 'Mr Clegg', 'Brexit', 'Mr
Clegg', 'Theresa May'] [u'Mr Clegg', u'Brexit', u'Nick Clegg',
u'59bn', u'Theresa May']
As @MarkRansom commented, ignoring non-ascii characters will come back to bite you.
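If you do stay on Python 2, a less lossy variant of the decode() helper above (my own sketch, assuming the byte strings are UTF-8, which is what the BBC serves) keeps the pound sign instead of discarding it:
def decode(names):
    # decode the UTF-8 bytes instead of ignoring them, so '\xc2\xa359bn' becomes u'\xa359bn' ('£59bn')
    return [name.decode('utf-8') for name in names]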
First take a look at:
- How to fix: "UnicodeDecodeError: 'ascii' codec can't decode byte"
- to clean text belonging to different languages in python
Also, please note that this is an anti-pattern: Why should we NOT use sys.setdefaultencoding("utf-8") in a py script?
The easiest solution is to just use Python 3, which alleviates a lot of the pain:
>>> import requests
>>> from bs4 import BeautifulSoup
>>> import spacy
>>> nlp = spacy.load('en')
>>> url = "http://www.bbc.co.uk/news/uk-politics-39784164"
>>> html = requests.get(url).content
>>> bsoup = BeautifulSoup(html, 'html.parser')
>>> text = '\n'.join(p.text for d in bsoup.find_all('div', {'class': 'story-body__inner'}) for p in d.find_all('p') if p.text.strip())
>>> doc = nlp(text)
>>> names = [ent for ent in doc.ents if ent.label_ == 'PERSON']  # spaCy entity spans expose their type as .label_
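In Python 3 every str is already Unicode, so neither str(ent) nor strip() can trigger an implicit ascii decode, and the whole class of UnicodeDecodeError shown above simply disappears.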