即使在添加 encoding='utf-8' 命令后,Unicode 错误也不会消失
Unicode error won't go away even after adding encoding='utf-8' command
我正在尝试从该网站抓取。但是我遇到了 unicode 错误。我做了一些搜索,它似乎是一个编码问题?但在添加 encoding='utf-8' 之后它并没有消失。不确定是什么问题。
import bs4 as bs
import urllib.request
import csv
import numpy as np
base_url = "https://www.mobygames.com/developer/sheet/view/developerId,"
url_list =[]
with open('url.csv', 'r') as f:
reader = csv.reader(f)
for row in reader:
url_list.append(row[0])
def extract(gameurl):
req = urllib.request.Request(gameurl,headers={'User-Agent': 'Mozilla/5.0'})
sauce = urllib.request.urlopen(req).read()
soup = bs.BeautifulSoup(sauce,'lxml')
infopage = soup.find_all("div", {"class":"col-md-8 col-lg-8"})
core_list =[]
for credits in infopage:
niceHeaderTitle = credits.find_all("h1", {"class":"niceHeaderTitle"})
name = niceHeaderTitle[0].text
Titles = credits.find_all("h3", {"class":"clean"})
Titles = [title.get_text() for title in Titles]
tr = credits.find_all("tr")
for i in range(len(tr)):
row = tr[i].get_text(strip=True)
if row in Titles:
title = row
elif len(row) > 1:
games=[name,title,row]
core_list.append(games)
core_list = np.matrix(core_list)
return core_list
def csv_write(url_data):
with open ('HRdata.csv','a',encoding='utf-8') as file:
writer=csv.writer(file)
for row in url_data:
writer.writerow(row)
for url in url_list:
link = base_url + url
url_data = extract(link)
csv_write(url_data)
我以为是因为当我试图将它写入 csv 文件时,我添加了 encoding='utf-8' 但它不起作用...不知道我应该怎么做才能解决这个问题。
这是错误信息
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-22-31928933be8c> in <module>()
52 for url in url_list:
53 link = base_url + url
---> 54 url_data = extract(link)
55 csv_write(url_data)
56
<ipython-input-22-31928933be8c> in extract(gameurl)
15 def extract(gameurl):
16 req = urllib.request.Request(gameurl,headers={'User-Agent': 'Mozilla/5.0'})
---> 17 sauce = urllib.request.urlopen(req).read()
18 soup = bs.BeautifulSoup(sauce,'lxml')
19 infopage = soup.find_all("div", {"class":"col-md-8 col-lg-8"})
C:\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
C:\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 req = meth(req)
525
--> 526 response = self._open(req, data)
527
528 # post-process response
C:\Anaconda3\lib\urllib\request.py in _open(self, req, data)
542 protocol = req.type
543 result = self._call_chain(self.handle_open, protocol, protocol +
--> 544 '_open', req)
545 if result:
546 return result
C:\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
C:\Anaconda3\lib\urllib\request.py in https_open(self, req)
1359 def https_open(self, req):
1360 return self.do_open(http.client.HTTPSConnection, req,
-> 1361 context=self._context, check_hostname=self._check_hostname)
1362
1363 https_request = AbstractHTTPHandler.do_request_
C:\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
1316 try:
1317 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318 encode_chunked=req.has_header('Transfer-encoding'))
1319 except OSError as err: # timeout error
1320 raise URLError(err)
C:\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
1237 encode_chunked=False):
1238 """Send a complete request to the server."""
-> 1239 self._send_request(method, url, body, headers, encode_chunked)
1240
1241 def _send_request(self, method, url, body, headers, encode_chunked):
C:\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
1248 skips['skip_accept_encoding'] = 1
1249
-> 1250 self.putrequest(method, url, **skips)
1251
1252 # chunked encoding will happen if HTTP/1.1 is used and either
C:\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
1115
1116 # Non-ASCII characters should have been eliminated earlier
-> 1117 self._output(request.encode('ascii'))
1118
1119 if self._http_vsn == 11:
UnicodeEncodeError: 'ascii' codec can't encode characters in position 38-40: ordinal not in range(128)
http\client.py
正在尝试 ascii
编码您的 gameurl
字符串,它不能这样做,因为它包含一个不在 ascii
字符中的字符设置。
您需要 URL encode url,不包括方案 (https://
),方法是使用 urllib.parse.quote()
函数。您只需要更改此 for
循环中的第一行:
for url in url_list:
link = base_url + urllib.parse.quote(url) # just doing the end is fine in this case
url_data = extract(link)
csv_write(url_data)
或者,您可以使用流行的 Requests 模块,它可以无缝地为您解决这个问题(我强烈推荐!)。
我正在尝试从该网站抓取。但是我遇到了 unicode 错误。我做了一些搜索,它似乎是一个编码问题?但在添加 encoding='utf-8' 之后它并没有消失。不确定是什么问题。
import bs4 as bs
import urllib.request
import csv
import numpy as np
base_url = "https://www.mobygames.com/developer/sheet/view/developerId,"
url_list =[]
with open('url.csv', 'r') as f:
reader = csv.reader(f)
for row in reader:
url_list.append(row[0])
def extract(gameurl):
req = urllib.request.Request(gameurl,headers={'User-Agent': 'Mozilla/5.0'})
sauce = urllib.request.urlopen(req).read()
soup = bs.BeautifulSoup(sauce,'lxml')
infopage = soup.find_all("div", {"class":"col-md-8 col-lg-8"})
core_list =[]
for credits in infopage:
niceHeaderTitle = credits.find_all("h1", {"class":"niceHeaderTitle"})
name = niceHeaderTitle[0].text
Titles = credits.find_all("h3", {"class":"clean"})
Titles = [title.get_text() for title in Titles]
tr = credits.find_all("tr")
for i in range(len(tr)):
row = tr[i].get_text(strip=True)
if row in Titles:
title = row
elif len(row) > 1:
games=[name,title,row]
core_list.append(games)
core_list = np.matrix(core_list)
return core_list
def csv_write(url_data):
with open ('HRdata.csv','a',encoding='utf-8') as file:
writer=csv.writer(file)
for row in url_data:
writer.writerow(row)
for url in url_list:
link = base_url + url
url_data = extract(link)
csv_write(url_data)
我以为是因为当我试图将它写入 csv 文件时,我添加了 encoding='utf-8' 但它不起作用...不知道我应该怎么做才能解决这个问题。
这是错误信息
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-22-31928933be8c> in <module>()
52 for url in url_list:
53 link = base_url + url
---> 54 url_data = extract(link)
55 csv_write(url_data)
56
<ipython-input-22-31928933be8c> in extract(gameurl)
15 def extract(gameurl):
16 req = urllib.request.Request(gameurl,headers={'User-Agent': 'Mozilla/5.0'})
---> 17 sauce = urllib.request.urlopen(req).read()
18 soup = bs.BeautifulSoup(sauce,'lxml')
19 infopage = soup.find_all("div", {"class":"col-md-8 col-lg-8"})
C:\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
C:\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 req = meth(req)
525
--> 526 response = self._open(req, data)
527
528 # post-process response
C:\Anaconda3\lib\urllib\request.py in _open(self, req, data)
542 protocol = req.type
543 result = self._call_chain(self.handle_open, protocol, protocol +
--> 544 '_open', req)
545 if result:
546 return result
C:\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
C:\Anaconda3\lib\urllib\request.py in https_open(self, req)
1359 def https_open(self, req):
1360 return self.do_open(http.client.HTTPSConnection, req,
-> 1361 context=self._context, check_hostname=self._check_hostname)
1362
1363 https_request = AbstractHTTPHandler.do_request_
C:\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
1316 try:
1317 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318 encode_chunked=req.has_header('Transfer-encoding'))
1319 except OSError as err: # timeout error
1320 raise URLError(err)
C:\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
1237 encode_chunked=False):
1238 """Send a complete request to the server."""
-> 1239 self._send_request(method, url, body, headers, encode_chunked)
1240
1241 def _send_request(self, method, url, body, headers, encode_chunked):
C:\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
1248 skips['skip_accept_encoding'] = 1
1249
-> 1250 self.putrequest(method, url, **skips)
1251
1252 # chunked encoding will happen if HTTP/1.1 is used and either
C:\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
1115
1116 # Non-ASCII characters should have been eliminated earlier
-> 1117 self._output(request.encode('ascii'))
1118
1119 if self._http_vsn == 11:
UnicodeEncodeError: 'ascii' codec can't encode characters in position 38-40: ordinal not in range(128)
http\client.py
正在尝试 ascii
编码您的 gameurl
字符串,它不能这样做,因为它包含一个不在 ascii
字符中的字符设置。
您需要 URL encode url,不包括方案 (https://
),方法是使用 urllib.parse.quote()
函数。您只需要更改此 for
循环中的第一行:
for url in url_list:
link = base_url + urllib.parse.quote(url) # just doing the end is fine in this case
url_data = extract(link)
csv_write(url_data)
或者,您可以使用流行的 Requests 模块,它可以无缝地为您解决这个问题(我强烈推荐!)。