使用 Tabula/Requests 时出现 HTTP 错误 403:禁止访问
HTTP Error 403: Forbidden with Tabula/Requests
我在使用 Tabula 时遇到错误 “urllib.error.HTTPError: HTTP Error 403: Forbidden”(HTTP 错误 403:禁止访问)。有没有办法解决这个问题?下面这段代码在今年的大部分时间里都能正常运行:
import tabula
from bs4 import BeautifulSoup
import requests

# Index page of the WHO novel-coronavirus-2019 situation reports.
url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml' )

# Take the href of the first anchor whose text contains 'Situation report'.
# NOTE: file_path stays unbound if no anchor matches.
hyperlink_tags = soup.find_all('a')
for hyperlink_tag in hyperlink_tags:
    if 'Situation report' in hyperlink_tag.text:
        file_path = hyperlink_tag['href']
        break

latest_report = f'https://who.int/{file_path}'
file = latest_report

# This is the call reported to fail with
# "urllib.error.HTTPError: HTTP Error 403: Forbidden": tabula is handed the
# URL and has to fetch the PDF itself.
# NOTE(review): presumably the server rejects the default urllib User-Agent
# used for that download -- confirm against the tabula-py documentation.
tables = tabula.read_pdf(file, stream=True, pages = "all", multiple_tables = True)
问题似乎出在最后一行,所以我不确定是 requests 还是 tabula 导致的。
该请求需要通过 headers 参数提供 User-Agent。我不确定如何在 tabula 中添加该参数,但您可以先自行下载 PDF 并将其写入文件,然后再用 tabula 读取该文件:
from urllib.parse import urljoin
from urllib.request import Request, urlopen

import requests
import tabula
from bs4 import BeautifulSoup

# Index page of the WHO novel-coronavirus-2019 situation reports.
url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'
# Local file the downloaded PDF is written to before tabula reads it.
pdf_path = 'c:/test/temp.pdf'

r = requests.get(url)
# Fail here on an HTTP error instead of parsing an error page for links.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'lxml')

# Take the href of the first anchor whose text contains 'Situation report'.
# href=True skips anchors that have no href attribute at all.
file_path = None
for hyperlink_tag in soup.find_all('a', href=True):
    if 'Situation report' in hyperlink_tag.text:
        file_path = hyperlink_tag['href']
        break
if file_path is None:
    raise RuntimeError(f"No 'Situation report' link found on {url}")

# Resolve the href against the page URL so that root-relative, relative and
# absolute links all produce a well-formed address (no doubled slash).
latest_report = urljoin(url, file_path)
file = latest_report

################################################
## Download the PDF ############################
# Send an explicit browser-like User-Agent with the request; the call that
# let tabula fetch the URL on its own was the one failing with HTTP 403.
url_request = Request(file, headers={"User-Agent": "Mozilla/5.0"})
with urlopen(url_request) as response:
    webpage = response.read()

# Open the output file only after the download succeeded, and let the
# context manager close it even if the write fails.
with open(pdf_path, 'wb') as f:
    f.write(webpage)
#################################################

tables = tabula.read_pdf(pdf_path, stream=False, pages="all", multiple_tables=True)
我在使用 Tabula 时遇到错误 “urllib.error.HTTPError: HTTP Error 403: Forbidden”(HTTP 错误 403:禁止访问)。有没有办法解决这个问题?下面这段代码在今年的大部分时间里都能正常运行:
import tabula
from bs4 import BeautifulSoup
import requests

# Index page of the WHO novel-coronavirus-2019 situation reports.
url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml' )

# Take the href of the first anchor whose text contains 'Situation report'.
# NOTE: file_path stays unbound if no anchor matches.
hyperlink_tags = soup.find_all('a')
for hyperlink_tag in hyperlink_tags:
    if 'Situation report' in hyperlink_tag.text:
        file_path = hyperlink_tag['href']
        break

latest_report = f'https://who.int/{file_path}'
file = latest_report

# This is the call reported to fail with
# "urllib.error.HTTPError: HTTP Error 403: Forbidden": tabula is handed the
# URL and has to fetch the PDF itself.
# NOTE(review): presumably the server rejects the default urllib User-Agent
# used for that download -- confirm against the tabula-py documentation.
tables = tabula.read_pdf(file, stream=True, pages = "all", multiple_tables = True)
问题似乎出在最后一行,所以我不确定是 requests 还是 tabula 导致的。
该请求需要通过 headers 参数提供 User-Agent。我不确定如何在 tabula 中添加该参数,但您可以先自行下载 PDF 并将其写入文件,然后再用 tabula 读取该文件:
from urllib.parse import urljoin
from urllib.request import Request, urlopen

import requests
import tabula
from bs4 import BeautifulSoup

# Index page of the WHO novel-coronavirus-2019 situation reports.
url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'
# Local file the downloaded PDF is written to before tabula reads it.
pdf_path = 'c:/test/temp.pdf'

r = requests.get(url)
# Fail here on an HTTP error instead of parsing an error page for links.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'lxml')

# Take the href of the first anchor whose text contains 'Situation report'.
# href=True skips anchors that have no href attribute at all.
file_path = None
for hyperlink_tag in soup.find_all('a', href=True):
    if 'Situation report' in hyperlink_tag.text:
        file_path = hyperlink_tag['href']
        break
if file_path is None:
    raise RuntimeError(f"No 'Situation report' link found on {url}")

# Resolve the href against the page URL so that root-relative, relative and
# absolute links all produce a well-formed address (no doubled slash).
latest_report = urljoin(url, file_path)
file = latest_report

################################################
## Download the PDF ############################
# Send an explicit browser-like User-Agent with the request; the call that
# let tabula fetch the URL on its own was the one failing with HTTP 403.
url_request = Request(file, headers={"User-Agent": "Mozilla/5.0"})
with urlopen(url_request) as response:
    webpage = response.read()

# Open the output file only after the download succeeded, and let the
# context manager close it even if the write fails.
with open(pdf_path, 'wb') as f:
    f.write(webpage)
#################################################

tables = tabula.read_pdf(pdf_path, stream=False, pages="all", multiple_tables=True)