HTTPError: Internal Server Error while scraping a website
HTTPError: Internal Server Error while scraping a website
我尝试了很多方法来修复我的代码,但我总是收到 HTTPError: Internal Server Error
。
有人告诉我使用 pd.read_html
而不是 pd.read_excel
因为 excel 给我一个关于 excel 文件类型的错误,但我不确定是什么此时要做。
无论哪种方式我都会收到错误消息。
import urllib3
import requests
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import pandas as pd
import xlrd
dataframe=[]
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
url1="https://vialidad.mop.gob.cl"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rawpage = requests.get(url,headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
for link in soup.select('a[href*=".xls"]'):
s=url1+link["href"]
print(s)
c = pd.read_excel(s)
print(c)
我不确定为什么 pd.read_excel
会引发 HTTPError,但作为解决方法,您可以先使用 requests.get
获取 Excel 文件,然后通过 [=13] 加载它=] 通过使用 BytesIO
.
创建一个 in-memory 文件对象
from io import BytesIO
import pandas as pd
import requests
import xlrd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from requests.exceptions import HTTPError
dataframe = []
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
url1 = "https://vialidad.mop.gob.cl"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rawpage = requests.get(url,headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
for link in soup.select('a[href*=".xls"]'):
s = url1 + link["href"]
print(s)
try:
r = requests.get(s)
r.raise_for_status()
except HTTPError as e: # Some of the .xls links throw 401 errors
print(e.response.status_code, "error for", s)
continue
c = pd.read_excel(BytesIO(r.content))
print(c)
我尝试了很多方法来修复我的代码,但我总是收到 HTTPError: Internal Server Error
。
有人告诉我使用 pd.read_html
而不是 pd.read_excel
因为 excel 给我一个关于 excel 文件类型的错误,但我不确定是什么此时要做。
无论哪种方式我都会收到错误消息。
import urllib3
import requests
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import pandas as pd
import xlrd
dataframe=[]
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
url1="https://vialidad.mop.gob.cl"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rawpage = requests.get(url,headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
for link in soup.select('a[href*=".xls"]'):
s=url1+link["href"]
print(s)
c = pd.read_excel(s)
print(c)
我不确定为什么 pd.read_excel
会引发 HTTPError,但作为解决方法,您可以先使用 requests.get
获取 Excel 文件,然后通过 [=13] 加载它=] 通过使用 BytesIO
.
from io import BytesIO
import pandas as pd
import requests
import xlrd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from requests.exceptions import HTTPError
dataframe = []
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
url1 = "https://vialidad.mop.gob.cl"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rawpage = requests.get(url,headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
for link in soup.select('a[href*=".xls"]'):
s = url1 + link["href"]
print(s)
try:
r = requests.get(s)
r.raise_for_status()
except HTTPError as e: # Some of the .xls links throw 401 errors
print(e.response.status_code, "error for", s)
continue
c = pd.read_excel(BytesIO(r.content))
print(c)