Python Web 抓取请求导致 406 错误
Python Web Scrape Request resulting in a 406 Error
我正在努力为学校项目抓取 https://registry.verra.org/app/search/VCS/All%20Projects。我正在尝试通过复制在后台进行的 POST 请求向“下载 excel”按钮发送请求。
这是我目前的情况。
import requests
import datetime as dt
# Endpoint queried by the site's "download excel" button; the query string
# asks for the search result as an Excel export named allprojects.xlsx.
url_back = (
    "https://registry.verra.org/uiapi/resource/resource/search"
    "?$skip=0&count=true&$format=excel&$exportFileName=allprojects.xlsx"
)

# Search filter: the VCS program, limited to the listed resource statuses.
data = {
    "program": "VCS",
    "resourceStatuses": [
        "VCS_EX_CRD_PRD_VER_REQUESTED",
        "VCS_EX_CRD_PRD_REQUESTED",
        "VCS_EX_REGISTERED",
        "VCS_EX_REG_VER_APPR_REQUESTED",
        "VCS_EX_REGISTRATION_REQUESTED",
        "VCS_EX_REJ",
        "VCS_EX_UNDER_DEVELOPMENT_CLD",
        "VCS_EX_UNDER_DEVELOPMENT_OPN",
        "VCS_EX_UNDER_VALIDATION_CLD",
        "VCS_EX_UNDER_VALIDATION_OPN",
        "VCS_EX_CRED_TRANS_FRM_OTHER_PROG",
        "VCS_EX_WITHDRAWN",
    ],
}

# Request headers, presumably copied from the browser's own POST request.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Length": "369",
    "Content-Type": "application/json",
    "Cookie": (
        "fpestid=9g1E7EZczSniadmveW8TL8DIBB_w-MDFov_fr0DQqgBD46kgkoVSzIdQHKP-hSxMbBr4tg; "
        "_ga=GA1.2.1884498504.1652482731; "
        "_gid=GA1.2.1741997157.1652482731; "
        "ASPSESSIONIDQERRTRAR=BFIILIADNEINGJAKKMCJGKKO"
    ),
    "Host": "registry.verra.org",
    "Origin": "https://registry.verra.org",
    "Referer": "https://registry.verra.org/app/search/VCS/All%20Projects",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"
    ),
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": "Android",
}

# NOTE: data= hands the dict to requests as form fields rather than as a
# JSON document, even though Content-Type above announces JSON.
response = requests.post(url_back, data=data, headers=headers)
print(response)

# Store the raw response body on disk, whatever the status code was.
with open("dwnld.xlsx", "wb") as out_file:
    out_file.write(response.content)
但是,每次返回的响应都是 406 错误,即使我在 Accept 请求头中使用了“*/*”,并且使用了本不应被拦截的有效“User-Agent”。为什么这个 POST 请求无法返回真正的响应,大家有什么想法吗?
请求体(body)的数据是 JSON。因此,您必须按照请求头中声明的格式以 JSON 发送数据,例如 json=data
import requests
import datetime as dt
# Endpoint queried by the site's "download excel" button; the query string
# asks for the search result as an Excel export named allprojects.xlsx.
url_back = (
    "https://registry.verra.org/uiapi/resource/resource/search"
    "?$skip=0&count=true&$format=excel&$exportFileName=allprojects.xlsx"
)

# Search filter: the VCS program, limited to the listed resource statuses.
data = {
    "program": "VCS",
    "resourceStatuses": [
        "VCS_EX_CRD_PRD_VER_REQUESTED",
        "VCS_EX_CRD_PRD_REQUESTED",
        "VCS_EX_REGISTERED",
        "VCS_EX_REG_VER_APPR_REQUESTED",
        "VCS_EX_REGISTRATION_REQUESTED",
        "VCS_EX_REJ",
        "VCS_EX_UNDER_DEVELOPMENT_CLD",
        "VCS_EX_UNDER_DEVELOPMENT_OPN",
        "VCS_EX_UNDER_VALIDATION_CLD",
        "VCS_EX_UNDER_VALIDATION_OPN",
        "VCS_EX_CRED_TRANS_FRM_OTHER_PROG",
        "VCS_EX_WITHDRAWN",
    ],
}

# Request headers, presumably copied from the browser's own POST request.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Length": "369",
    "Content-Type": "application/json",
    "Cookie": (
        "fpestid=9g1E7EZczSniadmveW8TL8DIBB_w-MDFov_fr0DQqgBD46kgkoVSzIdQHKP-hSxMbBr4tg; "
        "_ga=GA1.2.1884498504.1652482731; "
        "_gid=GA1.2.1741997157.1652482731; "
        "ASPSESSIONIDQERRTRAR=BFIILIADNEINGJAKKMCJGKKO"
    ),
    "Host": "registry.verra.org",
    "Origin": "https://registry.verra.org",
    "Referer": "https://registry.verra.org/app/search/VCS/All%20Projects",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"
    ),
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": "Android",
}

# json= makes requests serialize the dict as a JSON body, which is what the
# Content-Type header above announces.
response = requests.post(url_back, json=data, headers=headers)
print(response)

# Saving the workbook is left disabled in this snippet:
# with open('dwnld.xlsx', 'wb') as f:
#     f.write(response.content)
尝试使用 json= 参数而不是 data=。headers= 没有必要:
import requests
# Export endpoint of the registry search; the percent-encoded query string
# asks for the result as an Excel file named allprojects.xlsx.
url = "https://registry.verra.org/uiapi/resource/resource/search?%24skip=0&count=true&%24format=excel&%24exportFileName=allprojects.xlsx"

# Search filter: the VCS program, limited to the listed resource statuses.
payload = {
    "program": "VCS",
    "resourceStatuses": [
        "VCS_EX_CRD_PRD_VER_REQUESTED",
        "VCS_EX_CRD_PRD_REQUESTED",
        "VCS_EX_REGISTERED",
        "VCS_EX_REG_VER_APPR_REQUESTED",
        "VCS_EX_REGISTRATION_REQUESTED",
        "VCS_EX_REJ",
        "VCS_EX_UNDER_DEVELOPMENT_CLD",
        "VCS_EX_UNDER_DEVELOPMENT_OPN",
        "VCS_EX_UNDER_VALIDATION_CLD",
        "VCS_EX_UNDER_VALIDATION_OPN",
        "VCS_EX_CRED_TRANS_FRM_OTHER_PROG",
        "VCS_EX_WITHDRAWN",
    ],
}

# Fetch first and open the output file only afterwards, so a failed request
# neither leaves an empty dwnld.xlsx behind nor stores an error body as a
# workbook. json= sends the payload as a JSON body; the timeout keeps the
# script from hanging forever on an unresponsive server.
response = requests.post(url, json=payload, timeout=120)
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx (e.g. 406)

with open("dwnld.xlsx", "wb") as f_out:
    f_out.write(response.content)
这会保存 dwnld.xlsx(来自 LibreOffice 的屏幕截图):
headers = {
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
...
您已告知网站您将只接受使用这些特定编码和这些特定语言的响应。
但是网站无法提供这些。所以它返回 406,告诉你它不能满足你的要求。
我正在努力为学校项目抓取 https://registry.verra.org/app/search/VCS/All%20Projects。我正在尝试通过复制在后台进行的 POST 请求向“下载 excel”按钮发送请求。
这是我目前的情况。
import requests
import datetime as dt
# Endpoint queried by the site's "download excel" button; the query string
# asks for the search result as an Excel export named allprojects.xlsx.
url_back = (
    "https://registry.verra.org/uiapi/resource/resource/search"
    "?$skip=0&count=true&$format=excel&$exportFileName=allprojects.xlsx"
)

# Search filter: the VCS program, limited to the listed resource statuses.
data = {
    "program": "VCS",
    "resourceStatuses": [
        "VCS_EX_CRD_PRD_VER_REQUESTED",
        "VCS_EX_CRD_PRD_REQUESTED",
        "VCS_EX_REGISTERED",
        "VCS_EX_REG_VER_APPR_REQUESTED",
        "VCS_EX_REGISTRATION_REQUESTED",
        "VCS_EX_REJ",
        "VCS_EX_UNDER_DEVELOPMENT_CLD",
        "VCS_EX_UNDER_DEVELOPMENT_OPN",
        "VCS_EX_UNDER_VALIDATION_CLD",
        "VCS_EX_UNDER_VALIDATION_OPN",
        "VCS_EX_CRED_TRANS_FRM_OTHER_PROG",
        "VCS_EX_WITHDRAWN",
    ],
}

# Request headers, presumably copied from the browser's own POST request.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Length": "369",
    "Content-Type": "application/json",
    "Cookie": (
        "fpestid=9g1E7EZczSniadmveW8TL8DIBB_w-MDFov_fr0DQqgBD46kgkoVSzIdQHKP-hSxMbBr4tg; "
        "_ga=GA1.2.1884498504.1652482731; "
        "_gid=GA1.2.1741997157.1652482731; "
        "ASPSESSIONIDQERRTRAR=BFIILIADNEINGJAKKMCJGKKO"
    ),
    "Host": "registry.verra.org",
    "Origin": "https://registry.verra.org",
    "Referer": "https://registry.verra.org/app/search/VCS/All%20Projects",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"
    ),
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": "Android",
}

# NOTE: data= hands the dict to requests as form fields rather than as a
# JSON document, even though Content-Type above announces JSON.
response = requests.post(url_back, data=data, headers=headers)
print(response)

# Store the raw response body on disk, whatever the status code was.
with open("dwnld.xlsx", "wb") as out_file:
    out_file.write(response.content)
但是,每次返回的响应都是 406 错误,即使我在 Accept 请求头中使用了“*/*”,并且使用了本不应被拦截的有效“User-Agent”。为什么这个 POST 请求无法返回真正的响应,大家有什么想法吗?
请求体(body)的数据是 JSON。因此,您必须按照请求头中声明的格式以 JSON 发送数据,例如 json=data
import requests
import datetime as dt
# Endpoint queried by the site's "download excel" button; the query string
# asks for the search result as an Excel export named allprojects.xlsx.
url_back = (
    "https://registry.verra.org/uiapi/resource/resource/search"
    "?$skip=0&count=true&$format=excel&$exportFileName=allprojects.xlsx"
)

# Search filter: the VCS program, limited to the listed resource statuses.
data = {
    "program": "VCS",
    "resourceStatuses": [
        "VCS_EX_CRD_PRD_VER_REQUESTED",
        "VCS_EX_CRD_PRD_REQUESTED",
        "VCS_EX_REGISTERED",
        "VCS_EX_REG_VER_APPR_REQUESTED",
        "VCS_EX_REGISTRATION_REQUESTED",
        "VCS_EX_REJ",
        "VCS_EX_UNDER_DEVELOPMENT_CLD",
        "VCS_EX_UNDER_DEVELOPMENT_OPN",
        "VCS_EX_UNDER_VALIDATION_CLD",
        "VCS_EX_UNDER_VALIDATION_OPN",
        "VCS_EX_CRED_TRANS_FRM_OTHER_PROG",
        "VCS_EX_WITHDRAWN",
    ],
}

# Request headers, presumably copied from the browser's own POST request.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Length": "369",
    "Content-Type": "application/json",
    "Cookie": (
        "fpestid=9g1E7EZczSniadmveW8TL8DIBB_w-MDFov_fr0DQqgBD46kgkoVSzIdQHKP-hSxMbBr4tg; "
        "_ga=GA1.2.1884498504.1652482731; "
        "_gid=GA1.2.1741997157.1652482731; "
        "ASPSESSIONIDQERRTRAR=BFIILIADNEINGJAKKMCJGKKO"
    ),
    "Host": "registry.verra.org",
    "Origin": "https://registry.verra.org",
    "Referer": "https://registry.verra.org/app/search/VCS/All%20Projects",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"
    ),
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": "Android",
}

# json= makes requests serialize the dict as a JSON body, which is what the
# Content-Type header above announces.
response = requests.post(url_back, json=data, headers=headers)
print(response)

# Saving the workbook is left disabled in this snippet:
# with open('dwnld.xlsx', 'wb') as f:
#     f.write(response.content)
尝试使用 json= 参数而不是 data=。headers= 没有必要:
import requests
# Export endpoint of the registry search; the percent-encoded query string
# asks for the result as an Excel file named allprojects.xlsx.
url = "https://registry.verra.org/uiapi/resource/resource/search?%24skip=0&count=true&%24format=excel&%24exportFileName=allprojects.xlsx"

# Search filter: the VCS program, limited to the listed resource statuses.
payload = {
    "program": "VCS",
    "resourceStatuses": [
        "VCS_EX_CRD_PRD_VER_REQUESTED",
        "VCS_EX_CRD_PRD_REQUESTED",
        "VCS_EX_REGISTERED",
        "VCS_EX_REG_VER_APPR_REQUESTED",
        "VCS_EX_REGISTRATION_REQUESTED",
        "VCS_EX_REJ",
        "VCS_EX_UNDER_DEVELOPMENT_CLD",
        "VCS_EX_UNDER_DEVELOPMENT_OPN",
        "VCS_EX_UNDER_VALIDATION_CLD",
        "VCS_EX_UNDER_VALIDATION_OPN",
        "VCS_EX_CRED_TRANS_FRM_OTHER_PROG",
        "VCS_EX_WITHDRAWN",
    ],
}

# Fetch first and open the output file only afterwards, so a failed request
# neither leaves an empty dwnld.xlsx behind nor stores an error body as a
# workbook. json= sends the payload as a JSON body; the timeout keeps the
# script from hanging forever on an unresponsive server.
response = requests.post(url, json=payload, timeout=120)
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx (e.g. 406)

with open("dwnld.xlsx", "wb") as f_out:
    f_out.write(response.content)
这会保存 dwnld.xlsx(来自 LibreOffice 的屏幕截图):
headers = {
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
...
您已告知网站您将只接受使用这些特定编码和这些特定语言的响应。
但是网站无法提供这些。所以它返回 406,告诉你它不能满足你的要求。