Python Web 抓取请求导致 406 错误

Python Web Scrape Request resulting in a 406 Error

我正在为一个学校项目抓取 https://registry.verra.org/app/search/VCS/All%20Projects。我想复现点击“下载 excel”按钮时浏览器在后台发出的 POST 请求。

这是我目前的情况。

import requests
import datetime as dt

# Reproduction script from the question: POST the search filter to the
# registry's Excel-export endpoint and save the response body to dwnld.xlsx.
url_back = 'https://registry.verra.org/uiapi/resource/resource/search?$skip=0&count=true&$format=excel&$exportFileName=allprojects.xlsx'
# Search filter: the program plus the list of resource statuses to include.
data = {"program":"VCS",
        "resourceStatuses":["VCS_EX_CRD_PRD_VER_REQUESTED","VCS_EX_CRD_PRD_REQUESTED",
                            "VCS_EX_REGISTERED","VCS_EX_REG_VER_APPR_REQUESTED",
                            "VCS_EX_REGISTRATION_REQUESTED","VCS_EX_REJ",
                            "VCS_EX_UNDER_DEVELOPMENT_CLD","VCS_EX_UNDER_DEVELOPMENT_OPN",
                            "VCS_EX_UNDER_VALIDATION_CLD","VCS_EX_UNDER_VALIDATION_OPN",
                            "VCS_EX_CRED_TRANS_FRM_OTHER_PROG","VCS_EX_WITHDRAWN"]}
# Request headers; presumably copied verbatim from the browser's own request
# (the question says the background POST is being replicated).
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Length": "369",
    "Content-Type": "application/json",
    "Cookie": "fpestid=9g1E7EZczSniadmveW8TL8DIBB_w-MDFov_fr0DQqgBD46kgkoVSzIdQHKP-hSxMbBr4tg; _ga=GA1.2.1884498504.1652482731; _gid=GA1.2.1741997157.1652482731; ASPSESSIONIDQERRTRAR=BFIILIADNEINGJAKKMCJGKKO",
    "Host": "registry.verra.org",
    "Origin": "https://registry.verra.org",
    "Referer": "https://registry.verra.org/app/search/VCS/All%20Projects",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": "Android"
    }

# NOTE(review): the dict is passed through data=. With requests, a dict given
# as data= is form-encoded, so the body would not match the declared
# "application/json" Content-Type. This is the call the question reports as
# returning 406 every time.
response = requests.post(url_back, data=data, headers=headers)
print(response)

# The body is written without checking the status code, so after an error
# response the file holds the error body rather than a spreadsheet.
with open('dwnld.xlsx', 'wb') as f:
    f.write(response.content)

但是,每次返回的响应都是 406 错误,即使我在 Accept 请求头中使用了“*/*”,并提供了一个不应被屏蔽的有效“User-Agent”。为什么这个 POST 请求无法返回真实的响应,大家有什么想法吗?

data 参数对应请求体(body),而这里的请求体是 JSON。因此必须以 JSON 格式发送数据(与 Content-Type 请求头一致),即使用 json 参数,例如 json=data:

import requests
import datetime as dt

# Excel-export endpoint behind the registry's "download excel" button.
url_back = 'https://registry.verra.org/uiapi/resource/resource/search?$skip=0&count=true&$format=excel&$exportFileName=allprojects.xlsx'
# Search filter sent as the JSON request body.
data = {"program":"VCS",
        "resourceStatuses":["VCS_EX_CRD_PRD_VER_REQUESTED","VCS_EX_CRD_PRD_REQUESTED",
                            "VCS_EX_REGISTERED","VCS_EX_REG_VER_APPR_REQUESTED",
                            "VCS_EX_REGISTRATION_REQUESTED","VCS_EX_REJ",
                            "VCS_EX_UNDER_DEVELOPMENT_CLD","VCS_EX_UNDER_DEVELOPMENT_OPN",
                            "VCS_EX_UNDER_VALIDATION_CLD","VCS_EX_UNDER_VALIDATION_OPN",
                            "VCS_EX_CRED_TRANS_FRM_OTHER_PROG","VCS_EX_WITHDRAWN"]}
# Browser-like headers. Two entries from the browser capture are left out on
# purpose:
#   * "Content-Length": requests derives it from the body it actually sends,
#     so a hard-coded browser value ("369") is at best redundant.
#   * "Accept-Encoding": advertising "br" invites a Brotli-compressed reply,
#     which requests only decodes when a Brotli package is installed; the
#     library default lists just the encodings it can decode.
headers = {
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Cookie": "fpestid=9g1E7EZczSniadmveW8TL8DIBB_w-MDFov_fr0DQqgBD46kgkoVSzIdQHKP-hSxMbBr4tg; _ga=GA1.2.1884498504.1652482731; _gid=GA1.2.1741997157.1652482731; ASPSESSIONIDQERRTRAR=BFIILIADNEINGJAKKMCJGKKO",
    "Host": "registry.verra.org",
    "Origin": "https://registry.verra.org",
    "Referer": "https://registry.verra.org/app/search/VCS/All%20Projects",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": "Android"
    }

# json= serialises the dict as a JSON body (data= would form-encode it).
# The timeout keeps the script from hanging forever on a stalled connection.
response = requests.post(url_back, json=data, headers=headers, timeout=60)
print(response)

# To save the spreadsheet, write response.content to a file opened in 'wb'
# mode once the printed status shows success.

尝试使用 json= 参数而不是 data=(headers= 没有必要):

import requests

# Excel-export endpoint; the "$" of each OData-style option is percent-encoded
# as %24 in the query string.
url = "https://registry.verra.org/uiapi/resource/resource/search?%24skip=0&count=true&%24format=excel&%24exportFileName=allprojects.xlsx"

# Search filter sent as the JSON request body.
payload = {
    "program": "VCS",
    "resourceStatuses": [
        "VCS_EX_CRD_PRD_VER_REQUESTED",
        "VCS_EX_CRD_PRD_REQUESTED",
        "VCS_EX_REGISTERED",
        "VCS_EX_REG_VER_APPR_REQUESTED",
        "VCS_EX_REGISTRATION_REQUESTED",
        "VCS_EX_REJ",
        "VCS_EX_UNDER_DEVELOPMENT_CLD",
        "VCS_EX_UNDER_DEVELOPMENT_OPN",
        "VCS_EX_UNDER_VALIDATION_CLD",
        "VCS_EX_UNDER_VALIDATION_OPN",
        "VCS_EX_CRED_TRANS_FRM_OTHER_PROG",
        "VCS_EX_WITHDRAWN",
    ],
}

# Perform the request before touching the output file, so a network failure
# cannot leave an empty or truncated dwnld.xlsx behind. json= serialises the
# payload as JSON; the timeout prevents an indefinite hang.
response = requests.post(url, json=payload, timeout=60)

# Raise on an HTTP error status (such as 406) instead of saving the error
# body under an .xlsx name.
response.raise_for_status()

with open("dwnld.xlsx", "wb") as f_out:
    f_out.write(response.content)

保存 dwnld.xlsx(来自 LibreOffice 的屏幕截图):

headers = {
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
   ...

您已告知网站您将只接受使用这些特定编码和这些特定语言的响应。

但是网站无法提供这些。所以它返回 406,告诉你它不能满足你的要求。