使用 post 请求从网站上抓取 ajax table

Scrape ajax table from website using post request

我的目标是使用 Python 从这个网页获取 PQRI 表(页面上列出的两个表中的第二个)。由于它是通过 ajax 加载的表,所以我尝试了以下操作:

# Endpoint that receives the POST request issued at the bottom of this snippet.
url = "https://apps.usp.org/ajax/USPNF/columnsDB.php"


# Headers sent with the POST.
# NOTE(review): these look like they were copied verbatim from a browser
# request (fixed Content-Length, session cookies, sec-ch-* hints) -- confirm.
headers = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"Connection": "keep-alive",
"Content-Length": "201",
"Content-Type": "application/x-www-form-urlencoded",
"Cookie": "_fbp=fb.1.1646747716384.2068133566; tc_ptid=3U21FqQ3bklFEULP2jijnQ; tc_ptidexpiry=1709819716801; BE_CLA3=p_id%3D8A64RLL6L464RLNNA48664N2RAAAAAAAAH%26bf%3D8d70551f1d08356108a60fc4a2db91d0%26bn%3D1%26bv%3D3.44%26s_expire%3D1648554934915%26s_id%3D8A64RLL6L464RJ2L8J6664N2RAAAAAAAAH; _gid=GA1.2.1041569168.1648468535; _ga_DTGQ04CR27=GS1.1.1648468535.10.0.1648468535.0; USPSESSID=u6i1i80ot1uk49mnauim3o7l37; _ga=GA1.2.1946138806.1646747717; BIGipServerprod_apps.usp.org_http_pool=1271466250.20480.0000",
"Host": "apps.usp.org",
"Origin": "https://apps.usp.org",
"Referer": "https://apps.usp.org/app/USPNF/columnsDB.html",
"sec-ch-ua": "Not A;Brand ;v=99, Chromium;v=99, Google Chrome;v=99",
"sec-ch-ua-mobile" : "?0",
"sec-ch-ua-platform": "Windows",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36",
"X-Powered-By": "CPAINT v2.1.0 :: http://sf.net/projects/cpaint",
}

# Form fields for the POST body.
# NOTE: a Python dict cannot hold the same key twice. Each repeated
# "cpaint_argument[]" entry below overwrites the previous one, so the dict
# that is actually built has only three keys: "cpaint_function",
# "cpaint_argument[]" (with the last value, 0) and "cpaint_response_type".
# The column name "Acclaim%20120%20C18" and the value 2.8 are therefore
# never sent.
payload = {
"cpaint_function": "updatePQRIResults",
"cpaint_argument[]": "Acclaim%20120%20C18",
"cpaint_argument[]": 0,
"cpaint_argument[]": 0,
"cpaint_argument[]": 0,
"cpaint_argument[]": 2.8,
"cpaint_argument[]": 0,
"cpaint_response_type": "OBJECT",
}

# Send the form; requires `import requests`, which this snippet does not show.
response = requests.post(url, data=payload, headers=headers)

我在开发人员工具中看到了所需的输出:

但是当我发出请求时,我只得到以下响应:

"getPQRIData:没有基列'0'\u003cbr\u003e\u000a"

知道我需要更改什么才能获得所需的输出吗?

您不能把这份表单数据作为 dictionary/json 发送:Python 字典的键必须唯一,重复的 "cpaint_argument[]" 键只会保留最后一个值,其余参数都会丢失。将其作为已编码好的字符串发送,它应该可以工作:

from io import StringIO

import pandas as pd
import requests

# Seconds to wait for the server on each request; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

# Load the HTML page first so the server can hand out its session cookies.
s = requests.Session()
s.get(
    'https://apps.usp.org/app/USPNF/columnsDB.html',
    timeout=REQUEST_TIMEOUT,
).raise_for_status()
cookies = s.cookies.get_dict()

# Serialise the cookies in the usual "name=value; name=value" header form.
cookieStr = '; '.join(f'{k}={v}' for k, v in cookies.items())

url = "https://apps.usp.org/ajax/USPNF/columnsDB.php"
headers = {
    "Accept": "*/*",
    # "br" is not advertised: decoding brotli needs an optional extra package,
    # while gzip/deflate are always handled.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    # No hard-coded "Content-Length": the body length changes with the offset
    # and requests computes the correct value from the body itself.
    # Content-Type must stay explicit because the body is sent as a raw string.
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": cookieStr,
    "Host": "apps.usp.org",
    "Origin": "https://apps.usp.org",
    "Referer": "https://apps.usp.org/app/USPNF/columnsDB.html",
    "sec-ch-ua": "Not A;Brand ;v=99, Chromium;v=99, Google Chrome;v=99",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    # The header value must not repeat the "User-Agent: " header name.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 Safari/537.36",
    "X-Powered-By": "CPAINT v2.1.0 :: http://sf.net/projects/cpaint",
}

# Collect each page and concatenate once at the end; growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every iteration.
pages = []

page = 0
while True:
    # Last cpaint argument: advanced by 10 per page.
    # NOTE(review): presumably a row offset with 10 rows per page -- confirm.
    i = page * 10

    # The body is sent as a pre-encoded string because the form repeats the
    # "cpaint_argument[]" key, which a dict cannot represent.
    payload = (
        'cpaint_function=updatePQRIResults'
        '&cpaint_argument[]=Acclaim%20120%20C18'
        '&cpaint_argument[]=1'
        '&cpaint_argument[]=0'
        '&cpaint_argument[]=0'
        '&cpaint_argument[]=2.8'
        f'&cpaint_argument[]={i}'
        '&cpaint_response_type=OBJECT'
    )

    reply = s.post(url, data=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    reply.raise_for_status()
    response = reply.text

    # Wrap the text in StringIO: passing literal XML to read_xml is deprecated
    # in recent pandas. The slice drops the leading/trailing rows and leading
    # columns, exactly as before.
    df = pd.read_xml(StringIO(response)).iloc[3:-1, 3:]

    # Stop when the page carries no data rows: either nothing at all, or only
    # a single row whose 'psr' is 0.
    if df.empty or (len(df) == 1 and df.iloc[0]['psr'] == 0):
        print('Complete')
        break

    pages.append(df)
    print(f'Page: {page + 1}')
    page += 1

if pages:
    final_df = (
        pd.concat(pages, axis=0)
        .drop_duplicates()
        .reset_index(drop=True)
    )
else:
    # No page returned any rows; keep final_df defined for later use.
    final_df = pd.DataFrame()
    

输出:

print(final_df)
       psr    psf                  psn  ...   psvb psvc28 psvc70
0      0.0   0.00      Acclaim 120 C18  ... -0.027  0.086 -0.002
1      1.0   0.24      TSKgel ODS-100Z  ... -0.031 -0.064 -0.161
2      2.0   0.67       Inertsil ODS-3  ... -0.023 -0.474 -0.334
3      3.0   0.74          LaChrom C18  ... -0.006 -0.278 -0.120
4      4.0   0.80       Prodigy ODS(3)  ... -0.012 -0.195 -0.134
..     ...    ...                  ...  ...    ...    ...    ...
753  753.0  29.55        Cosmosil 5PYE  ...  0.092  0.521  1.318
754  754.0  30.44      BioBasic Phenyl  ...  0.217  0.014  0.390
755  755.0  34.56  Microsorb-MV 100 CN  ... -0.029  0.148  0.785
756  756.0  41.62      Inertsil ODS-EP  ...  0.050 -0.620 -0.070
757  757.0  41.84           Flare C18+  ...  0.966 -0.507  1.178

[758 rows x 12 columns]