python 请求:从 "www.researchgate.net" 下载 Pdf 文件时出现 403 禁止错误

python requests: 403 Forbidden Error While Downlaoding Pdf File from "www.researchgate.net"

尝试使用 python 的 requestsresearchgate.net 下载研究文章的 Pdf 时出现 403 禁止错误,尽管不需要身份验证,使用以下代码:

import requests
import shutil
dlink = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"

r = requests.get(dlink, stream=True, 
headers={
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53',
    "sec-ch-ua-platform": '"Windows"',
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
    "sec-ch-ua-mobile": '?0',
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Dest": "document",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Cookie": "did=zp8vP15CGPkm8uJhry60BLeiaOSKZW8p51UupMljAdqP7WMnERbTEfnOEdkUTTE6; ptc=RG1.81179579133230362.1653300988; __cf_bm=VPc7zYTygZZxFutIAXPHQpRxUBRoKo5DuX4.nIiFqY4-1653300988-0-AS9iIVhVUqqi7Se51WNYTtVymQf9Bcoz4j93uLiq8GlgX73WKpoRJDlUdo0r3fsgmlKXLudkfYdv3NWQ3R10So0=; sid=JwZkeNfoeKESm08f4EtenQVKDk2nKtHGd7oj2rLJlkcwYIZBWff4LHaT6fuoKhSCkGzqyR0Hw5NoUrpTSJnIQsBdhyT7H4gjE0OLNSdOG1q6SSiq5rFPqb1UCjy8nmQS",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",    
    }
)
print(r.request.headers)
print()
if r.status_code == 200:
    with open("x.pdf", 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
else:
    print("Error downloading the file")
    print(r.status_code) #403
    print(r.reason)      #Forbidden 

我在这里使用的 header 是从 Edge 浏览器的隐身模式请求 header 部分收集的。

感谢您的关注。

由于TLS验证失败导致被禁止。我必须更改 TLS 适配器才能使其正常工作。

import ssl
import requests
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
from urllib3.util import ssl_


CIPHERS = "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-SHA256:AES256-SHA"

class TLSAdapter(HTTPAdapter):
    
    def __init__(self, ssl_options=0, *args, **kwargs):
        self.ssl_options = ssl_options
        super().__init__(*args, **kwargs)
    
    def init_poolmanager(self, *args, **kwargs):
        context = ssl_.create_urllib3_context(ciphers=CIPHERS, cert_reqs=ssl.CERT_REQUIRED, options=self.ssl_options)
        self.poolmanager = PoolManager(*args, ssl_context=context, **kwargs)


url = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
}
adapter = TLSAdapter(ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)

with requests.session() as session:
    session.mount("https://www.researchgate.net/", adapter)
    response = session.get(url, headers=headers)
    print(response.status_code)  # 200