被重定向到 "Request Rejected" 页面,而不是获取所请求站点的 html
Redirected to the "Request Rejected" page instead of getting the requested site's HTML
我尝试使用 Python 的 requests 库获取 html,但请求被重定向到 “Request Rejected”(请求被拒绝)页面,而不是返回所请求站点的 html。虽然响应的 url 是正确的 (https://www.digikey.com/en/products/result?s=N4IgTCBcDaIC4FsDOBmMAGAZmAHOnADgF4CGIAugL5A),但我无法获得所需的 html。状态码为 200。
import requests
def search_part(part: str) -> None:
    """Fetch the Digi-Key product search page for ``part`` and print its HTML.

    Args:
        part: Part number to search for; it is sent as the ``keywords``
            query parameter.
    """
    result = requests.get(
        "https://www.digikey.com//en/products/result",
        # The second positional argument of requests.get() is ``params``;
        # it is named explicitly here for readability.
        params={"keywords": part},
    )
    print(result.text)
if __name__ == '__main__':
    # Run the example search only when this file is executed as a script.
    search_part("tms320f2808pza")
并返回 html:
<html><head><title>Request Rejected</title>
<script>(window.BOOMR_mq=window.BOOMR_mq||[]).push(["addVar",{"rua.upush":"false","rua.cpush":"false","rua.upre":"false","rua.cpre":"false","rua.uprl":"false","rua.cprl":"false","rua.cprf":"false","rua.trans":"","rua.cook":"false","rua.ims":"false","rua.ufprl":"false","rua.cfprl":"false","rua.isuxp":"false","rua.texp":"norulematch"}]);</script>
<script>!function(a){var e="https://s.go-mpulse.net/boomerang/",t="addEventListener";if("False"=="True")a.BOOMR_config=a.BOOMR_config||{},a.BOOMR_config.PageParams=a.BOOMR_config.PageParams||{},a.BOOMR_config.PageParams.pci=!0,e="https://s2.go-mpulse.net/boomerang/";if(window.BOOMR_API_key="YQEZK-AGCLT-A89TW-DLE3C-K72A2",function(){function n(e){a.BOOMR_onload=e&&e.timeStamp||(new Date).getTime()}if(!a.BOOMR||!a.BOOMR.version&&!a.BOOMR.snippetExecuted){a.BOOMR=a.BOOMR||{},a.BOOMR.snippetExecuted=!0;var i,_,o,r=document.createElement("iframe");if(a[t])a[t]("load",n,!1);else if(a.attachEvent)a.attachEvent("onload",n);r.src="javascript:void(0)",r.title="",r.role="presentation",(r.frameElement||r).style.cssText="width:0;height:0;border:0;display:none;",o=document.getElementsByTagName("script")[0],o.parentNode.insertBefore(r,o);try{_=r.contentWindow.document}catch(O){i=document.domain,r.src="javascript:var d=document.open();d.domain='"+i+"';void(0);",_=r.contentWindow.document}_.open()._l=function(){var a=this.createElement("script");if(i)this.domain=i;a.id="boomr-if-as",a.src=e+"YQEZK-AGCLT-A89TW-DLE3C-K72A2",BOOMR_lstart=(new Date).getTime(),this.body.appendChild(a)},_.write("<bo"+'dy onload="document._l();">'),_.close()}}(),"".length>0)if(a&&"performance"in a&&a.performance&&"function"==typeof a.performance.setResourceTimingBufferSize)a.performance.setResourceTimingBufferSize();!function(){if(BOOMR=a.BOOMR||{},BOOMR.plugins=BOOMR.plugins||{},!BOOMR.plugins.AK){var 
e=""=="true"?1:0,t="",n="exlquc3ikhseqypyofwq-f-9850c3132-clientnsv4-s.akamaihd.net",i="false"=="true"?2:1,_={"ak.v":"32","ak.cp":"1209416","ak.ai":parseInt("605158",10),"ak.ol":"0","ak.cr":123,"ak.ipv":4,"ak.proto":"http/1.1","ak.rid":"10054f68","ak.r":32304,"ak.a2":e,"ak.m":"a","ak.n":"essl","ak.bpcip":"37.215.10.0","ak.cport":63715,"ak.gh":"104.94.100.76","ak.quicv":"","ak.tlsv":"tls1.3","ak.0rtt":"","ak.csrc":"-","ak.acc":"reno","ak.t":"1643671917","ak.ak":"hOBiQwZUYzCg5VSAfCLimQ==SHXgIh7vYfr9yvIpHCrbnTdRJwcy+DKhr5Gf0lDkmpDeDL25BO9MtMCCEk/SZmfkQov7EVMeYDhrqAppLf9x4VaU3QJXHeZe6r2HHm94JKNKKMIANUxb8zqWfMDwIz3G60Mdh/Cyq1vT14b0h0NmYNdeKWRH8zXTOIe1TTepEKQAC41PSCfKnucscoIIyQ2uDIPjsEkyj0alikVAHj3v9Mo646Wq5PKrik/jB0c1sCQCTA3Y18RqbmclDujcZqdjZymj4XHs/KFZLlUMiSIWj4Q/CqKJV3lwluim/QRha7doBtw8kc5/85J2iLloi3etTTZWpI6fgKA7ENOUxpmRrKOP/wb1SgWuIiOHFWUpSqic6XcZmkhwjDjA6CCAjRzOe04gUmP6nrjsyzMRkQ5vMZWoqas3NjnFuVgMCT1F69o=","ak.pv":"193","ak.dpoabenc":"","ak.tf":i};if(""!==t)_["ak.ruds"]=t;var o={i:!1,av:function(e){var t="http.initiator";if(e&&(!e[t]||"spa_hard"===e[t]))_["ak.feo"]=void 0!==a.aFeoApplied?1:0,BOOMR.addVar(_)},rv:function(){var a=["ak.bpcip","ak.cport","ak.cr","ak.csrc","ak.gh","ak.ipv","ak.m","ak.n","ak.ol","ak.proto","ak.quicv","ak.tlsv","ak.0rtt","ak.r","ak.acc","ak.t","ak.tf"];BOOMR.removeVar(a)}};BOOMR.plugins.AK={akVars:_,akDNSPreFetchDomain:n,init:function(){if(!o.i){var a=BOOMR.subscribe;a("before_beacon",o.av,null,null),a("onbeacon",o.rv,null,null),o.i=!0}return this},is_complete:function(){return!0}}}}()}(window);</script></head><body>The requested URL was rejected. Please consult with your administrator.<br><br>Your support ID is: CUSBD< 7835315950939096182><br><br><a href='javascript:history.back();'>[Go Back]<script type="text/javascript" src="/gnjikxRSjnSd2gKE7-MX/m73SfwhfXiat/XiFNdAE/eAUMK/HVnEQEB"></script></body></html><script>
(function() {
var ws = new WebSocket('ws://' + window.location.host + '/jb-server-page?reloadServiceClientId=4');
ws.onmessage = function (msg) {
if (msg.data === 'reload') {
window.location.reload();
}
if (msg.data.startsWith('update-css ')) {
var messageId = msg.data.substring(11);
var links = document.getElementsByTagName('link');
for (var i = 0; i < links.length; i++) {
var link = links[i];
if (link.rel !== 'stylesheet') continue;
var clonedLink = link.cloneNode(true);
var newHref = link.href.replace(/(&|\?)jbUpdateLinksId=\d+/, "jbUpdateLinksId=" + messageId);
if (newHref !== link.href) {
clonedLink.href = newHref;
}
else {
var indexOfQuest = newHref.indexOf('?');
if (indexOfQuest >= 0) {
// to support ?foo#hash
clonedLink.href = newHref.substring(0, indexOfQuest + 1) + 'jbUpdateLinksId=' + messageId + '&' +
newHref.substring(indexOfQuest + 1);
}
else {
clonedLink.href += '?' + 'jbUpdateLinksId=' + messageId;
}
}
link.replaceWith(clonedLink);
}
}
};
})();
</script>
我清除了该网站的 cookie,并将该网站添加到受信任的站点,但都没有帮助。我该如何解决?
您需要在请求头 (header) 中设置一个伪装成浏览器的 User-Agent,否则您的请求会被该网站屏蔽。这种情况非常常见,所以我现在每次抓取时都会默认设置这个 header:
import requests
# Send a regular desktop-browser User-Agent instead of the default one used
# by requests, which the site blocks.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = 'https://www.digikey.com/en/products/category/integrated-circuits-ics/32?s=N4IgTCBcDaIC4FsDOBmMAGAZmAHOnADgF4CGIAugL5A'
# requests has no default timeout; set one so the script cannot hang forever
# if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
print(resp.text)
我尝试使用 Python 的 requests 库获取 html,但请求被重定向到 “Request Rejected”(请求被拒绝)页面,而不是返回所请求站点的 html。虽然响应的 url 是正确的 (https://www.digikey.com/en/products/result?s=N4IgTCBcDaIC4FsDOBmMAGAZmAHOnADgF4CGIAugL5A),但我无法获得所需的 html。状态码为 200。
import requests
def search_part(part: str) -> None:
    """Fetch the Digi-Key product search page for ``part`` and print its HTML.

    Args:
        part: Part number to search for; it is sent as the ``keywords``
            query parameter.
    """
    result = requests.get(
        "https://www.digikey.com//en/products/result",
        # The second positional argument of requests.get() is ``params``;
        # it is named explicitly here for readability.
        params={"keywords": part},
    )
    print(result.text)
if __name__ == '__main__':
    # Run the example search only when this file is executed as a script.
    search_part("tms320f2808pza")
并返回 html:
<html><head><title>Request Rejected</title>
<script>(window.BOOMR_mq=window.BOOMR_mq||[]).push(["addVar",{"rua.upush":"false","rua.cpush":"false","rua.upre":"false","rua.cpre":"false","rua.uprl":"false","rua.cprl":"false","rua.cprf":"false","rua.trans":"","rua.cook":"false","rua.ims":"false","rua.ufprl":"false","rua.cfprl":"false","rua.isuxp":"false","rua.texp":"norulematch"}]);</script>
<script>!function(a){var e="https://s.go-mpulse.net/boomerang/",t="addEventListener";if("False"=="True")a.BOOMR_config=a.BOOMR_config||{},a.BOOMR_config.PageParams=a.BOOMR_config.PageParams||{},a.BOOMR_config.PageParams.pci=!0,e="https://s2.go-mpulse.net/boomerang/";if(window.BOOMR_API_key="YQEZK-AGCLT-A89TW-DLE3C-K72A2",function(){function n(e){a.BOOMR_onload=e&&e.timeStamp||(new Date).getTime()}if(!a.BOOMR||!a.BOOMR.version&&!a.BOOMR.snippetExecuted){a.BOOMR=a.BOOMR||{},a.BOOMR.snippetExecuted=!0;var i,_,o,r=document.createElement("iframe");if(a[t])a[t]("load",n,!1);else if(a.attachEvent)a.attachEvent("onload",n);r.src="javascript:void(0)",r.title="",r.role="presentation",(r.frameElement||r).style.cssText="width:0;height:0;border:0;display:none;",o=document.getElementsByTagName("script")[0],o.parentNode.insertBefore(r,o);try{_=r.contentWindow.document}catch(O){i=document.domain,r.src="javascript:var d=document.open();d.domain='"+i+"';void(0);",_=r.contentWindow.document}_.open()._l=function(){var a=this.createElement("script");if(i)this.domain=i;a.id="boomr-if-as",a.src=e+"YQEZK-AGCLT-A89TW-DLE3C-K72A2",BOOMR_lstart=(new Date).getTime(),this.body.appendChild(a)},_.write("<bo"+'dy onload="document._l();">'),_.close()}}(),"".length>0)if(a&&"performance"in a&&a.performance&&"function"==typeof a.performance.setResourceTimingBufferSize)a.performance.setResourceTimingBufferSize();!function(){if(BOOMR=a.BOOMR||{},BOOMR.plugins=BOOMR.plugins||{},!BOOMR.plugins.AK){var 
e=""=="true"?1:0,t="",n="exlquc3ikhseqypyofwq-f-9850c3132-clientnsv4-s.akamaihd.net",i="false"=="true"?2:1,_={"ak.v":"32","ak.cp":"1209416","ak.ai":parseInt("605158",10),"ak.ol":"0","ak.cr":123,"ak.ipv":4,"ak.proto":"http/1.1","ak.rid":"10054f68","ak.r":32304,"ak.a2":e,"ak.m":"a","ak.n":"essl","ak.bpcip":"37.215.10.0","ak.cport":63715,"ak.gh":"104.94.100.76","ak.quicv":"","ak.tlsv":"tls1.3","ak.0rtt":"","ak.csrc":"-","ak.acc":"reno","ak.t":"1643671917","ak.ak":"hOBiQwZUYzCg5VSAfCLimQ==SHXgIh7vYfr9yvIpHCrbnTdRJwcy+DKhr5Gf0lDkmpDeDL25BO9MtMCCEk/SZmfkQov7EVMeYDhrqAppLf9x4VaU3QJXHeZe6r2HHm94JKNKKMIANUxb8zqWfMDwIz3G60Mdh/Cyq1vT14b0h0NmYNdeKWRH8zXTOIe1TTepEKQAC41PSCfKnucscoIIyQ2uDIPjsEkyj0alikVAHj3v9Mo646Wq5PKrik/jB0c1sCQCTA3Y18RqbmclDujcZqdjZymj4XHs/KFZLlUMiSIWj4Q/CqKJV3lwluim/QRha7doBtw8kc5/85J2iLloi3etTTZWpI6fgKA7ENOUxpmRrKOP/wb1SgWuIiOHFWUpSqic6XcZmkhwjDjA6CCAjRzOe04gUmP6nrjsyzMRkQ5vMZWoqas3NjnFuVgMCT1F69o=","ak.pv":"193","ak.dpoabenc":"","ak.tf":i};if(""!==t)_["ak.ruds"]=t;var o={i:!1,av:function(e){var t="http.initiator";if(e&&(!e[t]||"spa_hard"===e[t]))_["ak.feo"]=void 0!==a.aFeoApplied?1:0,BOOMR.addVar(_)},rv:function(){var a=["ak.bpcip","ak.cport","ak.cr","ak.csrc","ak.gh","ak.ipv","ak.m","ak.n","ak.ol","ak.proto","ak.quicv","ak.tlsv","ak.0rtt","ak.r","ak.acc","ak.t","ak.tf"];BOOMR.removeVar(a)}};BOOMR.plugins.AK={akVars:_,akDNSPreFetchDomain:n,init:function(){if(!o.i){var a=BOOMR.subscribe;a("before_beacon",o.av,null,null),a("onbeacon",o.rv,null,null),o.i=!0}return this},is_complete:function(){return!0}}}}()}(window);</script></head><body>The requested URL was rejected. Please consult with your administrator.<br><br>Your support ID is: CUSBD< 7835315950939096182><br><br><a href='javascript:history.back();'>[Go Back]<script type="text/javascript" src="/gnjikxRSjnSd2gKE7-MX/m73SfwhfXiat/XiFNdAE/eAUMK/HVnEQEB"></script></body></html><script>
(function() {
var ws = new WebSocket('ws://' + window.location.host + '/jb-server-page?reloadServiceClientId=4');
ws.onmessage = function (msg) {
if (msg.data === 'reload') {
window.location.reload();
}
if (msg.data.startsWith('update-css ')) {
var messageId = msg.data.substring(11);
var links = document.getElementsByTagName('link');
for (var i = 0; i < links.length; i++) {
var link = links[i];
if (link.rel !== 'stylesheet') continue;
var clonedLink = link.cloneNode(true);
var newHref = link.href.replace(/(&|\?)jbUpdateLinksId=\d+/, "jbUpdateLinksId=" + messageId);
if (newHref !== link.href) {
clonedLink.href = newHref;
}
else {
var indexOfQuest = newHref.indexOf('?');
if (indexOfQuest >= 0) {
// to support ?foo#hash
clonedLink.href = newHref.substring(0, indexOfQuest + 1) + 'jbUpdateLinksId=' + messageId + '&' +
newHref.substring(indexOfQuest + 1);
}
else {
clonedLink.href += '?' + 'jbUpdateLinksId=' + messageId;
}
}
link.replaceWith(clonedLink);
}
}
};
})();
</script>
我清除了该网站的 cookie,并将该网站添加到受信任的站点,但都没有帮助。我该如何解决?
您需要在请求头 (header) 中设置一个伪装成浏览器的 User-Agent,否则您的请求会被该网站屏蔽。这种情况非常常见,所以我现在每次抓取时都会默认设置这个 header:
import requests
# Send a regular desktop-browser User-Agent instead of the default one used
# by requests, which the site blocks.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = 'https://www.digikey.com/en/products/category/integrated-circuits-ics/32?s=N4IgTCBcDaIC4FsDOBmMAGAZmAHOnADgF4CGIAugL5A'
# requests has no default timeout; set one so the script cannot hang forever
# if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
print(resp.text)