Python 无法获取带有广告层计时器的网站的源代码
Python can't get Source Code with Ads Layering Timer
我不是程序员,我只需要获取完整加载后的网页源代码。我不久前找到了下面这段代码,它一直很好用。但对于某些带有计时器广告层的网站,它不起作用。
import cookielib
import urllib2

# Page to download.
site = "http://example.com"  # real url edited out

# Headers sent with the request; the User-Agent identifies as desktop Chrome.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

req = urllib2.Request(site, headers=hdr)
try:
    page = urllib2.urlopen(req)
except urllib2.HTTPError as e:
    # The server answered with an error status: show the error body and stop.
    # `page` was never assigned here, so it must not be read afterwards.
    print(e.fp.read())
else:
    # Only reached when urlopen succeeded, so `page` is guaranteed to exist.
    try:
        content = page.read()
    finally:
        page.close()
    print(content)
但是 Python 2.7 控制台中打印出的却是下面的内容:
<html>
<head>
<script type="text/javascript">
//<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok3v=1613a3a185/"},atok:"469b082f74e88d5de78deda9ca22d249",petok:"704cf398eb73eb73e891bfef183856ace9cb873c-1500869038-1800",zone:"example.com",rocket:"a",apps:{}}];
document.write('<script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok3v=85b614c0f6/cloudflare.min.js"><'+'\/script>');}}catch(e){};
//]]></script>
<script type="text/rocketscript">
function set_cookie(){
var now = new Date();
var time = now.getTime();
time += 19360000 * 1000;
now.setTime(time);
document.cookie='beget=begetok'+'; expires='+now.toGMTString()+'; path=/';
}
set_cookie();
location.reload();
</script> </head><body></body></html>
我把这段代码改写成了一个函数,结果可以正常工作了!与原来的代码相比,唯一的区别是请求头中多了一项 Content-Type。
def getHtml(url):
    """Fetch *url* using browser-like request headers and return the body.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by ``page.read()``.

    Raises:
        urllib2.HTTPError: If the server answers with an error status. The
            error body is printed first, then the exception is re-raised.
    """
    import urllib2

    hdr = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        # NOTE(review): Content-Type is unusual on a request without a body;
        # kept exactly as in the original.
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    req = urllib2.Request(url, headers=hdr)
    try:
        page = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        # Keep the original diagnostic output, then surface the real error.
        # Falling through would hit an unbound `page` and mask the HTTP
        # failure behind an UnboundLocalError.
        print(e.fp.read())
        raise
    try:
        return page.read()
    finally:
        page.close()
替代方案(较慢):我在网上查找时发现,可以使用 Python Selenium WebDriver 配合 Firefox、Chrome 或无头浏览器 PhantomJS 来获取 HTML 源代码。您需要将 GeckoDriver.exe、ChromeDriver.exe 或 PhantomJS.exe 放在 C:\Python27\Scripts\ 目录下。
def getHtmlViaWebDriver(url):
    """Load *url* in a PhantomJS browser and return its page source.

    Args:
        url: Address of the page to open in the browser.

    Returns:
        The browser's ``page_source`` for the loaded page, encoded as UTF-8.
    """
    from selenium import webdriver

    # Alternative drivers that can be swapped in for PhantomJS:
    #driver = webdriver.Firefox(executable_path=r'C:\Python27\Scripts\geckodriver.exe')
    #driver = webdriver.Chrome(executable_path=r'C:\Python27\Scripts\chromedriver.exe')
    driver = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomJS.exe')
    try:
        # Navigate to the requested page; without this call page_source is
        # that of the browser's initial blank page and `url` is never used.
        driver.get(url)
        return driver.page_source.encode('utf-8')
    finally:
        # Always shut the browser process down, even if loading fails.
        driver.quit()
我不是程序员,我只需要获取完整加载后的网页源代码。我不久前找到了下面这段代码,它一直很好用。但对于某些带有计时器广告层的网站,它不起作用。
import cookielib
import urllib2

# Page to download.
site = "http://example.com"  # real url edited out

# Headers sent with the request; the User-Agent identifies as desktop Chrome.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

req = urllib2.Request(site, headers=hdr)
try:
    page = urllib2.urlopen(req)
except urllib2.HTTPError as e:
    # The server answered with an error status: show the error body and stop.
    # `page` was never assigned here, so it must not be read afterwards.
    print(e.fp.read())
else:
    # Only reached when urlopen succeeded, so `page` is guaranteed to exist.
    try:
        content = page.read()
    finally:
        page.close()
    print(content)
但是 Python 2.7 控制台中打印出的却是下面的内容:
<html>
<head>
<script type="text/javascript">
//<![CDATA[try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok3v=1613a3a185/"},atok:"469b082f74e88d5de78deda9ca22d249",petok:"704cf398eb73eb73e891bfef183856ace9cb873c-1500869038-1800",zone:"example.com",rocket:"a",apps:{}}];
document.write('<script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dok3v=85b614c0f6/cloudflare.min.js"><'+'\/script>');}}catch(e){};
//]]></script>
<script type="text/rocketscript">
function set_cookie(){
var now = new Date();
var time = now.getTime();
time += 19360000 * 1000;
now.setTime(time);
document.cookie='beget=begetok'+'; expires='+now.toGMTString()+'; path=/';
}
set_cookie();
location.reload();
</script> </head><body></body></html>
我把这段代码改写成了一个函数,结果可以正常工作了!与原来的代码相比,唯一的区别是请求头中多了一项 Content-Type。
def getHtml(url):
    """Fetch *url* using browser-like request headers and return the body.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as returned by ``page.read()``.

    Raises:
        urllib2.HTTPError: If the server answers with an error status. The
            error body is printed first, then the exception is re-raised.
    """
    import urllib2

    hdr = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        # NOTE(review): Content-Type is unusual on a request without a body;
        # kept exactly as in the original.
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    req = urllib2.Request(url, headers=hdr)
    try:
        page = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        # Keep the original diagnostic output, then surface the real error.
        # Falling through would hit an unbound `page` and mask the HTTP
        # failure behind an UnboundLocalError.
        print(e.fp.read())
        raise
    try:
        return page.read()
    finally:
        page.close()
替代方案(较慢):我在网上查找时发现,可以使用 Python Selenium WebDriver 配合 Firefox、Chrome 或无头浏览器 PhantomJS 来获取 HTML 源代码。您需要将 GeckoDriver.exe、ChromeDriver.exe 或 PhantomJS.exe 放在 C:\Python27\Scripts\ 目录下。
def getHtmlViaWebDriver(url):
    """Load *url* in a PhantomJS browser and return its page source.

    Args:
        url: Address of the page to open in the browser.

    Returns:
        The browser's ``page_source`` for the loaded page, encoded as UTF-8.
    """
    from selenium import webdriver

    # Alternative drivers that can be swapped in for PhantomJS:
    #driver = webdriver.Firefox(executable_path=r'C:\Python27\Scripts\geckodriver.exe')
    #driver = webdriver.Chrome(executable_path=r'C:\Python27\Scripts\chromedriver.exe')
    driver = webdriver.PhantomJS(executable_path=r'C:\Python27\Scripts\phantomJS.exe')
    try:
        # Navigate to the requested page; without this call page_source is
        # that of the browser's initial blank page and `url` is never used.
        driver.get(url)
        return driver.page_source.encode('utf-8')
    finally:
        # Always shut the browser process down, even if loading fails.
        driver.quit()