通过 Python Selenium 从网页检索加载缓慢 HTML
Retrieving slow-to-load HTML from web page via Python Selenium
我正在尝试从 BitMEX 中提取 CSV 列表。该页面执行一些(加载速度相当慢)Javascript 以呈现目录的实际索引(我不明白他们为什么选择这样做——也许是混淆?)。
我有以下 Python 3.x 代码,它使用了 Selenium 的 Python 语言绑定:
#!/bin/python3
import datetime
from urllib import request
import sys
from sys import argv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
DOM_LOAD_WAIT = 60
COMMENT_CHAR = '#'
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def get_html(url):
    """Open *url* in a headless Chrome WebDriver, retrying until it loads.

    Returns the live driver so the caller can keep querying the DOM;
    the caller is responsible for calling driver.quit().
    """
    # Local import keeps the fix self-contained; selenium is already a
    # dependency of this script.
    from selenium.common.exceptions import WebDriverException

    # configure headlessness for the webdriver
    options = Options()
    # `options.headless = True` was removed in Selenium 4.10+; the CLI
    # switch works on both old and new Chrome builds.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(DOM_LOAD_WAIT)
    # Retry until the navigation succeeds.
    while True:
        try:
            driver.get(url)
            return driver
        except KeyboardInterrupt:
            # Don't leave an orphaned browser process behind on Ctrl-C.
            driver.quit()
            sys.exit(1)
        except WebDriverException:
            # Narrow except: a bare `except:` would also swallow
            # SystemExit and programming errors.
            eprint("Retrying \"{}\"...".format(url))
            sleep(1)  # brief pause so the retry loop doesn't spin hot
def get_results(url):
    """Fetch *url*, wait for the JS-rendered <pre> listing, print its links.

    Fixes two bugs in the original: the driver was quit in `finally`
    *before* `find_elements` was called on it, and `driver.quit()` raised
    AttributeError (masking the real error) when `get_html` failed and
    `driver` was still None.
    """
    # Acquire the driver outside the try: if get_html itself raises,
    # there is nothing to clean up and no None to call .quit() on.
    driver = get_html(url)
    try:
        WebDriverWait(driver, DOM_LOAD_WAIT).until(
            EC.presence_of_element_located((By.TAG_NAME, "pre"))
        )
        # Query the links while the browser is still alive.
        # find_elements_by_tag_name was removed in Selenium 4.
        print(driver.find_elements(By.TAG_NAME, "a"))
    finally:
        driver.quit()
if __name__ == "__main__":
    listing_url = "https://public.bitmex.com/?prefix=data/quote/"
    get_results(listing_url.strip())
    # Emit a trailing newline and force stdout to flush before exiting.
    print("", end=None, flush=True)
问题是脚本检索 初始 页面源,而不是我之后的最终页面源(即,在 Javascript 完全执行之后-- 这可能需要一些时间):
$ ./script.py
<html><head>
<title>public.bitmex.com</title>
<meta http-equiv="Content-Security-Policy" content="default-src 'none'; img-src 'self'; connect-src https://s3-eu-west-1.amazonaws.com; script-src 'sha384-3ceskX3iaEnIogmQchP8opvBy3Mi7Ce34nWjpBIwVTHfGYWQS9jwHDVRnpKKHJg7' 'sha384-n0cKBy1+1+ACIC9J2XunFZItQjpIi1bilP1FCayDxybB40OcUY1ipK4Qjr856KWI' 'sha384-Rncjr7coAsbMCINMdkum6h64TPVhqlDpqulDQB/a68yABAgOU21duBLDdlm86oKP'; child-src 'none'; object-src 'none'; require-sri-for script style; block-all-mixed-content;">
</head>
<body>
<div id="navigation"></div>
<div id="listing"><img src="//public.bitmex.com/ajaxload-circle.gif"></div>
<script type="text/javascript" src="https://public.bitmex.com/jquery.min.js" integrity="sha384-3ceskX3iaEnIogmQchP8opvBy3Mi7Ce34nWjpBIwVTHfGYWQS9jwHDVRnpKKHJg7" crossorigin="anonymous"></script>
<script type="text/javascript" src="https://public.bitmex.com/init.js" integrity="sha384-n0cKBy1+1+ACIC9J2XunFZItQjpIi1bilP1FCayDxybB40OcUY1ipK4Qjr856KWI" crossorigin="anonymous"></script>
<script type="text/javascript" src="https://public.bitmex.com/list.js" integrity="sha384-Rncjr7coAsbMCINMdkum6h64TPVhqlDpqulDQB/a68yABAgOU21duBLDdlm86oKP" crossorigin="anonymous"></script>
</body></html>
具体来说,它只检索旋转加载轮的 GIF,这(不用说)非常令人沮丧!
如何让 webdriver 仅在 Javascript 完全执行完毕之后才返回(return)控制权?
请注意,由于内容安全策略的实施不一致,我必须使用基于 Chromium 的网络驱动程序。
您可以让驱动程序等到找到文本'Last Modified'。
def get_results(url):
    """Block until the text 'Last Modified' appears, i.e. the JS has run.

    The driver is acquired *before* the try block: in the original, a
    failure inside get_html left `driver` as None and the `finally`
    clause then raised AttributeError, masking the real error.
    """
    driver = get_html(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//*[contains(text(), 'Last Modified')]")
            )
        )
    finally:
        driver.quit()
我正在尝试从 BitMEX 中提取 CSV 列表。该页面执行一些(加载速度相当慢)Javascript 以呈现目录的实际索引(我不明白他们为什么选择这样做——也许是混淆?)。
我有以下 Python 3.x 代码,它使用了 Selenium 的 Python 语言绑定:
#!/bin/python3
import datetime
from urllib import request
import sys
from sys import argv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
DOM_LOAD_WAIT = 60
COMMENT_CHAR = '#'
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def get_html(url):
    """Open *url* in a headless Chrome WebDriver, retrying until it loads.

    Returns the live driver so the caller can keep querying the DOM;
    the caller is responsible for calling driver.quit().
    """
    # Local import keeps the fix self-contained; selenium is already a
    # dependency of this script.
    from selenium.common.exceptions import WebDriverException

    # configure headlessness for the webdriver
    options = Options()
    # `options.headless = True` was removed in Selenium 4.10+; the CLI
    # switch works on both old and new Chrome builds.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(DOM_LOAD_WAIT)
    # Retry until the navigation succeeds.
    while True:
        try:
            driver.get(url)
            return driver
        except KeyboardInterrupt:
            # Don't leave an orphaned browser process behind on Ctrl-C.
            driver.quit()
            sys.exit(1)
        except WebDriverException:
            # Narrow except: a bare `except:` would also swallow
            # SystemExit and programming errors.
            eprint("Retrying \"{}\"...".format(url))
            sleep(1)  # brief pause so the retry loop doesn't spin hot
def get_results(url):
    """Fetch *url*, wait for the JS-rendered <pre> listing, print its links.

    Fixes two bugs in the original: the driver was quit in `finally`
    *before* `find_elements` was called on it, and `driver.quit()` raised
    AttributeError (masking the real error) when `get_html` failed and
    `driver` was still None.
    """
    # Acquire the driver outside the try: if get_html itself raises,
    # there is nothing to clean up and no None to call .quit() on.
    driver = get_html(url)
    try:
        WebDriverWait(driver, DOM_LOAD_WAIT).until(
            EC.presence_of_element_located((By.TAG_NAME, "pre"))
        )
        # Query the links while the browser is still alive.
        # find_elements_by_tag_name was removed in Selenium 4.
        print(driver.find_elements(By.TAG_NAME, "a"))
    finally:
        driver.quit()
if __name__ == "__main__":
    listing_url = "https://public.bitmex.com/?prefix=data/quote/"
    get_results(listing_url.strip())
    # Emit a trailing newline and force stdout to flush before exiting.
    print("", end=None, flush=True)
问题是脚本检索 初始 页面源,而不是我之后的最终页面源(即,在 Javascript 完全执行之后-- 这可能需要一些时间):
$ ./script.py
<html><head>
<title>public.bitmex.com</title>
<meta http-equiv="Content-Security-Policy" content="default-src 'none'; img-src 'self'; connect-src https://s3-eu-west-1.amazonaws.com; script-src 'sha384-3ceskX3iaEnIogmQchP8opvBy3Mi7Ce34nWjpBIwVTHfGYWQS9jwHDVRnpKKHJg7' 'sha384-n0cKBy1+1+ACIC9J2XunFZItQjpIi1bilP1FCayDxybB40OcUY1ipK4Qjr856KWI' 'sha384-Rncjr7coAsbMCINMdkum6h64TPVhqlDpqulDQB/a68yABAgOU21duBLDdlm86oKP'; child-src 'none'; object-src 'none'; require-sri-for script style; block-all-mixed-content;">
</head>
<body>
<div id="navigation"></div>
<div id="listing"><img src="//public.bitmex.com/ajaxload-circle.gif"></div>
<script type="text/javascript" src="https://public.bitmex.com/jquery.min.js" integrity="sha384-3ceskX3iaEnIogmQchP8opvBy3Mi7Ce34nWjpBIwVTHfGYWQS9jwHDVRnpKKHJg7" crossorigin="anonymous"></script>
<script type="text/javascript" src="https://public.bitmex.com/init.js" integrity="sha384-n0cKBy1+1+ACIC9J2XunFZItQjpIi1bilP1FCayDxybB40OcUY1ipK4Qjr856KWI" crossorigin="anonymous"></script>
<script type="text/javascript" src="https://public.bitmex.com/list.js" integrity="sha384-Rncjr7coAsbMCINMdkum6h64TPVhqlDpqulDQB/a68yABAgOU21duBLDdlm86oKP" crossorigin="anonymous"></script>
</body></html>
具体来说,它只检索旋转加载轮的 GIF,这(不用说)非常令人沮丧!
如何让 webdriver 仅在 Javascript 完全执行完毕之后才返回(return)控制权?
请注意,由于内容安全策略的实施不一致,我必须使用基于 Chromium 的网络驱动程序。
您可以让驱动程序等到找到文本'Last Modified'。
def get_results(url):
    """Block until the text 'Last Modified' appears, i.e. the JS has run.

    The driver is acquired *before* the try block: in the original, a
    failure inside get_html left `driver` as None and the `finally`
    clause then raised AttributeError, masking the real error.
    """
    driver = get_html(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//*[contains(text(), 'Last Modified')]")
            )
        )
    finally:
        driver.quit()