如何在 Python 中使用无头模式和 selenium webdriver 节省抓取数据的时间
How to save time when scraping data with headless mode and Selenium WebDriver in Python
您好,我有一个简单的 python 脚本,它可以自动打开网页并从中提取数据。
它需要 5 秒钟才能执行完。就我的情况而言,我想要一个更快的脚本,能够瞬间运行完成,或者最多只需 2 秒。
这是脚本:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
url = 'https://www.coteur.com/match/cotes-barcelone-huesca-rid1163090.html'
# Locator for the bookmaker odds buttons on the match page.
ODDS_XPATH = '//button[contains(@class, "btn btn-default btn-xs btncote")]'


def build_options():
    """Return the Chrome options used for a headless scraping session."""
    options = Options()
    # Pass the flag explicitly: the ``Options.headless`` setter was deprecated
    # in Selenium 4.8 and has since been removed, while ``--headless`` works
    # on every Selenium release.
    options.add_argument("--headless")
    options.add_argument("window-size=1400,800")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    return options


def odds_matrix(texts, columns=3):
    """Convert odds captions into a 2-D float array with *columns* columns.

    Args:
        texts: Iterable of strings, each holding one decimal number.
        columns: Number of odds per row (3 in the original script).

    Returns:
        A ``numpy.ndarray`` of shape ``(len(texts) // columns, columns)``.

    Raises:
        ValueError: If a caption is not a number or the number of captions
            is not a multiple of *columns*.
    """
    # ``-1`` lets NumPy derive the row count with exact integer arithmetic
    # instead of going through float division.
    return np.array([float(text) for text in texts]).reshape(-1, columns)


def main():
    """Open the match page, print the odds table and always shut Chrome down."""
    driver = webdriver.Chrome(options=build_options())
    try:
        driver.get(url)
        buttons = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.XPATH, ODDS_XPATH))
        )
        print(odds_matrix([button.text for button in buttons]), '\n')
    finally:
        # quit() closes every window and stops chromedriver, so a separate
        # close() is unnecessary; running it in ``finally`` prevents orphaned
        # browser processes when the wait times out.
        driver.quit()


if __name__ == "__main__":
    main()
也许你可以帮助改进这个小脚本以节省一些宝贵的时间。
谢谢
这是执行的输出:
[[ 1.18 8.25 17. ]
[ 1.18 8.25 17. ]
[ 1.18 8.1 17. ]
[ 1.14 8. 17. ]
[ 1.16 8.75 18. ]
[ 1.2 7.25 10. ]
[ 1.14 7.75 16. ]
[ 1.17 8. 16. ]
[ 1.16 8.8 19. ]
[ 1.16 7. 12. ]
[ 1.13 8.5 18.5 ]]
real 0m4,978s
user 0m1,342s
sys 0m0,573s
运行需要 5 秒。我的目标是减少执行时间。
您的执行时间可能取决于几个因素:
- 运行代码的机器
- 您的连接带宽
- 您请求的数据量
话虽如此,我使用了您的代码,得到的执行时间是 2.31 秒。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
URL = 'https://www.coteur.com/match/cotes-barcelone-huesca-rid1163090.html'
# Locator for the bookmaker odds buttons on the match page.
ODDS_XPATH = '//button[contains(@class, "btn btn-default btn-xs btncote")]'


def build_options():
    """Return the Chrome options used for a headless scraping session."""
    options = Options()
    # Pass the flag explicitly: the ``Options.headless`` setter was deprecated
    # in Selenium 4.8 and has since been removed, while ``--headless`` works
    # on every Selenium release.
    options.add_argument("--headless")
    options.add_argument("window-size=1400,800")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    return options


def odds_matrix(texts, columns=3):
    """Convert odds captions into a 2-D float array with *columns* columns.

    Args:
        texts: Iterable of strings, each holding one decimal number.
        columns: Number of odds per row (3 in the original script).

    Returns:
        A ``numpy.ndarray`` of shape ``(len(texts) // columns, columns)``.

    Raises:
        ValueError: If a caption is not a number or the number of captions
            is not a multiple of *columns*.
    """
    # ``-1`` lets NumPy derive the row count with exact integer arithmetic
    # instead of going through float division.
    return np.array([float(text) for text in texts]).reshape(-1, columns)


def main():
    """Scrape the odds table, print it with the elapsed time, then quit Chrome."""
    started = time.monotonic()
    driver = webdriver.Chrome(options=build_options())
    try:
        driver.get(URL)
        elements = WebDriverWait(driver, 2).until(
            EC.visibility_of_all_elements_located((By.XPATH, ODDS_XPATH))
        )
        print(odds_matrix([element.text for element in elements]))
        # Report the time before teardown so the figure covers the same work
        # as the original measurement (start-up, page load, extraction).
        print(f"{time.monotonic() - started:.2f}")
    finally:
        # The original never released the browser; quit() stops Chrome and
        # chromedriver even when the wait raises a timeout.
        driver.quit()


if __name__ == "__main__":
    main()
您好,我有一个简单的 Python 脚本,它可以自动打开网页并从中提取数据。它需要 5 秒钟才能执行完。就我的情况而言,我想要一个更快的脚本,能够瞬间运行完成,或者最多只需 2 秒。
这是脚本:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
url = 'https://www.coteur.com/match/cotes-barcelone-huesca-rid1163090.html'
# Locator for the bookmaker odds buttons on the match page.
ODDS_XPATH = '//button[contains(@class, "btn btn-default btn-xs btncote")]'


def build_options():
    """Return the Chrome options used for a headless scraping session."""
    options = Options()
    # Pass the flag explicitly: the ``Options.headless`` setter was deprecated
    # in Selenium 4.8 and has since been removed, while ``--headless`` works
    # on every Selenium release.
    options.add_argument("--headless")
    options.add_argument("window-size=1400,800")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    return options


def odds_matrix(texts, columns=3):
    """Convert odds captions into a 2-D float array with *columns* columns.

    Args:
        texts: Iterable of strings, each holding one decimal number.
        columns: Number of odds per row (3 in the original script).

    Returns:
        A ``numpy.ndarray`` of shape ``(len(texts) // columns, columns)``.

    Raises:
        ValueError: If a caption is not a number or the number of captions
            is not a multiple of *columns*.
    """
    # ``-1`` lets NumPy derive the row count with exact integer arithmetic
    # instead of going through float division.
    return np.array([float(text) for text in texts]).reshape(-1, columns)


def main():
    """Open the match page, print the odds table and always shut Chrome down."""
    driver = webdriver.Chrome(options=build_options())
    try:
        driver.get(url)
        buttons = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.XPATH, ODDS_XPATH))
        )
        print(odds_matrix([button.text for button in buttons]), '\n')
    finally:
        # quit() closes every window and stops chromedriver, so a separate
        # close() is unnecessary; running it in ``finally`` prevents orphaned
        # browser processes when the wait times out.
        driver.quit()


if __name__ == "__main__":
    main()
也许你可以帮助改进这个小脚本以节省一些宝贵的时间。 谢谢
这是执行的输出:
[[ 1.18 8.25 17. ]
[ 1.18 8.25 17. ]
[ 1.18 8.1 17. ]
[ 1.14 8. 17. ]
[ 1.16 8.75 18. ]
[ 1.2 7.25 10. ]
[ 1.14 7.75 16. ]
[ 1.17 8. 16. ]
[ 1.16 8.8 19. ]
[ 1.16 7. 12. ]
[ 1.13 8.5 18.5 ]]
real 0m4,978s
user 0m1,342s
sys 0m0,573s
运行需要 5 秒。我的目标是减少执行时间。
您的执行时间可能取决于几个因素:
- 运行代码的机器
- 您的连接带宽
- 您请求的数据量
话虽如此,我使用了您的代码,得到的执行时间是 2.31 秒。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
URL = 'https://www.coteur.com/match/cotes-barcelone-huesca-rid1163090.html'
# Locator for the bookmaker odds buttons on the match page.
ODDS_XPATH = '//button[contains(@class, "btn btn-default btn-xs btncote")]'


def build_options():
    """Return the Chrome options used for a headless scraping session."""
    options = Options()
    # Pass the flag explicitly: the ``Options.headless`` setter was deprecated
    # in Selenium 4.8 and has since been removed, while ``--headless`` works
    # on every Selenium release.
    options.add_argument("--headless")
    options.add_argument("window-size=1400,800")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    return options


def odds_matrix(texts, columns=3):
    """Convert odds captions into a 2-D float array with *columns* columns.

    Args:
        texts: Iterable of strings, each holding one decimal number.
        columns: Number of odds per row (3 in the original script).

    Returns:
        A ``numpy.ndarray`` of shape ``(len(texts) // columns, columns)``.

    Raises:
        ValueError: If a caption is not a number or the number of captions
            is not a multiple of *columns*.
    """
    # ``-1`` lets NumPy derive the row count with exact integer arithmetic
    # instead of going through float division.
    return np.array([float(text) for text in texts]).reshape(-1, columns)


def main():
    """Scrape the odds table, print it with the elapsed time, then quit Chrome."""
    started = time.monotonic()
    driver = webdriver.Chrome(options=build_options())
    try:
        driver.get(URL)
        elements = WebDriverWait(driver, 2).until(
            EC.visibility_of_all_elements_located((By.XPATH, ODDS_XPATH))
        )
        print(odds_matrix([element.text for element in elements]))
        # Report the time before teardown so the figure covers the same work
        # as the original measurement (start-up, page load, extraction).
        print(f"{time.monotonic() - started:.2f}")
    finally:
        # The original never released the browser; quit() stops Chrome and
        # chromedriver even when the wait raises a timeout.
        driver.quit()


if __name__ == "__main__":
    main()