使用 url 范围时如何在 Selenium Python 中添加多线程?
How to add multithreading in Selenium Python when using url range?
我的想法是:如果能把 URL 范围拆分成 5 份,再让 5 个 chromedriver 实例各自处理其中一份,抓取速度应该会更快。这是我最主要的问题。另外,是让每个 chromedriver 写入各自的 csv 文件更好,还是需要额外添加一些东西,把所有抓取结果汇总到同一个文件中?我在这里真的不知所措,这已经是在挑战我现有的技能水平了。只要能在如何让多线程工作这一点上给出任何具体帮助,我都将感激不尽。谢谢!
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Sequential scraper: visits every record page with a single Chrome instance
# and appends one "record id, name" row per page to the CSV file.
path_to_file = "test1.csv"
# The with-block guarantees buffered rows are flushed and the file is closed,
# even if the run is interrupted part-way through.
with open(path_to_file, 'a', encoding="utf-8", newline='') as csvFile:
    csvWriter = csv.writer(csvFile)
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    try:
        header_added = False
        time.sleep(3)
        for i in range(1, 153512):
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without the "Name:" row still gets a CSV
                # row with an empty name. `Exception` (rather than a bare
                # `except`) lets Ctrl+C and SystemExit stop the run.
                Name = ''
            csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()
试试这个:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Output CSV used by the scraping code below; opened in append mode ('a'), so
# rows already in the file are kept.
path_to_file = "test1.csv"
# newline='' is what the csv module documentation recommends for files handed
# to csv.writer.
csvFile = open(path_to_file, 'a', encoding="utf-8", newline='')
# Module-level writer; init_driver_worker writes its rows through this object.
csvWriter = csv.writer(csvFile)
# NOTE(review): header_added is not read anywhere in the visible code.
header_added = False
# Fixed 3-second pause before any work starts.
time.sleep(3)
import threading

# Serializes writes to the shared module-level csvWriter. Text-mode file
# objects are not thread-safe, so unsynchronized writerow() calls from several
# worker threads could interleave or corrupt rows.
_csv_write_lock = threading.Lock()


def init_driver_worker(_range_task):
    """Scrape every record id in ``_range_task`` with a private Chrome instance.

    Intended as a thread target: each call starts its own headless Chrome,
    visits one record page per id, and appends an ``[id, name]`` row to the
    module-level ``csvWriter``.

    Args:
        _range_task: Iterable of integer record ids to visit.

    Returns:
        None. The function returns normally when its ids are exhausted, which
        is all a thread needs in order to finish.
    """
    ##### init driver
    options = webdriver.ChromeOptions()
    # You can't run multiple instances of Chrome with the same profile being
    # used, so either create a new profile for each instance or use
    # incognito mode.
    options.add_argument("--incognito")
    options.add_argument("--headless")  # use headless browser (no GUI) to be faster
    driver = webdriver.Chrome(options=options)
    ##### do the task
    try:
        for i in _range_task:
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without the "Name:" row still yields a
                # row with an empty name. `Exception` (rather than a bare
                # `except`) keeps KeyboardInterrupt/SystemExit working.
                Name = ''
            with _csv_write_lock:
                csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Always shut the browser down; otherwise every worker leaves a
        # Chrome/chromedriver process behind, even on success.
        driver.quit()
def split_range(_range, parts):
    """Split a sliceable sequence into at most ``parts`` contiguous chunks.

    Chunk sizes differ by at most one element; the larger chunks come first.
    Order is preserved and every element appears in exactly one chunk.

    Args:
        _range: Any object supporting ``len()`` and slicing (``range``,
            ``list``, ...).
        parts: Desired number of chunks; must be at least 1.

    Returns:
        A list of slices of ``_range``. It holds exactly ``parts`` chunks when
        ``len(_range) >= parts``, one single-element chunk per element when
        the input is shorter than ``parts``, and is empty for empty input.

    Raises:
        ValueError: If ``parts`` is smaller than 1.
    """
    if parts < 1:
        raise ValueError("parts must be at least 1")
    # divmod spreads the remainder over the first `extra` chunks, so the
    # result never has a stray extra chunk (plain floor division turned
    # 153511 items / 10 parts into 11 chunks) and never a zero step size.
    base, extra = divmod(len(_range), parts)
    chunks = []
    start = 0
    for index in range(parts):
        size = base + (1 if index < extra else 0)
        if size == 0:
            # Fewer elements than parts: all remaining chunks would be empty.
            break
        chunks.append(_range[start:start + size])
        start += size
    return chunks
my_range = range(1, 153512)
# Ask for 10 chunks; one Chrome instance (thread) is started per returned chunk.
chunks = split_range(my_range, 10)
from threading import Thread
thread_workers = []
for chunk in chunks:
    # args must be a tuple of positional arguments: (chunk,) passes the whole
    # chunk as the single `_range_task` argument.
    worker = Thread(target=init_driver_worker, args=(chunk,))
    thread_workers.append(worker)
    worker.start()
# wait for the thread_workers to finish
for worker in thread_workers:
    worker.join()
# All workers are done: close the shared CSV file so buffered rows are
# flushed to disk and the file handle is released.
csvFile.close()
我的想法是:如果能把 URL 范围拆分成 5 份,再让 5 个 chromedriver 实例各自处理其中一份,抓取速度应该会更快。这是我最主要的问题。另外,是让每个 chromedriver 写入各自的 csv 文件更好,还是需要额外添加一些东西,把所有抓取结果汇总到同一个文件中?我在这里真的不知所措,这已经是在挑战我现有的技能水平了。只要能在如何让多线程工作这一点上给出任何具体帮助,我都将感激不尽。谢谢!
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Sequential scraper: visits every record page with a single Chrome instance
# and appends one "record id, name" row per page to the CSV file.
path_to_file = "test1.csv"
# The with-block guarantees buffered rows are flushed and the file is closed,
# even if the run is interrupted part-way through.
with open(path_to_file, 'a', encoding="utf-8", newline='') as csvFile:
    csvWriter = csv.writer(csvFile)
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    try:
        header_added = False
        time.sleep(3)
        for i in range(1, 153512):
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without the "Name:" row still gets a CSV
                # row with an empty name. `Exception` (rather than a bare
                # `except`) lets Ctrl+C and SystemExit stop the run.
                Name = ''
            csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()
试试这个:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Output CSV used by the scraping code below; opened in append mode ('a'), so
# rows already in the file are kept.
path_to_file = "test1.csv"
# newline='' is what the csv module documentation recommends for files handed
# to csv.writer.
csvFile = open(path_to_file, 'a', encoding="utf-8", newline='')
# Module-level writer; init_driver_worker writes its rows through this object.
csvWriter = csv.writer(csvFile)
# NOTE(review): header_added is not read anywhere in the visible code.
header_added = False
# Fixed 3-second pause before any work starts.
time.sleep(3)
import threading

# Serializes writes to the shared module-level csvWriter. Text-mode file
# objects are not thread-safe, so unsynchronized writerow() calls from several
# worker threads could interleave or corrupt rows.
_csv_write_lock = threading.Lock()


def init_driver_worker(_range_task):
    """Scrape every record id in ``_range_task`` with a private Chrome instance.

    Intended as a thread target: each call starts its own headless Chrome,
    visits one record page per id, and appends an ``[id, name]`` row to the
    module-level ``csvWriter``.

    Args:
        _range_task: Iterable of integer record ids to visit.

    Returns:
        None. The function returns normally when its ids are exhausted, which
        is all a thread needs in order to finish.
    """
    ##### init driver
    options = webdriver.ChromeOptions()
    # You can't run multiple instances of Chrome with the same profile being
    # used, so either create a new profile for each instance or use
    # incognito mode.
    options.add_argument("--incognito")
    options.add_argument("--headless")  # use headless browser (no GUI) to be faster
    driver = webdriver.Chrome(options=options)
    ##### do the task
    try:
        for i in _range_task:
            url = f"https://www.ancestry.com/discoveryui-content/view/{i}:61965"
            print(url)
            driver.get(url)
            try:
                Name = driver.find_element(By.XPATH,"//table[@id='recordServiceData']//tr[contains(.,'Name:')]").text.replace("Name:", "")
            except Exception:
                # Best effort: a page without the "Name:" row still yields a
                # row with an empty name. `Exception` (rather than a bare
                # `except`) keeps KeyboardInterrupt/SystemExit working.
                Name = ''
            with _csv_write_lock:
                csvWriter.writerow([i, Name])
            print(Name)
    finally:
        # Always shut the browser down; otherwise every worker leaves a
        # Chrome/chromedriver process behind, even on success.
        driver.quit()
def split_range(_range, parts):
    """Split a sliceable sequence into at most ``parts`` contiguous chunks.

    Chunk sizes differ by at most one element; the larger chunks come first.
    Order is preserved and every element appears in exactly one chunk.

    Args:
        _range: Any object supporting ``len()`` and slicing (``range``,
            ``list``, ...).
        parts: Desired number of chunks; must be at least 1.

    Returns:
        A list of slices of ``_range``. It holds exactly ``parts`` chunks when
        ``len(_range) >= parts``, one single-element chunk per element when
        the input is shorter than ``parts``, and is empty for empty input.

    Raises:
        ValueError: If ``parts`` is smaller than 1.
    """
    if parts < 1:
        raise ValueError("parts must be at least 1")
    # divmod spreads the remainder over the first `extra` chunks, so the
    # result never has a stray extra chunk (plain floor division turned
    # 153511 items / 10 parts into 11 chunks) and never a zero step size.
    base, extra = divmod(len(_range), parts)
    chunks = []
    start = 0
    for index in range(parts):
        size = base + (1 if index < extra else 0)
        if size == 0:
            # Fewer elements than parts: all remaining chunks would be empty.
            break
        chunks.append(_range[start:start + size])
        start += size
    return chunks
my_range = range(1, 153512)
# Ask for 10 chunks; one Chrome instance (thread) is started per returned chunk.
chunks = split_range(my_range, 10)
from threading import Thread
thread_workers = []
for chunk in chunks:
    # args must be a tuple of positional arguments: (chunk,) passes the whole
    # chunk as the single `_range_task` argument.
    worker = Thread(target=init_driver_worker, args=(chunk,))
    thread_workers.append(worker)
    worker.start()
# wait for the thread_workers to finish
for worker in thread_workers:
    worker.join()
# All workers are done: close the shared CSV file so buffered rows are
# flushed to disk and the file handle is released.
csvFile.close()