Python execute script using multiple browsers Selenium

How can I execute the following script using multiple browsers?

Every n urls should be executed with a separate browser. I should be able to define the value of n (parallel scraping).

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

browser = webdriver.Chrome()

class GameData:

    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def parse_data(url):
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            browser.quit()
            continue
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# URLs go here
urls = {
    "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
    "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
    "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
    "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
    "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
    "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
    "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
    "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
}

if __name__ == '__main__':

    results = None

    for url in urls:
        game_data = parse_data(url)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            results = pd.concat([results, result], ignore_index=True)

    print(results)

Currently the script uses one browser window for all of the urls.

How can I modify the code so that it opens multiple browser instances, one for every n urls, to finish the same job faster and then append everything to results?

I would take the following approach (a minimal sketch follows the list):

  1. Spawn a new thread (info here: Spawning a thread in python)
  2. In each thread, create a new instance of the browser using browser = webdriver.Chrome()
  3. Proceed as usual
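
A minimal sketch of that approach, assuming the urls set from the question and a chromedriver on PATH; scrape_chunk, n and the shared results list are illustrative names, not part of the original code:

import threading
from selenium import webdriver

results = []  # list.append is thread-safe, so the worker threads can share this

def scrape_chunk(chunk):
    # Each thread creates, uses and quits its own browser instance.
    browser = webdriver.Chrome()
    try:
        for url in chunk:
            browser.get(url)
            results.append(browser.page_source)  # parse the page source here as usual
    finally:
        browser.quit()

n = 3  # urls per browser - this is where you define n
url_list = list(urls)  # `urls` is the set defined in the question
chunks = [url_list[i:i + n] for i in range(0, len(url_list), n)]

threads = [threading.Thread(target=scrape_chunk, args=(chunk,)) for chunk in chunks]
for t in threads:
    t.start()
for t in threads:
    t.join()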

Using DevTools in Chrome/Firefox (tab: Network, filter: JS, XHR) I found the urls which the page uses to get data from the server:

https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/
https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/

etc.

The urls are similar. The difference is xbNfvuAM vs l8FEjeUE, which I found in the page source as PageTournament({"id":"l8FEjeUE", ..., so I can generate these urls.

This way I can create code which gets the HTML using only requests, without Selenium.

The original code needs ~20s, and with requests it takes only ~6s.

BTW: I also reduced the code in parse_data and use only a DataFrame, without class GameData.

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
from multiprocessing import Pool

# --- functions ---

def get_html(url):
    r = requests.get(url, headers=headers)
    text = r.text
    start = text.find('PageTournament({"id":"') + len('PageTournament({"id":"')
    end = text.find('"', start)
    code = text[start:end]
    print(f'code: {code}')

    url = f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{code}/X0/1/0/1/'

    r = requests.get(url, headers=headers)
    text = r.text

    # remove `globals.jsonpCallback('...',` at the start
    text = text.split(',', 1)[1]
    text = text[:-2]              # remove `);` at the end

    # print('json:', text[:25], '...', text[-25:])  # may display partially because other processes may print their own text
    print(f'json: {text[:25]} ... {text[-25:]}')  # displays it all in one piece

    data = json.loads(text)
    html = data['d']['html']

    # print('html:', html[:25], '...', html[-25:])  # may display partially because other processes may print their own text
    print(f'html: {html[:25]} ... {html[-25:]}')

    return html


def parse_data(html):
    try:
        df = pd.read_html(html)[0]
    except KeyError:
        print('KeyError')
        return

    soup = bs(html, "lxml")
    header = soup.select('table th.first2.tl a')

    if not header:
        return

    df['country'] = header[1].text
    df['league'] = header[2].text

    return df


def process(url):
    return parse_data(get_html(url))

# --- main ---

# needed headers - on some systems it has to be outside `__main__`

headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

if __name__ == '__main__':

    # urls for AJAX requests
    # ajax_urls = {
    #    # for 'view-source:https://www.oddsportal.com/soccer/romania/superliga-women/results/#/'
    #    'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/',
    #    # for 'https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/'
    #    'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/',
    # }
    # you can find `l8FEjeUE` in the original page as `PageTournament({"id":"l8FEjeUE", ...`

    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }

    time_start = time.time()

    # empty `DataFrame` so I don't have to check `if results is None`
    results = pd.DataFrame()

    with Pool(10) as p:
        all_game_data = p.map(process, urls)

    for game_data in all_game_data:

        if game_data is None:
            #print('game_data', game_data)
            continue

        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        results = pd.concat([results, game_data], ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)

    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)

EDIT:

As @αūɱҽudαмєяιcαη found, headers has to be outside __main__, because on some systems it may otherwise raise an error: when worker processes are started with spawn rather than fork, each child re-imports the module, so names the worker function relies on must be defined at module level.

Documentation: multiprocessing
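
A minimal illustration of that constraint, assuming the spawn start method (the default on Windows and macOS), where each child re-imports the module; the fetch helper here is hypothetical:

import requests
from multiprocessing import Pool

# module level: re-created in every child process when it re-imports this module
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

def fetch(url):
    # the worker can rely on `headers` existing, even under spawn
    return url, requests.get(url, headers=headers).status_code

if __name__ == '__main__':
    # anything defined only here would NOT exist in the children
    with Pool(2) as p:
        for url, status in p.map(fetch, ['https://www.oddsportal.com/'] * 2):
            print(url, status)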


EDIT:

I created code which runs the original code with multiprocessing.

The problem is that browser can't be sent to each process, so every process has to run its own Selenium, and it displays 5 browsers at the same time. It also needs more time to start all the browsers: it took ~40s.

Maybe if the processes ran with queues, to get the URLs and send back the HTML, they could reuse one browser (or a few browsers running at the same time). But that needs more complex code; a rough sketch of that idea follows the code below.

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
from multiprocessing import Pool

# --- classes ---

class GameData:

    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

# --- functions ---

def parse_data(url):
    browser = webdriver.Chrome()
    
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            print('KeyError:', url)
            continue
            
    html = browser.page_source
    browser.quit()            

    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# --- main ---

if __name__ == '__main__':

    # URLs go here
    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }


    time_start = time.time()
    
    results = None
    
    with Pool(5) as p:
        all_game_data = p.map(parse_data, urls)
        
    for game_data in all_game_data:
            
        if game_data is None:
            #print('game_data', game_data)
            continue
        
        result = pd.DataFrame(game_data.__dict__)
        
        if results is None:
            results = result
        else:
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            results = pd.concat([results, result], ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)
    print(f'time: {time_diff:.2f} s')
    
    print('--- results ---')
    print(results)    
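
A rough sketch of the queue idea mentioned above, where each worker process creates one browser and reuses it for every url it pulls from the queue; worker, NUM_WORKERS and the None sentinels are illustrative, and the sketch is untested against this site:

import multiprocessing as mp
from selenium import webdriver

def worker(url_queue, html_queue):
    # One browser per worker process, reused for every url the worker pulls.
    browser = webdriver.Chrome()
    try:
        while True:
            url = url_queue.get()
            if url is None:        # sentinel: no more work
                break
            browser.get(url)
            html_queue.put((url, browser.page_source))
    finally:
        browser.quit()

if __name__ == '__main__':
    NUM_WORKERS = 3

    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    }  # use the full set from above

    url_queue = mp.Queue()
    html_queue = mp.Queue()

    for url in urls:
        url_queue.put(url)
    for _ in range(NUM_WORKERS):
        url_queue.put(None)        # one sentinel per worker

    workers = [mp.Process(target=worker, args=(url_queue, html_queue))
               for _ in range(NUM_WORKERS)]
    for w in workers:
        w.start()

    # drain the result queue before joining, otherwise a full pipe can deadlock
    pages = [html_queue.get() for _ in range(len(urls))]

    for w in workers:
        w.join()

Each HTML string in pages could then be passed to parse_data as before.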

Here is code that uses a multithreading pool limited to a certain number of browsers given by MAX_BROWSERS, where once a driver has been started it can be reused by subsequently submitted tasks.

Note that I have removed the while True: loop from the start of function parse_data because, frankly, I could not understand its function. You can, of course, restore it if you feel the need. Whatever you do, though, you do not want to be calling browser.quit there.

In the example below I have set MAX_BROWSERS = 3:

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment the next line to suppress logging:
        #options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit() # clean up driver when we are cleaned up
        #print('The driver has been "quitted".')

threadLocal = threading.local()

def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver

class GameData:

    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def parse_data(url):
    try:
        browser = create_driver()
        browser.get(url)
        df = pd.read_html(browser.page_source)[0]
    except KeyError:
        print('KeyError')
        return None
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# URLs go here
urls = {
    "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
    "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
    "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
    "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
    "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
    "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
    "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
    "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
}

if __name__ == '__main__':
    results = None
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 3
    pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
    for game_data in pool.imap(parse_data, urls):
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = pd.concat([results, result], ignore_index=True)  # pandas 2.0 removed DataFrame.append

    print(results)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect() # a little extra insurance

Prints:


DevTools listening on ws://127.0.0.1:61928/devtools/browser/1874311c-a84e-4903-a5da-c64d93dd86cb

DevTools listening on ws://127.0.0.1:61929/devtools/browser/078e7a54-3a0d-43d5-a05e-04feae6242bc

DevTools listening on ws://127.0.0.1:61930/devtools/browser/241ba2b3-a1ab-4a41-8dec-82a051bdc4bc
0           None  16:06       Atl. Madrid - Juventus       2:1       160      +235       166         World  International Champions Cup
1    04 Aug 2019  14:06            Tottenham - Inter  1:2 pen.      -145      +295       363         World  International Champions Cup
2    03 Aug 2019  16:36    Manchester Utd - AC Milan  3:2 pen.      -128      +279       332         World  International Champions Cup
3    28 Jul 2019  19:06           AC Milan - Benfica       0:1       190      +252       131         World  International Champions Cup
4    27 Jul 2019  00:06    Real Madrid - Atl. Madrid       3:7       106      +259       233         World  International Champions Cup
..           ...    ...                          ...       ...       ...       ...       ...           ...                          ...
245  29 Jan 2021  17:30    Den Haag W - VV Alkmaar W       3:1      -312      +424       550   Netherlands         Eredivisie Cup Women
246  04 Dec 2020  17:30  Heerenveen W - VV Alkmaar W       3:1      -244      +373       450   Netherlands         Eredivisie Cup Women
247  04 Dec 2020  17:30    PEC Zwolle W - Den Haag W       3:0       173      +269       119   Netherlands         Eredivisie Cup Women
248  04 Dec 2020  17:30               PSV W - Ajax W       2:2       110      +256       193   Netherlands         Eredivisie Cup Women
249  04 Dec 2020  17:30       Twente W - Excelsior W       9:2     -1667      +867      1728   Netherlands         Eredivisie Cup Women

[250 rows x 9 columns]