Python execute script using multiple browsers Selenium
How can I execute the script below using multiple browsers? Every n urls should be executed with a separate browser, and I should be able to define the value of n (parallel scraping):
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

browser = webdriver.Chrome()

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def parse_data(url):
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            browser.quit()
            continue
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# URLs go here
urls = {
    "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
    "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
    "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
    "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
    "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
    "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
    "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
    "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
}
if __name__ == '__main__':
    results = None
    for url in urls:
        game_data = parse_data(url)
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)
    print(results)
Currently the script uses one browser window for all urls. How do I modify the code to open multiple browser instances, one for every n urls, to do the same job faster, and then append everything to results?
I would take the following approach (a minimal sketch follows the list):

- Spawn a new thread (info here: Spawning a thread in python)
- Create a new instance of the browser with browser = webdriver.Chrome() in each thread
- Proceed as usual
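A minimal sketch of those three steps, assuming a hypothetical scrape function that stands in for the real parse_data logic (not code from the original post):

import threading
from selenium import webdriver

def scrape(url):
    # each thread creates its own browser instance
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        print(url, len(browser.page_source))  # here you would parse the page
    finally:
        browser.quit()  # always release this thread's browser

if __name__ == '__main__':
    urls = ["https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/"]
    threads = [threading.Thread(target=scrape, args=(u,)) for u in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for all threads to finish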
Using DevTools in Chrome/Firefox (tab: Network, filter: JS, XHR) I found the URLs the page uses to get its data from the server:

https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/
https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/
etc.

The URLs are similar; the difference is xbNfvuAM, l8FEjeUE, which I found in the page source as PageTournament({"id":"l8FEjeUE", ..., so I can generate these URLs myself.

This way I could create code that gets the HTML using only requests, without Selenium. The original code needs ~20s, and with requests it needs only ~6s.

BTW: I also reduced the code in parse_data, using only a DataFrame without class GameData.
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
from multiprocessing import Pool

# --- functions ---

def get_html(url):
    r = requests.get(url, headers=headers)
    text = r.text
    # extract the tournament id from `PageTournament({"id":"...", ...`
    start = text.find('PageTournament({"id":"') + len('PageTournament({"id":"')
    end = text.find('"', start)
    code = text[start:end]
    print(f'code: {code}')
    url = f'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/{code}/X0/1/0/1/'
    r = requests.get(url, headers=headers)
    text = r.text
    # remove `globals.jsonpCallback('...',` at the start
    text = text.split(',', 1)[1]
    text = text[:-2]  # remove `);` at the end
    # `print('json:', a, '...', b)` may display partially because other
    # processes may print their own text, so build and print one piece
    print(f'json: {text[:25]} ... {text[-25:]}')
    data = json.loads(text)
    html = data['d']['html']
    print(f'html: {html[:25]} ... {html[-25:]}')
    return html

def parse_data(html):
    try:
        df = pd.read_html(html)[0]
    except KeyError:
        print('KeyError')
        return
    soup = bs(html, "lxml")
    header = soup.select('table th.first2.tl a')
    if not header:
        return
    df['country'] = header[1].text
    df['league'] = header[2].text
    return df

def process(url):
    return parse_data(get_html(url))
# --- main ---

# needed headers - on some systems they have to be outside `__main__`
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'referer': 'https://www.oddsportal.com/',
}

if __name__ == '__main__':
    # urls for AJAX requests
    # ajax_urls = {
    #     # for 'view-source:https://www.oddsportal.com/soccer/romania/superliga-women/results/#/'
    #     'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/xbNfvuAM/X0/1/0/1/',
    #     # for 'https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/'
    #     'https://fb.oddsportal.com/ajax-sport-country-tournament-archive/1/l8FEjeUE/X0/1/0/1/',
    # }
    # you can find `l8FEjeUE` in the original page as `PageTournament({"id":"l8FEjeUE", ...`

    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }

    time_start = time.time()

    # empty `DataFrame` so I don't have to check `if results is None`
    results = pd.DataFrame()

    with Pool(10) as p:
        all_game_data = p.map(process, urls)

    for game_data in all_game_data:
        if game_data is None:
            #print('game_data', game_data)
            continue
        results = results.append(game_data, ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)
    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)
EDIT:

As @αūɱҽudαмєяιcαη found out, headers has to be outside __main__ because on some systems it may raise an error otherwise: when processes are started with the "spawn" method, the workers re-import the module, so the names they use must be defined at module level.
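A minimal, generic illustration of that point (my own example, not from the original post): with the "spawn" start method every worker re-imports the module, so a module-level headers is re-created in each worker, while anything defined only under __main__ is not.

from multiprocessing import Pool

# module level: re-created in every worker when the module is re-imported
headers = {'user-agent': 'test-agent'}

def fetch(url):
    # the worker can read the module-level `headers`
    return f'{url} -> {headers["user-agent"]}'

if __name__ == '__main__':
    with Pool(2) as p:
        print(p.map(fetch, ['url1', 'url2']))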
EDIT:

I created code that runs the original code with multiprocessing. The problem is that the browser can't be sent to the processes; every process has to run its own Selenium, so it shows 5 browsers at the same time. It also needs more time to start all the browsers, and it took me ~40s.

Maybe if the processes used queues to get a URL and send back the HTML, then they could reuse one browser (or a few browsers running at the same time), but it would need more complex code (a rough sketch follows the code below).
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
from multiprocessing import Pool

# --- classes ---

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

# --- functions ---

def parse_data(url):
    browser = webdriver.Chrome()
    while True:
        try:
            browser.get(url)
            df = pd.read_html(browser.page_source)[0]
            break
        except KeyError:
            print('KeyError:', url)
            continue
    html = browser.page_source
    browser.quit()
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data
# --- main ---

if __name__ == '__main__':
    # URLs go here
    urls = {
        "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
        "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
        "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
        "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
        "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
        "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
        "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
        "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
        "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
        "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
    }

    time_start = time.time()

    results = None

    with Pool(5) as p:
        all_game_data = p.map(parse_data, urls)

    for game_data in all_game_data:
        if game_data is None:
            #print('game_data', game_data)
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)

    time_end = time.time()
    time_diff = (time_end - time_start)
    print(f'time: {time_diff:.2f} s')

    print('--- results ---')
    print(results)
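The queue idea mentioned above is not implemented in the original post; a rough sketch (my own worker function and sentinel handling, error handling omitted) could look like this. Each worker process owns one browser and reuses it for every URL it receives:

import pandas as pd
from selenium import webdriver
from multiprocessing import Process, Queue

def worker(task_queue, result_queue):
    # one browser per process, reused for every URL this worker receives
    browser = webdriver.Chrome()
    try:
        while True:
            url = task_queue.get()
            if url is None:  # sentinel: no more work
                break
            browser.get(url)
            result_queue.put((url, browser.page_source))
    finally:
        browser.quit()

if __name__ == '__main__':
    urls = ["https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/"]
    task_queue, result_queue = Queue(), Queue()
    n_workers = 2
    workers = [Process(target=worker, args=(task_queue, result_queue))
               for _ in range(n_workers)]
    for w in workers:
        w.start()
    for url in urls:
        task_queue.put(url)
    for _ in workers:
        task_queue.put(None)  # one sentinel per worker
    for _ in urls:
        url, html = result_queue.get()
        print(url, len(html))  # here you would parse `html` as above
    for w in workers:
        w.join()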
Here is code that uses a multithreading pool limited to a certain number of browsers, given by MAX_BROWSERS, where once a driver has been started it can be reused by subsequently submitted tasks.

Note that I have removed the while True: loop from the beginning of function parse_data because, frankly, I could not understand its function. You can, of course, restore it if you feel you need it. Whatever you do, though, you do not want to be calling browser.quit.

In the example below I have set MAX_BROWSERS = 3:
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        #options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        #print('The driver has been "quitted".')

threadLocal = threading.local()

def create_driver():
    # one driver per thread, created lazily and reused across tasks
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def parse_data(url):
    try:
        browser = create_driver()
        browser.get(url)
        df = pd.read_html(browser.page_source)[0]
    except KeyError:
        print('KeyError')
        return None
    html = browser.page_source
    soup = bs(html, "lxml")
    cont = soup.find('div', {'id': 'wrap'})
    content = cont.find('div', {'id': 'col-content'})
    content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    if main is None:
        return None
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text
    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# URLs go here
urls = {
    "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
    "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
    "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
    "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
    "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
    "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
    "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
    "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
}

if __name__ == '__main__':
    results = None
    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 3
    pool = ThreadPool(min(MAX_BROWSERS, len(urls)))
    for game_data in pool.imap(parse_data, urls):
        if game_data is None:
            continue
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)
    print(results)
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
Prints:
DevTools listening on ws://127.0.0.1:61928/devtools/browser/1874311c-a84e-4903-a5da-c64d93dd86cb
DevTools listening on ws://127.0.0.1:61929/devtools/browser/078e7a54-3a0d-43d5-a05e-04feae6242bc
DevTools listening on ws://127.0.0.1:61930/devtools/browser/241ba2b3-a1ab-4a41-8dec-82a051bdc4bc
            date   time                       game  score home_odds draw_odds away_odds country                       league
0 None 16:06 Atl. Madrid - Juventus 2:1 160 +235 166 World International Champions Cup
1 04 Aug 2019 14:06 Tottenham - Inter 1:2 pen. -145 +295 363 World International Champions Cup
2 03 Aug 2019 16:36 Manchester Utd - AC Milan 3:2 pen. -128 +279 332 World International Champions Cup
3 28 Jul 2019 19:06 AC Milan - Benfica 0:1 190 +252 131 World International Champions Cup
4 27 Jul 2019 00:06 Real Madrid - Atl. Madrid 3:7 106 +259 233 World International Champions Cup
.. ... ... ... ... ... ... ... ... ...
245 29 Jan 2021 17:30 Den Haag W - VV Alkmaar W 3:1 -312 +424 550 Netherlands Eredivisie Cup Women
246 04 Dec 2020 17:30 Heerenveen W - VV Alkmaar W 3:1 -244 +373 450 Netherlands Eredivisie Cup Women
247 04 Dec 2020 17:30 PEC Zwolle W - Den Haag W 3:0 173 +269 119 Netherlands Eredivisie Cup Women
248 04 Dec 2020 17:30 PSV W - Ajax W 2:2 110 +256 193 Netherlands Eredivisie Cup Women
249 04 Dec 2020 17:30 Twente W - Excelsior W 9:2 -1667 +867 1728 Netherlands Eredivisie Cup Women
[250 rows x 9 columns]