使用 Selenium 和 Python 从足球网站抓取一些数据
Scraping some data from a football website using Selenium and Python
我正在尝试制作一个使用 Selenium 提取数据的 Python 程序:首先必须关闭两个警报,然后单击“显示所有比赛”按钮,最后需要单击每个“stats”按钮(有多个,并且它们都具有相同的 class 名称),以便从对应的 table 中提取特定行。
我需要为每个游戏提取以蓝色突出显示的 4 个值
我已经完成了前两个步骤,但现在我陷入了最后一个步骤,我必须单击每个“统计”按钮,从每个 table 中提取 4 个值,然后关闭 window 并进入下一场比赛。
这是我的代码
# Scrape soccerstats.com with Selenium: open the matchday page, dismiss the
# two pop-ups, then reveal the full match list.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
# Path to a local chromedriver binary; must match the installed Chrome version.
s=Service("C:/Users/dhias/OneDrive/Bureau/stgg/chromedriver.exe")
driver=webdriver.Chrome(service=s)
driver.get("https://www.soccerstats.com/matches.asp?matchday=1#")
driver.maximize_window()
time.sleep(1)  # brief pause so the page can start rendering before the explicit waits below
# Dismiss the first overlay (presumably the cookie/consent dialog — confirm on the live page).
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[mode='primary']"))).click()
# Dismiss the second overlay (floating notification button).
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID,"steady-floating-button"))).click()
# Expand the page from the default view to the full list of matches.
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Show all matches']"))).click()
我尝试点击每个具有相同 class 名称的“统计”按钮,但它不起作用
# BROKEN attempt (quoted from the question, indentation lost in transcription):
# the wait inside the loop re-runs the SAME XPath every iteration, so it always
# resolves to the first matching 'stats' button; after a click the page changes
# and the elements returned by find_elements go stale — hence "it does not work".
for element in driver.find_elements(By.XPATH,"//a[@class='myButton' and text()='stats']"):
WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//a[@class='myButton' and text()='stats']"))).click()
网站链接:soccerstats website
将链接保存到数组中,然后点击,因为点击后您不再位于包含链接的页面上
# Collect every 'stats' URL up front, then visit them one by one — clicking a
# link would navigate away from the list page and invalidate the element handles.
stat_links = [anchor.get_attribute('href')
              for anchor in driver.find_elements(By.XPATH, "//a[@class='myButton' and text()='stats']")]
for stats_url in stat_links:
    driver.get(stats_url)
    # do your stuff
你确定你需要在这里使用 Selenium 吗?您可以使用 pandas 和 requests 轻松提取这些表。
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup

url = 'https://www.soccerstats.com/matches.asp?matchday=1#'
# Browser-like User-Agent so the site does not reject the default requests UA.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# 'string=' replaces the 'text=' keyword, deprecated since BeautifulSoup 4.4.
links = soup.find_all('a', string='stats')
# Keep only per-match pages ('pmatch' URLs); .get() avoids a KeyError on
# anchors that carry no href attribute.
filtered_links = [a.get('href') for a in links if 'pmatch' in a.get('href', '')]
tables = {}
for count, link in enumerate(filtered_links, start=1):
    # Safe default: the original code referenced teamsStr in the except branch,
    # which raised NameError if the very first request/parse failed.
    teamsStr = '?'
    try:
        html = requests.get('https://www.soccerstats.com/' + link, headers=headers).text
        soup = BeautifulSoup(html, 'html.parser')
        heading = soup.find('h2', string='Goal statistics')
        if heading is None:
            # No goal-statistics section on this match page at all.
            raise ValueError('no "Goal statistics" heading on page')
        teams = heading.find_next('table')
        # First and last cells of the header table hold the two team names.
        cells = teams.find_all('td')
        teamsStr = cells[0].text + ' ' + cells[-1].text
        goalsTable = teams.find_next('table')
        # StringIO wrapper: passing a literal HTML string to read_html is deprecated.
        df = pd.read_html(StringIO(str(goalsTable)))[0]
        print(f'{count} of {len(filtered_links)}: {teamsStr}')
        tables[teamsStr] = df
    except Exception as e:
        print(e)
        print(f'{count} of {len(filtered_links)}: {teamsStr} !! NO GOALS STATISTICS !!')
输出:
我正在尝试制作一个使用 Selenium 提取数据的 Python 程序:首先必须关闭两个警报,然后单击“显示所有比赛”按钮,最后需要单击每个“stats”按钮(有多个,并且它们都具有相同的 class 名称),以便从对应的 table 中提取特定行。
我需要为每个游戏提取以蓝色突出显示的 4 个值。
我已经完成了前两个步骤,但现在我陷入了最后一个步骤,我必须单击每个“统计”按钮,从每个 table 中提取 4 个值,然后关闭 window 并进入下一场比赛。
这是我的代码
# Scrape soccerstats.com with Selenium: open the matchday page, dismiss the
# two pop-ups, then reveal the full match list.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
# Path to a local chromedriver binary; must match the installed Chrome version.
s=Service("C:/Users/dhias/OneDrive/Bureau/stgg/chromedriver.exe")
driver=webdriver.Chrome(service=s)
driver.get("https://www.soccerstats.com/matches.asp?matchday=1#")
driver.maximize_window()
time.sleep(1)  # brief pause so the page can start rendering before the explicit waits below
# Dismiss the first overlay (presumably the cookie/consent dialog — confirm on the live page).
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[mode='primary']"))).click()
# Dismiss the second overlay (floating notification button).
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID,"steady-floating-button"))).click()
# Expand the page from the default view to the full list of matches.
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Show all matches']"))).click()
我尝试点击每个具有相同 class 名称的“统计”按钮,但它不起作用
# BROKEN attempt (quoted from the question, indentation lost in transcription):
# the wait inside the loop re-runs the SAME XPath every iteration, so it always
# resolves to the first matching 'stats' button; after a click the page changes
# and the elements returned by find_elements go stale — hence "it does not work".
for element in driver.find_elements(By.XPATH,"//a[@class='myButton' and text()='stats']"):
WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//a[@class='myButton' and text()='stats']"))).click()
link 到网站:soccerstats website
将链接保存到数组中,然后点击,因为点击后您不再位于包含链接的页面上
# Collect every 'stats' URL up front, then visit them one by one — clicking a
# link would navigate away from the list page and invalidate the element handles.
stat_links = [anchor.get_attribute('href')
              for anchor in driver.find_elements(By.XPATH, "//a[@class='myButton' and text()='stats']")]
for stats_url in stat_links:
    driver.get(stats_url)
    # do your stuff
你确定你需要在这里使用 Selenium 吗?您可以使用 pandas 和 requests 轻松提取这些表。
import requests
# NOTE: this snippet also needs `import requests` (it was fused into the prose
# line above during transcription); restored here so the block runs standalone.
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup

url = 'https://www.soccerstats.com/matches.asp?matchday=1#'
# Browser-like User-Agent so the site does not reject the default requests UA.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# 'string=' replaces the 'text=' keyword, deprecated since BeautifulSoup 4.4.
links = soup.find_all('a', string='stats')
# Keep only per-match pages ('pmatch' URLs); .get() avoids a KeyError on
# anchors that carry no href attribute.
filtered_links = [a.get('href') for a in links if 'pmatch' in a.get('href', '')]
tables = {}
for count, link in enumerate(filtered_links, start=1):
    # Safe default: the original code referenced teamsStr in the except branch,
    # which raised NameError if the very first request/parse failed.
    teamsStr = '?'
    try:
        html = requests.get('https://www.soccerstats.com/' + link, headers=headers).text
        soup = BeautifulSoup(html, 'html.parser')
        heading = soup.find('h2', string='Goal statistics')
        if heading is None:
            # No goal-statistics section on this match page at all.
            raise ValueError('no "Goal statistics" heading on page')
        teams = heading.find_next('table')
        # First and last cells of the header table hold the two team names.
        cells = teams.find_all('td')
        teamsStr = cells[0].text + ' ' + cells[-1].text
        goalsTable = teams.find_next('table')
        # StringIO wrapper: passing a literal HTML string to read_html is deprecated.
        df = pd.read_html(StringIO(str(goalsTable)))[0]
        print(f'{count} of {len(filtered_links)}: {teamsStr}')
        tables[teamsStr] = df
    except Exception as e:
        print(e)
        print(f'{count} of {len(filtered_links)}: {teamsStr} !! NO GOALS STATISTICS !!')
输出: