Python 使用 Selenium 进行网页抓取 - 遍历 href link
Python web scraping using Selenium - iterate through href link
我正在尝试编写一个脚本,使用 selenium 下载许多包含不同 NHL 球员信息的文件;游戏日志。我想为以下table中的每个玩家下载一个文件:https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single
进入该网站后,我想点击 table 中所有玩家的名字。当通过 href link 单击玩家的名字时,将打开一个新的 window。顶部有几个下拉菜单。我想 select "Rate" 而不是 "Counts" 并且 select “游戏日志”而不是 "Player Summary",然后单击 "Submit"。最后,我想点击底部的 CSV(All) 下载一个 CSV 文件。
这是我当前的代码:
from selenium import webdriver
import csv
from selenium.webdriver.support.ui import Select
from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
table = driver.find_element_by_xpath("//table[@class='indreg dataTable no-footer DTFC_Cloned']")
for row in table.find_elements_by_xpath("//tr[@role='row']")
links = driver.find_element_by_xpath('//a[@href]')
links.click()
select = Select(driver.find_element_by_name('rate'))
select.select_by_value("y")
select1 = Select(driver.find_element_by_name('v'))
select1.select_by_value("g")
select2 = Select(driver.find_element_by_type('submit'))
select2.select_by_value("submit")
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//div[@class="dt-button button-csv button-htm15"]')))
CSVall = driver.find_element_by_xpath('//div[@class="dt-button button-csv button-htm15"]')
CSVall.click()
driver.close()
我尝试更改不同的内容,但总是出错。问题出在哪里 ?
此外,我想我应该添加一行来等待网站加载,因为它需要几秒钟; "driver.get" 之后。我不知道在这种情况下结束等待的预期条件应该是什么。
谢谢
你不需要点击每个播放器link而是将 URLs 保存为列表,也有几个错误,你可以看到下面的工作代码
from selenium import webdriver
import csv
from selenium.webdriver.support.ui import Select
from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
playerLinks = driver.find_elements_by_xpath("//table[@class='indreg dataTable no-footer DTFC_Cloned']//a")
playerLinks = [p.get_attribute('href') for p in playerLinks]
print(len(playerLinks))
for url in playerLinks:
driver.get(url)
select = Select(driver.find_element_by_name('rate'))
select.select_by_value("y")
select1 = Select(driver.find_element_by_name('v'))
select1.select_by_value("g")
driver.find_element_by_css_selector('input[type="submit"]').click()
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//a[@class="dt-button buttons-csv buttons-html5"][2]')))
CSVall = driver.find_element_by_xpath('//a[@class="dt-button buttons-csv buttons-html5"][2]')
CSVall.click()
driver.close()
您可以从第一页获取 playerId 并将它们连同表示 Rate 和 Game Log 选择的字符串连接到新 URL 的 queryString 部分,而不是继续点击选择。当然,您可以整理以下内容。
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def getPlayerId(url):
id = url.split('playerid=')[1]
id = id.split('&')[0]
return id
def makeNewURL(playerId):
return 'https://www.naturalstattrick.com/playerreport.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&stdoi=oi&rate=y&v=g&playerid=' + playerId
#chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome()
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
links = driver.find_elements_by_css_selector('table.indreg.dataTable.no-footer.DTFC_Cloned [href*=playerid]')
newLinks = []
for link in links:
newLinks.append(link.get_attribute('href'))
for link in newLinks:
playerId = getPlayerId(link)
link = makeNewURL(playerId)
driver.get(link)
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//a[@class="dt-button buttons-csv buttons-html5"][2]')))
CSVall = driver.find_element_by_xpath('//a[@class="dt-button buttons-csv buttons-html5"][2]')
CSVall.click()
我正在尝试编写一个脚本,使用 selenium 下载许多包含不同 NHL 球员信息的文件;游戏日志。我想为以下table中的每个玩家下载一个文件:https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single
进入该网站后,我想点击 table 中所有玩家的名字。当通过 href link 单击玩家的名字时,将打开一个新的 window。顶部有几个下拉菜单。我想 select "Rate" 而不是 "Counts" 并且 select “游戏日志”而不是 "Player Summary",然后单击 "Submit"。最后,我想点击底部的 CSV(All) 下载一个 CSV 文件。
这是我当前的代码:
from selenium import webdriver
import csv
from selenium.webdriver.support.ui import Select
from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
table = driver.find_element_by_xpath("//table[@class='indreg dataTable no-footer DTFC_Cloned']")
for row in table.find_elements_by_xpath("//tr[@role='row']")
links = driver.find_element_by_xpath('//a[@href]')
links.click()
select = Select(driver.find_element_by_name('rate'))
select.select_by_value("y")
select1 = Select(driver.find_element_by_name('v'))
select1.select_by_value("g")
select2 = Select(driver.find_element_by_type('submit'))
select2.select_by_value("submit")
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//div[@class="dt-button button-csv button-htm15"]')))
CSVall = driver.find_element_by_xpath('//div[@class="dt-button button-csv button-htm15"]')
CSVall.click()
driver.close()
我尝试更改不同的内容,但总是出错。问题出在哪里 ?
此外,我想我应该添加一行来等待网站加载,因为它需要几秒钟; "driver.get" 之后。我不知道在这种情况下结束等待的预期条件应该是什么。
谢谢
你不需要点击每个播放器link而是将 URLs 保存为列表,也有几个错误,你可以看到下面的工作代码
from selenium import webdriver
import csv
from selenium.webdriver.support.ui import Select
from datetime import date, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
playerLinks = driver.find_elements_by_xpath("//table[@class='indreg dataTable no-footer DTFC_Cloned']//a")
playerLinks = [p.get_attribute('href') for p in playerLinks]
print(len(playerLinks))
for url in playerLinks:
driver.get(url)
select = Select(driver.find_element_by_name('rate'))
select.select_by_value("y")
select1 = Select(driver.find_element_by_name('v'))
select1.select_by_value("g")
driver.find_element_by_css_selector('input[type="submit"]').click()
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//a[@class="dt-button buttons-csv buttons-html5"][2]')))
CSVall = driver.find_element_by_xpath('//a[@class="dt-button buttons-csv buttons-html5"][2]')
CSVall.click()
driver.close()
您可以从第一页获取 playerId 并将它们连同表示 Rate 和 Game Log 选择的字符串连接到新 URL 的 queryString 部分,而不是继续点击选择。当然,您可以整理以下内容。
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def getPlayerId(url):
id = url.split('playerid=')[1]
id = id.split('&')[0]
return id
def makeNewURL(playerId):
return 'https://www.naturalstattrick.com/playerreport.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&stdoi=oi&rate=y&v=g&playerid=' + playerId
#chromedriver =("C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe")
driver = webdriver.Chrome()
driver.get("https://www.naturalstattrick.com/playerteams.php?fromseason=20142015&thruseason=20162017&stype=2&sit=all&score=all&stdoi=std&rate=y&team=ALL&pos=S&loc=B&toi=0.1&gpfilt=none&fd=&td=&tgp=410&lines=single")
links = driver.find_elements_by_css_selector('table.indreg.dataTable.no-footer.DTFC_Cloned [href*=playerid]')
newLinks = []
for link in links:
newLinks.append(link.get_attribute('href'))
for link in newLinks:
playerId = getPlayerId(link)
link = makeNewURL(playerId)
driver.get(link)
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//a[@class="dt-button buttons-csv buttons-html5"][2]')))
CSVall = driver.find_element_by_xpath('//a[@class="dt-button buttons-csv buttons-html5"][2]')
CSVall.click()