当 url 不变时,使用 Selenium 在多个页面上抓取 table

Using Selenium to scrape a table across multiple pages when the url doesn't change

我一直在尝试编写一个程序来从 www.whoscored.com 中抓取统计信息并创建一个 pandas 数据框。

我已经在 crookedleaf 的帮助下更新了代码,这是工作代码:

import time
import pandas as pd
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')

summary_stats = DataFrame()

while True:

    while driver.find_element_by_xpath('//*[@id="statistics-table-summary"]').get_attribute('class') == 'is-updating': # driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;' or
        time.sleep(1)

    table = driver.find_element_by_xpath('//*[@id="statistics-table-summary"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="currentPage"]').get_attribute('value')
    print('Page ' + page_number)
    df1 = read_html(table_html)[0]
    summary_stats = pd.concat([summary_stats, df1])
    next_link = driver.find_element_by_xpath('//*[@id="next"]')

    if 'disabled' in next_link.get_attribute('class'):
        break

    next_link.click()

print(summary_stats)

driver.close()

现在我正在尝试从其他选项卡收集统计信息。我真的很接近,但是代码在应该跳出循环时没有退出循环。这是下面的代码:

defensive_button = driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]/li[2]/a')
defensive_button.click()

defensive_stats = DataFrame()

while True:

    while driver.find_element_by_xpath('//*[@id="statistics-table-defensive"]').get_attribute('class') == 'is-updating': # driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;' or
        time.sleep(1)

    table = driver.find_element_by_xpath('//*[@id="statistics-table-defensive"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="statistics-paging-defensive"]/div/input[1]').get_attribute('value')
    print('Page ' + page_number)
    df2 = read_html(table_html)[0]
    defensive_stats = pd.concat([defensive_stats, df2])
    next_link = driver.find_element_by_xpath('//*[@id="statistics-paging-defensive"]/div/dl[2]/dd[3]')

    if 'disabled' in next_link.get_attribute('class'):
        break

    next_link.click()

print(defensive_stats)

此代码循环遍历所有页面,但随后一直循环遍历最后一页

您正在循环外定义 table 的代码。您正在导航到下一页,但并未重新定义 tabletable_html 元素。将它们移动到 while True

之后的第一行

编辑:对您的代码进行更改后,我猜是由于 table 的动态加载内容,您无法处理更改或无法获取内容,因为 "loading"图形叠加。另一件事是可能并不总是 30 页。例如今天有29,所以它不断从第29页获取数据。我修改你的代码保持运行ning直到"next"按钮不再启用,我等待在继续之前检查 table 是否正在加载:

import time
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver

driver = webdriver.Chrome(path-to-your-chromedriver)
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')

df = DataFrame()

while True:

    while driver.find_element_by_xpath('//*[@id="statistics-table-summary"]').get_attribute('class') == 'is-updating': # driver.find_element_by_xpath('//*[@id="statistics-table-summary-loading"]').get_attribute('style') == 'display; block;' or
        time.sleep(1)

    table = driver.find_element_by_xpath('//*[@id="statistics-table-summary"]')
    table_html = table.get_attribute('innerHTML')
    page_number = driver.find_element_by_xpath('//*[@id="currentPage"]').get_attribute('value')
    print('Page ' + page_number)
    df1 = read_html(table_html)[0]
    df.append(df1)
    next_link = driver.find_element_by_xpath('//*[@id="next"]')

    if 'disabled' in next_link.get_attribute('class'):
        break

    next_link.click()


print(df)

driver.close()

但是,我在 运行 结束时得到一个空的 DataFrame。不幸的是,我对 pandas 不够熟悉,无法确定问题所在,但它与 df.append() 有关。我 运行 通过它在每个循环中打印 df1 的值,并打印正确的数据,但是它不会将它添加到 DataFrame。这可能是您足够熟悉的东西,可以完全实施 运行 所需的更改。

编辑 2:我花了一段时间才弄明白这一点。本质上,页面的内容是使用 javascript 动态加载的。您声明的 'next' 元素仍然是您遇到的第一个 'next' 按钮。每次单击新选项卡时,'next' 元素的数量都会增加。我已经添加了一个编辑,可以成功地浏览所有选项卡('detailed' 选项卡除外……希望你不需要这个,哈哈)。然而,我仍然变得空虚 DataFrame()'s

import time
import pandas as pd
from pandas.io.html import read_html
from pandas import DataFrame
from selenium import webdriver

driver = webdriver.Chrome('/home/mdrouin/Downloads/chromedriver')
driver.get('https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/PlayerStatistics/England-Premier-League-2016-2017')

statistics = {  # this is a list of all the tabs on the page
    'summary': DataFrame(),
    'defensive': DataFrame(),
    'offensive': DataFrame(),
    'passing': DataFrame()
}

count = 0
tabs = driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]').find_elements_by_tag_name('li')  # this pulls all the tab elements
for tab in tabs[:-1]:  # iterate over the different tab sections
    section = tab.text.lower()
    driver.find_element_by_xpath('//*[@id="stage-top-player-stats-options"]').find_element_by_link_text(section.title()).click()  # clicks the actual tab by using the dictionary's key (.proper() makes the first character in the string uppercase)
    time.sleep(3)
    while True:
        while driver.find_element_by_xpath('//*[@id="statistics-table-%s"]' % section).get_attribute('class') == 'is-updating':  # string formatting on the xpath to change for each section that is iterated over
            time.sleep(1)

        table = driver.find_element_by_xpath('//*[@id="statistics-table-%s"]' % section)  # string formatting on the xpath to change for each section that is iterated over
        table_html = table.get_attribute('innerHTML')
        df = read_html(table_html)[0]
        # print df
        pd.concat([statistics[section], df])
        next_link = driver.find_elements_by_xpath('//*[@id="next"]')[count]  # makes sure it's selecting the correct index of 'next' items 
        if 'disabled' in next_link.get_attribute('class'):
            break
        time.sleep(5)
        next_link.click()
    count += 1


for df in statistics.values():  # iterates over the DataFrame() elemnts
    print df

driver.quit()