在 Pandas 中通过列表创建表格(DataFrame)

Creating a table through a list for Pandas

我正在尝试用 Pandas 把手头的数据转换成 DataFrame。我觉得这应该不是什么难事,但我就是弄不明白。我已经有了 DataFrame 所需的列名(headers),也有了数据,不过这些数据是从网页上抓取的。我知道需要先把它们整理成一个列表,再传给 DataFrame 函数,但我不知道该如何把它们放进列表。

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd


# Raw string so the backslashes in the Windows path are taken literally instead
# of being parsed as (invalid) escape sequences such as "\P", "\C" and "\c".
PATH = r"C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"
# PATH is handed to Chrome positionally as the chromedriver location.
# NOTE(review): newer Selenium releases expect a Service object here instead of
# a positional path -- confirm against the installed Selenium version.
driver = webdriver.Chrome(PATH)

# Open the ESPN leaderboard page for the tournament being scraped.
driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

# How many matched elements to act on, and which round to pick in each dropdown.
number_of_players = 52
round_to_select = 3

# Click the first `number_of_players` elements matching the first-cell selector,
# pausing half a second after each click.
first_cells = driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')
for down_arrow in first_cells[:number_of_players]:
    down_arrow.click()
    time.sleep(.5)

# The round dropdowns are only changed when the requested round is below 4.
if round_to_select < 4:
    round_menus = driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')
    for menu in round_menus[:number_of_players]:
        Select(menu).select_by_visible_text(f'Round {round_to_select}')
        time.sleep(.5)

# Snapshot the page as currently rendered in the browser and parse it.
R1_page_source = driver.page_source

R1_soup = BeautifulSoup(R1_page_source, 'html.parser')

R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')

# Tournament title and course are looked up on the whole page, not on a row,
# so they are fetched once instead of once per player row.
R1_Tournament = R1_soup.find('h1' , class_ = 'headline headline__h1 Leaderboard__Event__Title').text
R1_Course = R1_soup.find('div' , class_ = 'Leaderboard__Course__Location n8 clr-gray-04').text

# Positions in a detail row's <span> list that are read as holes 1-18.
# Index 10 is skipped, exactly as before.
# NOTE(review): presumably span 10 is the front-nine total -- confirm on the page.
R1_HOLE_SPAN_INDEXES = list(range(1, 10)) + list(range(11, 20))

for R1_player in R1_leaderboard.find_all('tbody'):
    R1_rows = R1_player.find_all('tr' , class_ = 'Table__TD--PlayerDetail Table__TR Table__even')
    for R1_row in R1_rows:
        R1_Players = R1_row.find('a').text
        R1_Round = R1_row.find_all("select")[1].text
        # Collect the spans once per row rather than re-searching for every hole.
        R1_spans = R1_row.find_all('span')
        R1_holes = [R1_spans[i].text for i in R1_HOLE_SPAN_INDEXES]
        # Same output as before: player, tournament, course, round, then holes 1-18.
        print(R1_Players, R1_Tournament, R1_Course, R1_Round, *R1_holes)

下面是您代码的修改版本。我使用 pandas.read_html 将 HTML 表格转换为 DataFrame。

import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Driver setup for running in Colab: Chrome is started with the headless,
# no-sandbox and disable-dev-shm-usage flags.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)

# Open the ESPN leaderboard page for the tournament being scraped.
driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

# How many matched elements to act on, and which round to pick in each dropdown.
number_of_players = 52
round_to_select = 3

# Clicking is best-effort: down_arrow.click() kept raising errors for some
# elements, so a click Selenium rejects is skipped and the loop moves on.
# Only Selenium's own errors are caught, so Ctrl-C and real bugs still surface.
first_cells = driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')
for down_arrow in first_cells[:number_of_players]:
    try:
        down_arrow.click()
    except WebDriverException:
        continue
    # Pause only after a successful click, as before.
    time.sleep(.5)

# The round dropdowns are only changed when the requested round is below 4.
if round_to_select < 4:
    round_menus = driver.find_elements(By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')
    for menu in round_menus[:number_of_players]:
        try:
            Select(menu).select_by_visible_text(f'Round {round_to_select}')
        except WebDriverException:
            # Same best-effort policy: leave this dropdown untouched and move on.
            continue
        time.sleep(.5)

# Snapshot the page as currently rendered in the browser and locate the
# leaderboard table in it.
R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table' , class_ = 'Table Table--align-right Full__Table')
# read_html returns a list of DataFrames; only one <table> is passed in, so the
# first entry is taken. The markup is wrapped in StringIO because passing a
# literal HTML string to read_html is deprecated in recent pandas releases.
R1_df = pd.read_html(StringIO(R1_leaderboard.prettify()))[0]

prettify 会把 bs4 对象转换成 pandas 可以处理的普通字符串。read_html 实际上返回的是一个 DataFrame 列表——不过在这个例子里列表中只有一个 DataFrame,所以我在最后加了 [0]。运行后查看 R1_df,得到如下结果:

    Unnamed: 0  POS     PLAYER  SCORE   R1  R2  R3  R4  TOT     EARNINGS    FEDEX PTS
0   NaN     1   Scottie Scheffler   -10     69  67  71  71  278     $2,700,000  600
1   NaN     2   Rory McIlroy    -7  73  73  71  64  281     $1,620,000  330
2   NaN     T3  Shane Lowry     -5  73  68  73  69  283     $870,000    180
3   NaN     T3  Cameron Smith   -5  68  74  68  73  283     $870,000    180
4   NaN     5   Collin Morikawa     -4  73  70  74  67  284     $600,000    120
...     ...     ...     ...     ...     ...     ...     ...     ...     ...     ...     ...
86  NaN     -   Stewart Hagestad (a)    CUT     79  81  --  --  160     --  0
87  NaN     -   José María Olazábal     CUT     77  84  --  --  161     --  0
88  NaN     -   Laird Shepherd (a)  CUT     81  85  --  --  166     --  0
89  NaN     -   Louis Oosthuizen    WD  76  --  --  --  76  --  0
90  NaN     -   Paul Casey  WD  --  --  --  --  --  --  0

希望这就是您要找的!因为 pandas 可以直接处理 html,所以实际上不需要制作任何中间列表。

如前所述,您可以让 Selenium 逐个点击展开每个表格,然后用 pandas 的 .read_html() 来解析表格。不过,ESPN 提供了一个 API;只要有 API 可用,通过 API 获取数据就比使用 Selenium 好得多(更健壮,也更高效),而且能拿到的数据也远多于网站上显示的内容。基本上,只要您有赛事的编号/ID,就可以把它传进去,从排行榜中获取各球员的 ID,然后逐个遍历:

import requests
import pandas as pd


# Event id of the tournament to download from the ESPN API.
tournamentId = '401353232'

url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/leaderboard'
payload = {
    'league': 'pga',
    'region': 'us',
    'lang': 'en',
    'event': str(tournamentId)}
# The timeout stops the script from hanging forever if the API never answers.
jsonData = requests.get(url, params=payload, timeout=30).json()
tournament = jsonData['events'][0]['name']

# Map each course id (as a string) to its name, for looking up the course of
# each round later on.
courses_dict = {
    str(course['id']): course['name']
    for event in jsonData['events']
    for course in event['courses']}

print(tournament)

# Query parameters sent with every per-player competitor-summary request.
payload = {
    'region': 'us',
    'lang': 'en',
    'season': '2022'}

# Seconds to wait for each HTTP response before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# One small frame is built per (player, round) and all of them are concatenated
# once at the end; calling pd.concat inside the loop would re-copy every row
# gathered so far on each iteration.
round_frames = []
for event in jsonData['events']:
    for competition in event['competitions']:
        for competitor in competition['competitors']:
            playerName = competitor['athlete']['displayName']
            playerId = competitor['athlete']['id']
            status = competitor['status']

            lastRound = status['period']
            lastStatus = status['type']['detail']

            url = f'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/{tournamentId}/competitorsummary/{playerId}'
            rounds = requests.get(url, params=payload, timeout=REQUEST_TIMEOUT).json()['rounds']

            for r in rounds:
                courseName = courses_dict[str(r['courseId'])]
                teeTime = r['teeTime']
                period = r['period']

                # Rounds numbered below the period in the competitor's status
                # are labelled 'Finish'; any other round gets the status detail.
                roundStatus = 'Finish' if period < lastRound else lastStatus

                linescores = r['linescores']
                if not linescores:
                    # No per-hole entries for this round: keep a single row
                    # that only identifies the player, tournament and round.
                    temp_df = pd.DataFrame([{'player': playerName,
                                             'tournament': tournament,
                                             'round': period}])
                else:
                    temp_df = pd.DataFrame(linescores)
                    temp_df['player'] = playerName
                    temp_df['tournament'] = tournament
                    temp_df['round'] = period

                    # Reshape to one row per round, with one column per
                    # linescore 'period' holding that entry's 'value'.
                    temp_df = temp_df.pivot(
                        index=['player','tournament','round'],
                        columns='period',
                        values='value').reset_index(drop=False)

                temp_df['course'] = courseName
                temp_df['teeTime'] = teeTime
                temp_df['status'] = roundStatus

                round_frames.append(temp_df)
            print('Collected: ', playerName)

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing was collected.
if round_frames:
    results = pd.concat(round_frames, axis=0).reset_index(drop=True)
else:
    results = pd.DataFrame()

输出:

# Print the first ten collected rows; to_string() renders every column rather
# than the truncated default display.
print(results.head(10).to_string())
period             player          tournament                      course  round    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18
0       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      1  4.0  5.0  4.0  3.0  4.0  3.0  4.0  4.0  3.0  4.0  4.0  2.0  5.0  4.0  5.0  3.0  3.0  5.0
1       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      2  5.0  4.0  5.0  3.0  4.0  3.0  3.0  4.0  4.0  4.0  4.0  2.0  4.0  4.0  4.0  2.0  4.0  4.0
2       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      3  4.0  4.0  3.0  4.0  4.0  2.0  4.0  4.0  4.0  4.0  4.0  4.0  4.0  5.0  6.0  3.0  3.0  5.0
3       Scottie Scheffler  Masters Tournament  Augusta National Golf Club      4  4.0  5.0  3.0  3.0  4.0  3.0  3.0  5.0  4.0  5.0  4.0  3.0  5.0  3.0  4.0  3.0  4.0  6.0
4            Rory McIlroy  Masters Tournament  Augusta National Golf Club      1  4.0  4.0  4.0  3.0  4.0  4.0  4.0  5.0  4.0  4.0  3.0  3.0  5.0  5.0  5.0  4.0  4.0  4.0
5            Rory McIlroy  Masters Tournament  Augusta National Golf Club      2  4.0  4.0  4.0  3.0  5.0  3.0  4.0  5.0  4.0  5.0  6.0  3.0  4.0  4.0  5.0  2.0  4.0  4.0
6            Rory McIlroy  Masters Tournament  Augusta National Golf Club      3  5.0  5.0  4.0  2.0  4.0  4.0  3.0  5.0  4.0  4.0  4.0  4.0  4.0  4.0  4.0  3.0  4.0  4.0
7            Rory McIlroy  Masters Tournament  Augusta National Golf Club      4  3.0  5.0  3.0  3.0  4.0  3.0  3.0  4.0  4.0  3.0  4.0  3.0  3.0  4.0  5.0  3.0  4.0  3.0
8             Shane Lowry  Masters Tournament  Augusta National Golf Club      1  4.0  5.0  5.0  3.0  4.0  3.0  4.0  5.0  4.0  5.0  4.0  3.0  3.0  3.0  7.0  3.0  4.0  4.0
9             Shane Lowry  Masters Tournament  Augusta National Golf Club      2  5.0  4.0  4.0  3.0  4.0  3.0  3.0  5.0  4.0  3.0  4.0  3.0  4.0  4.0  4.0  3.0  4.0  4.0
....