使用 Pandas 通过列表创建表格
Creating a table through a list for Pandas
我正在尝试使用 Pandas 把手头的数据转换成数据框(DataFrame)。我觉得这本不该是一件难事,但我就是弄不明白。我已经有了数据框所需的表头(headers),也有了数据,不过这些数据是从网页上抓取的。我知道需要先把数据放进一个列表,再把该列表传给 DataFrame 函数,但我不知道该如何把数据放进列表里。
import time

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Raw string so the backslashes in the Windows path are never read as escapes.
PATH = r"C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"

number_of_players = 52
round_to_select = 3

# Selenium 4 takes the chromedriver location through a Service object; the
# positional executable-path argument was removed in Selenium 4.10.
driver = webdriver.Chrome(service=Service(PATH))
try:
    driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

    # Click the first cell of the first `number_of_players` rows
    # (presumably the arrow that expands a player's scorecard).
    first_cells = driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')
    for down_arrow in first_cells[:number_of_players]:
        down_arrow.click()
        time.sleep(.5)  # short pause between clicks, presumably to let the page update

    # Only switch the drop-downs when a round below 4 is requested.
    if round_to_select < 4:
        menus = driver.find_elements(
            By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')
        for menu in menus[:number_of_players]:
            Select(menu).select_by_visible_text(f'Round {round_to_select}')
            time.sleep(.5)

    R1_page_source = driver.page_source
finally:
    # Always close the browser, even when a click or selection fails.
    driver.quit()

R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table', class_='Table Table--align-right Full__Table')

# Tournament title and course are looked up on the whole page, so they are the
# same for every row; fetch them once instead of once per row.
R1_Tournament = R1_soup.find(
    'h1', class_='headline headline__h1 Leaderboard__Event__Title').text
R1_Course = R1_soup.find(
    'div', class_='Leaderboard__Course__Location n8 clr-gray-04').text

# <span> positions read for holes 1-9 and 10-18. Index 10 is skipped, exactly
# as before (presumably a subtotal cell -- verify against the page).
HOLE_SPAN_INDEXES = (*range(1, 10), *range(11, 20))

for R1_player in R1_leaderboard.find_all('tbody'):
    R1_rows = R1_player.find_all(
        'tr', class_='Table__TD--PlayerDetail Table__TR Table__even')
    for R1_row in R1_rows:
        R1_Players = R1_row.find('a').text
        R1_Round = R1_row.find_all("select")[1].text
        # Collect the spans once per row rather than once per hole.
        R1_spans = R1_row.find_all('span')
        R1_holes = [R1_spans[i].text for i in HOLE_SPAN_INDEXES]
        print(R1_Players, R1_Tournament, R1_Course, R1_Round, *R1_holes)
这是您的代码的修改版本。我使用 pandas.read_html 将 HTML 表格(table)转换为数据框。
from io import StringIO
import time

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

number_of_players = 52
round_to_select = 3

# Headless Chrome set-up (the author ran this in Colab, hence these flags).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

    # Clicking is best-effort: the author kept getting errors on
    # down_arrow.click, so Selenium failures are skipped. Only Selenium's own
    # errors are caught, so Ctrl-C and real bugs still surface.
    first_cells = driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')
    for down_arrow in first_cells[:number_of_players]:
        try:
            down_arrow.click()
            time.sleep(.5)
        except WebDriverException:
            continue

    # Only switch the drop-downs when a round below 4 is requested.
    if round_to_select < 4:
        menus = driver.find_elements(
            By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')
        for menu in menus[:number_of_players]:
            try:
                Select(menu).select_by_visible_text(f'Round {round_to_select}')
                time.sleep(.5)
            except WebDriverException:
                continue

    R1_page_source = driver.page_source
finally:
    # Always release the browser process, even if navigation fails.
    driver.quit()

R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table', class_='Table Table--align-right Full__Table')
# read_html returns a list of DataFrames; the single <table> passed in yields
# one entry. Passing literal HTML as a plain string is deprecated in recent
# pandas, so it is wrapped in StringIO.
R1_df = pd.read_html(StringIO(R1_leaderboard.prettify()))[0]
prettify 将 bs4 对象转换成 pandas 可以处理的普通字符串。read_html 实际上返回的是一个数据框列表——但在本例中列表里只有一个数据框,这就是我在末尾加上 [0] 的原因。运行 R1_df 后,我得到如下结果:
Unnamed: 0 POS PLAYER SCORE R1 R2 R3 R4 TOT EARNINGS FEDEX PTS
0 NaN 1 Scottie Scheffler -10 69 67 71 71 278 $2,700,000 600
1 NaN 2 Rory McIlroy -7 73 73 71 64 281 $1,620,000 330
2 NaN T3 Shane Lowry -5 73 68 73 69 283 $870,000 180
3 NaN T3 Cameron Smith -5 68 74 68 73 283 $870,000 180
4 NaN 5 Collin Morikawa -4 73 70 74 67 284 $600,000 120
... ... ... ... ... ... ... ... ... ... ... ...
86 NaN - Stewart Hagestad (a) CUT 79 81 -- -- 160 -- 0
87 NaN - José María Olazábal CUT 77 84 -- -- 161 -- 0
88 NaN - Laird Shepherd (a) CUT 81 85 -- -- 166 -- 0
89 NaN - Louis Oosthuizen WD 76 -- -- -- 76 -- 0
90 NaN - Paul Casey WD -- -- -- -- -- -- 0
希望这就是您要找的!因为 pandas 可以直接处理 html,所以实际上不需要制作任何中间列表。
如前所述,您可以让 Selenium 逐个点击展开每张表格,然后使用 pandas 的 .read_html() 来解析表格。不过,ESPN 提供了一个 API;只要有 API 可用,通过它获取数据就比使用 Selenium 好得多(更健壮,也更高效),而且能拿到的数据也远比网站上显示的多。基本上,只要有锦标赛的编号/ID,就可以把它传入接口,从排行榜中取得各球员的 ID,然后逐个遍历这些 ID:
import pandas as pd
import requests

# requests has no default timeout; without one a stalled connection hangs forever.
REQUEST_TIMEOUT = 30  # seconds

tournamentId = '401353232'
url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/leaderboard'
payload = {
    'league': 'pga',
    'region': 'us',
    'lang': 'en',
    'event': tournamentId,
}

# One frame per (player, round); concatenated once at the end instead of
# re-copying the growing result on every iteration.
round_frames = []

# A single Session reuses the connection across the per-player requests.
with requests.Session() as session:
    response = session.get(url, params=payload, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail loudly on an HTTP error
    jsonData = response.json()

    tournament = jsonData['events'][0]['name']
    # Map course id (as a string) -> course name for every course in the response.
    courses_dict = {
        str(course['id']): course['name']
        for event in jsonData['events']
        for course in event['courses']
    }
    print(tournament)

    payload = {
        'region': 'us',
        'lang': 'en',
        'season': '2022',
    }

    for event in jsonData['events']:
        for competition in event['competitions']:
            for competitor in competition['competitors']:
                playerName = competitor['athlete']['displayName']
                playerId = competitor['athlete']['id']

                status = competitor['status']
                lastRound = status['period']
                lastStatus = status['type']['detail']

                url = f'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/{tournamentId}/competitorsummary/{playerId}'
                response = session.get(url, params=payload, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                rounds = response.json()['rounds']

                for r in rounds:
                    period = r['period']
                    # Rounds before the competitor's latest one are labelled
                    # 'Finish'; the latest round keeps the reported status.
                    roundStatus = 'Finish' if period < lastRound else lastStatus

                    linescores = r['linescores']
                    if not linescores:
                        # No per-hole data: keep a bare identifying row.
                        temp_df = pd.DataFrame([{
                            'player': playerName,
                            'tournament': tournament,
                            'round': period,
                        }])
                    else:
                        # One record per linescore entry; pivot so each
                        # 'period' (presumably the hole number) becomes a
                        # column holding its 'value'.
                        temp_df = pd.DataFrame(linescores)
                        temp_df['player'] = playerName
                        temp_df['tournament'] = tournament
                        temp_df['round'] = period
                        temp_df = temp_df.pivot(
                            index=['player', 'tournament', 'round'],
                            columns='period',
                            values='value').reset_index(drop=False)

                    temp_df['course'] = courses_dict[str(r['courseId'])]
                    temp_df['teeTime'] = r['teeTime']
                    temp_df['status'] = roundStatus
                    round_frames.append(temp_df)
                print('Collected: ', playerName)

# pd.concat raises on an empty list, so fall back to an empty frame.
if round_frames:
    results = pd.concat(round_frames, axis=0).reset_index(drop=True)
else:
    results = pd.DataFrame()
输出:
print(results.head(10).to_string())
period player tournament course round 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
0 Scottie Scheffler Masters Tournament Augusta National Golf Club 1 4.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 3.0 4.0 4.0 2.0 5.0 4.0 5.0 3.0 3.0 5.0
1 Scottie Scheffler Masters Tournament Augusta National Golf Club 2 5.0 4.0 5.0 3.0 4.0 3.0 3.0 4.0 4.0 4.0 4.0 2.0 4.0 4.0 4.0 2.0 4.0 4.0
2 Scottie Scheffler Masters Tournament Augusta National Golf Club 3 4.0 4.0 3.0 4.0 4.0 2.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 5.0 6.0 3.0 3.0 5.0
3 Scottie Scheffler Masters Tournament Augusta National Golf Club 4 4.0 5.0 3.0 3.0 4.0 3.0 3.0 5.0 4.0 5.0 4.0 3.0 5.0 3.0 4.0 3.0 4.0 6.0
4 Rory McIlroy Masters Tournament Augusta National Golf Club 1 4.0 4.0 4.0 3.0 4.0 4.0 4.0 5.0 4.0 4.0 3.0 3.0 5.0 5.0 5.0 4.0 4.0 4.0
5 Rory McIlroy Masters Tournament Augusta National Golf Club 2 4.0 4.0 4.0 3.0 5.0 3.0 4.0 5.0 4.0 5.0 6.0 3.0 4.0 4.0 5.0 2.0 4.0 4.0
6 Rory McIlroy Masters Tournament Augusta National Golf Club 3 5.0 5.0 4.0 2.0 4.0 4.0 3.0 5.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.0 4.0 4.0
7 Rory McIlroy Masters Tournament Augusta National Golf Club 4 3.0 5.0 3.0 3.0 4.0 3.0 3.0 4.0 4.0 3.0 4.0 3.0 3.0 4.0 5.0 3.0 4.0 3.0
8 Shane Lowry Masters Tournament Augusta National Golf Club 1 4.0 5.0 5.0 3.0 4.0 3.0 4.0 5.0 4.0 5.0 4.0 3.0 3.0 3.0 7.0 3.0 4.0 4.0
9 Shane Lowry Masters Tournament Augusta National Golf Club 2 5.0 4.0 4.0 3.0 4.0 3.0 3.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 4.0 3.0 4.0 4.0
....
我正在尝试使用 Pandas 把手头的数据转换成数据框(DataFrame)。我觉得这本不该是一件难事,但我就是弄不明白。我已经有了数据框所需的表头(headers),也有了数据,不过这些数据是从网页上抓取的。我知道需要先把数据放进一个列表,再把该列表传给 DataFrame 函数,但我不知道该如何把数据放进列表里。
import time

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Raw string so the backslashes in the Windows path are never read as escapes.
PATH = r"C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"

number_of_players = 52
round_to_select = 3

# Selenium 4 takes the chromedriver location through a Service object; the
# positional executable-path argument was removed in Selenium 4.10.
driver = webdriver.Chrome(service=Service(PATH))
try:
    driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

    # Click the first cell of the first `number_of_players` rows
    # (presumably the arrow that expands a player's scorecard).
    first_cells = driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')
    for down_arrow in first_cells[:number_of_players]:
        down_arrow.click()
        time.sleep(.5)  # short pause between clicks, presumably to let the page update

    # Only switch the drop-downs when a round below 4 is requested.
    if round_to_select < 4:
        menus = driver.find_elements(
            By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')
        for menu in menus[:number_of_players]:
            Select(menu).select_by_visible_text(f'Round {round_to_select}')
            time.sleep(.5)

    R1_page_source = driver.page_source
finally:
    # Always close the browser, even when a click or selection fails.
    driver.quit()

R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table', class_='Table Table--align-right Full__Table')

# Tournament title and course are looked up on the whole page, so they are the
# same for every row; fetch them once instead of once per row.
R1_Tournament = R1_soup.find(
    'h1', class_='headline headline__h1 Leaderboard__Event__Title').text
R1_Course = R1_soup.find(
    'div', class_='Leaderboard__Course__Location n8 clr-gray-04').text

# <span> positions read for holes 1-9 and 10-18. Index 10 is skipped, exactly
# as before (presumably a subtotal cell -- verify against the page).
HOLE_SPAN_INDEXES = (*range(1, 10), *range(11, 20))

for R1_player in R1_leaderboard.find_all('tbody'):
    R1_rows = R1_player.find_all(
        'tr', class_='Table__TD--PlayerDetail Table__TR Table__even')
    for R1_row in R1_rows:
        R1_Players = R1_row.find('a').text
        R1_Round = R1_row.find_all("select")[1].text
        # Collect the spans once per row rather than once per hole.
        R1_spans = R1_row.find_all('span')
        R1_holes = [R1_spans[i].text for i in HOLE_SPAN_INDEXES]
        print(R1_Players, R1_Tournament, R1_Course, R1_Round, *R1_holes)
这是您的代码的修改版本。我使用 pandas.read_html 将 HTML 表格(table)转换为数据框。
from io import StringIO
import time

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

number_of_players = 52
round_to_select = 3

# Headless Chrome set-up (the author ran this in Colab, hence these flags).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

    # Clicking is best-effort: the author kept getting errors on
    # down_arrow.click, so Selenium failures are skipped. Only Selenium's own
    # errors are caught, so Ctrl-C and real bugs still surface.
    first_cells = driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')
    for down_arrow in first_cells[:number_of_players]:
        try:
            down_arrow.click()
            time.sleep(.5)
        except WebDriverException:
            continue

    # Only switch the drop-downs when a round below 4 is requested.
    if round_to_select < 4:
        menus = driver.find_elements(
            By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')
        for menu in menus[:number_of_players]:
            try:
                Select(menu).select_by_visible_text(f'Round {round_to_select}')
                time.sleep(.5)
            except WebDriverException:
                continue

    R1_page_source = driver.page_source
finally:
    # Always release the browser process, even if navigation fails.
    driver.quit()

R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table', class_='Table Table--align-right Full__Table')
# read_html returns a list of DataFrames; the single <table> passed in yields
# one entry. Passing literal HTML as a plain string is deprecated in recent
# pandas, so it is wrapped in StringIO.
R1_df = pd.read_html(StringIO(R1_leaderboard.prettify()))[0]
prettify 将 bs4 对象转换成 pandas 可以处理的普通字符串。read_html 实际上返回的是一个数据框列表——但在本例中列表里只有一个数据框,这就是我在末尾加上 [0] 的原因。运行 R1_df 后,我得到如下结果:
Unnamed: 0 POS PLAYER SCORE R1 R2 R3 R4 TOT EARNINGS FEDEX PTS
0 NaN 1 Scottie Scheffler -10 69 67 71 71 278 $2,700,000 600
1 NaN 2 Rory McIlroy -7 73 73 71 64 281 $1,620,000 330
2 NaN T3 Shane Lowry -5 73 68 73 69 283 $870,000 180
3 NaN T3 Cameron Smith -5 68 74 68 73 283 $870,000 180
4 NaN 5 Collin Morikawa -4 73 70 74 67 284 $600,000 120
... ... ... ... ... ... ... ... ... ... ... ...
86 NaN - Stewart Hagestad (a) CUT 79 81 -- -- 160 -- 0
87 NaN - José María Olazábal CUT 77 84 -- -- 161 -- 0
88 NaN - Laird Shepherd (a) CUT 81 85 -- -- 166 -- 0
89 NaN - Louis Oosthuizen WD 76 -- -- -- 76 -- 0
90 NaN - Paul Casey WD -- -- -- -- -- -- 0
希望这就是您要找的!因为 pandas 可以直接处理 html,所以实际上不需要制作任何中间列表。
如前所述,您可以让 Selenium 逐个点击展开每张表格,然后使用 pandas 的 .read_html() 来解析表格。不过,ESPN 提供了一个 API;只要有 API 可用,通过它获取数据就比使用 Selenium 好得多(更健壮,也更高效),而且能拿到的数据也远比网站上显示的多。基本上,只要有锦标赛的编号/ID,就可以把它传入接口,从排行榜中取得各球员的 ID,然后逐个遍历这些 ID:
import pandas as pd
import requests

# requests has no default timeout; without one a stalled connection hangs forever.
REQUEST_TIMEOUT = 30  # seconds

tournamentId = '401353232'
url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/leaderboard'
payload = {
    'league': 'pga',
    'region': 'us',
    'lang': 'en',
    'event': tournamentId,
}

# One frame per (player, round); concatenated once at the end instead of
# re-copying the growing result on every iteration.
round_frames = []

# A single Session reuses the connection across the per-player requests.
with requests.Session() as session:
    response = session.get(url, params=payload, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail loudly on an HTTP error
    jsonData = response.json()

    tournament = jsonData['events'][0]['name']
    # Map course id (as a string) -> course name for every course in the response.
    courses_dict = {
        str(course['id']): course['name']
        for event in jsonData['events']
        for course in event['courses']
    }
    print(tournament)

    payload = {
        'region': 'us',
        'lang': 'en',
        'season': '2022',
    }

    for event in jsonData['events']:
        for competition in event['competitions']:
            for competitor in competition['competitors']:
                playerName = competitor['athlete']['displayName']
                playerId = competitor['athlete']['id']

                status = competitor['status']
                lastRound = status['period']
                lastStatus = status['type']['detail']

                url = f'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/{tournamentId}/competitorsummary/{playerId}'
                response = session.get(url, params=payload, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                rounds = response.json()['rounds']

                for r in rounds:
                    period = r['period']
                    # Rounds before the competitor's latest one are labelled
                    # 'Finish'; the latest round keeps the reported status.
                    roundStatus = 'Finish' if period < lastRound else lastStatus

                    linescores = r['linescores']
                    if not linescores:
                        # No per-hole data: keep a bare identifying row.
                        temp_df = pd.DataFrame([{
                            'player': playerName,
                            'tournament': tournament,
                            'round': period,
                        }])
                    else:
                        # One record per linescore entry; pivot so each
                        # 'period' (presumably the hole number) becomes a
                        # column holding its 'value'.
                        temp_df = pd.DataFrame(linescores)
                        temp_df['player'] = playerName
                        temp_df['tournament'] = tournament
                        temp_df['round'] = period
                        temp_df = temp_df.pivot(
                            index=['player', 'tournament', 'round'],
                            columns='period',
                            values='value').reset_index(drop=False)

                    temp_df['course'] = courses_dict[str(r['courseId'])]
                    temp_df['teeTime'] = r['teeTime']
                    temp_df['status'] = roundStatus
                    round_frames.append(temp_df)
                print('Collected: ', playerName)

# pd.concat raises on an empty list, so fall back to an empty frame.
if round_frames:
    results = pd.concat(round_frames, axis=0).reset_index(drop=True)
else:
    results = pd.DataFrame()
输出:
print(results.head(10).to_string())
period player tournament course round 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
0 Scottie Scheffler Masters Tournament Augusta National Golf Club 1 4.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 3.0 4.0 4.0 2.0 5.0 4.0 5.0 3.0 3.0 5.0
1 Scottie Scheffler Masters Tournament Augusta National Golf Club 2 5.0 4.0 5.0 3.0 4.0 3.0 3.0 4.0 4.0 4.0 4.0 2.0 4.0 4.0 4.0 2.0 4.0 4.0
2 Scottie Scheffler Masters Tournament Augusta National Golf Club 3 4.0 4.0 3.0 4.0 4.0 2.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 5.0 6.0 3.0 3.0 5.0
3 Scottie Scheffler Masters Tournament Augusta National Golf Club 4 4.0 5.0 3.0 3.0 4.0 3.0 3.0 5.0 4.0 5.0 4.0 3.0 5.0 3.0 4.0 3.0 4.0 6.0
4 Rory McIlroy Masters Tournament Augusta National Golf Club 1 4.0 4.0 4.0 3.0 4.0 4.0 4.0 5.0 4.0 4.0 3.0 3.0 5.0 5.0 5.0 4.0 4.0 4.0
5 Rory McIlroy Masters Tournament Augusta National Golf Club 2 4.0 4.0 4.0 3.0 5.0 3.0 4.0 5.0 4.0 5.0 6.0 3.0 4.0 4.0 5.0 2.0 4.0 4.0
6 Rory McIlroy Masters Tournament Augusta National Golf Club 3 5.0 5.0 4.0 2.0 4.0 4.0 3.0 5.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.0 4.0 4.0
7 Rory McIlroy Masters Tournament Augusta National Golf Club 4 3.0 5.0 3.0 3.0 4.0 3.0 3.0 4.0 4.0 3.0 4.0 3.0 3.0 4.0 5.0 3.0 4.0 3.0
8 Shane Lowry Masters Tournament Augusta National Golf Club 1 4.0 5.0 5.0 3.0 4.0 3.0 4.0 5.0 4.0 5.0 4.0 3.0 3.0 3.0 7.0 3.0 4.0 4.0
9 Shane Lowry Masters Tournament Augusta National Golf Club 2 5.0 4.0 4.0 3.0 4.0 3.0 3.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 4.0 3.0 4.0 4.0
....