Beautiful Soup web crawler: Trying to filter specific rows I want to parse
I built a web crawler, and here is an example of one of the pages it crawls:
https://www.baseball-reference.com/register/player.fcgi?id=buckle002jos
I only want the rows that contain 'NCAA', 'NAIA', or 'NWDS'. Right now the code below grabs every row on the page, and my attempt at filtering them doesn't quite work.
Here is the crawler code:
''' Crawling Function '''
# Must enter the 'id' number that is at the end of a team's baseball
# reference page link in order to use this function
def crawl(_id):
    bbref = 'https://www.baseball-reference.com/register/team.cgi?id='
    html = sewp(bbref + _id)
    href_tags = html.find_all(href=True)
    href_tags = list(href_tags)
    hrefs = [tag.get('href') for tag in href_tags]
    # Append relevant links to list
    player_links = []
    for href in hrefs:
        if '/register/player.fcgi?id=' in href:
            player_links.append(href)
    # Finish by returning the player data as a list
    player_data = []
    for link in player_links:
        player_data.append(find_data('https://www.baseball-reference.com' + link))
    for df in player_data:
        numeric(df)
    return player_data
Here is the code for the find_data function, which the crawler calls and where I am trying to filter the parsed rows:
def find_data(url):
    page = requests.get(url, headers=headers)
    text = soup(page.text, features='lxml')
    row = text.find_all('tr')
    ''' Attempting to parse row, but only if NCAA, NAIA, or NWDS in row '''
    data = []
    for r in row:
        if 'NCAA' in r or 'NWDS' in r or 'NAIA' in r:
            data.append(parse_row(r))
    # data = [parse_row(rows) for rows in row]
    df = pd.DataFrame(data)
    return df
Other functions:
def parse_row(rows):
    return [str(x.string) for x in rows.find_all('td')]

def numeric(frame):
    for i in frame.columns:
        try:
            frame[[i]] = frame[[i]].astype(float)
        except:
            pass

def sewp(url):
    r = requests.get(url, headers=headers)
    html = soup(r.text, features='lxml')
    return html
And below is the function I use to run the crawl:
# dict of team ids for the desired league by year
nwl_team_id_dict_21 = {'kokomo': '08eb649f', 'pit-spitters': '6f0d2cd3', 'kingfish': '79c106fa',
                       'rivets': 'f61e0ce3', 'bombers': '3cbb765d', 'growlers': 'b0a9f9bc', 'spiders': '355b5892',
                       'mallards': '8eaa34fb', 'chinooks': '2ffd9848', 'booyah': '604e6d45', 'bucks': '2b013943',
                       'huskies': '95a9931c', 'loggers': '896f45b5', 'express': 'ee0f0409', 'mud-puppies': 'f46a2140',
                       'rox': '0b75a745', 'moondogs': '7027a89b', 'woodchucks': 'f22e21d1', 'rafters': '8f0328cd',
                       'stingers': '1884dcb1', 'honkers': '0b4021dc', 'larks': '5bed69fc'}

def get_league(league_dict):
    all_players = []
    for team in league_dict.values():
        all_players.append(crawl(team))
    return all_players
The problem is that you check

'NCAA' in ["<a>NCAA</a>", "<span>OTHER</span>"]

which gives False. Membership on a list compares 'NCAA' == "<a>NCAA</a>" against each element (which gives False), instead of doing the substring test 'NCAA' in "<a>NCAA</a>" (which gives True).

You have to run r = parse_row(r) first; then the check becomes

'NCAA' in ["NCAA", "OTHER"]

and that works.
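A quick illustration of the difference, using the same hypothetical cell values as above:

# Membership on a list compares whole elements, so the raw tag strings never match
'NCAA' in ["<a>NCAA</a>", "<span>OTHER</span>"]    # False
# Membership on a single string is a substring search
'NCAA' in "<a>NCAA</a>"                            # True
# After parse_row() the row is a list of plain cell texts, so a whole-element match succeeds
'NCAA' in ["NCAA", "OTHER"]                        # True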
Minimal working code:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd

''' Crawling Function '''

headers = {}

def find_data(url):
    print(url)
    page = requests.get(url, headers=headers)
    text = soup(page.text, features='lxml')
    row = text.find_all('tr')
    ''' Attempting to parse row, but only if NCAA, NAIA, or NWDS in row '''
    data = []
    for r in row:
        r = parse_row(r)
        #print(r)
        if 'NCAA' in r or 'NWDS' in r or 'NAIA' in r:
            data.append(r)
    df = pd.DataFrame(data)
    return df

def parse_row(rows):
    return [x.string for x in rows.find_all('td')]
    #return [x.get_text() for x in rows.find_all('td')]

find_data('https://www.baseball-reference.com/register/player.fcgi?id=buckle002jos')
Result:
0 1 2 3 4 5 6 ... 22 23 24 25 26 27 28
0 19 -2.2 Grand Canyon WAC NCAA None 4 ... .000 0 0 0 1 0 None
1 20 -1.8 Grand Canyon WAC NCAA None 32 ... .448 15 None 3 5 1 None
2 20 -0.5 Kokomo NWDS Smr None 35 ... .512 23 3 4 0 1 None
[3 rows x 29 columns]
You can make this more efficient by letting pandas parse the table and its rows, then doing a vectorized check instead of looping over every row.
Code:
def find_data(url, includeKeywords):
    df = pd.read_html(url)[0]
    df = df[df.stack().str.contains('|'.join(includeKeywords)).any(level=0)]
    return df

includeKeywords = ['NCAA', 'NWDS', 'NAIA']

find_data('https://www.baseball-reference.com/register/player.fcgi?id=buckle002jos', includeKeywords)
Output:
Year Age AgeDif Tm Lg Lev ... TB GDP HBP SH SF IBB
0 2020 19 -2.2 Grand Canyon WAC NCAA ... 0 0.0 0 1 0 NaN
1 2021 20 -1.8 Grand Canyon WAC NCAA ... 15 NaN 3 5 1 NaN
2 2021 20 -0.5 Kokomo NWDS Smr ... 23 3.0 4 0 1 NaN
[3 rows x 30 columns]
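One caveat, as an assumption about newer library versions rather than part of the original answer: Series.any(level=0) was deprecated and, as far as I know, removed in pandas 2.x, so on a recent install the same vectorized filter can be expressed with groupby(level=0) instead. A minimal sketch:

import pandas as pd

def find_data(url, includeKeywords):
    # read_html returns a list of DataFrames; the stats table is assumed to be the first one
    df = pd.read_html(url)[0].dropna(axis=0, how='all')
    # stack() yields one value per (row, column) pair; groupby(level=0).any() collapses
    # that back to one boolean per row, replacing the removed .any(level=0) call
    mask = (df.stack().astype(str)
              .str.contains('|'.join(includeKeywords))
              .groupby(level=0).any())
    # reindex guards against rows whose cells were all dropped by stack()
    return df[mask.reindex(df.index, fill_value=False)]

includeKeywords = ['NCAA', 'NWDS', 'NAIA']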
Full code:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

''' Crawling Function '''
# Must enter the 'id' number that is at the end of a team's baseball
# reference page link in order to use this function
def crawl(_id):
    bbref = 'https://www.baseball-reference.com/register/team.cgi?id='
    html = sewp(bbref + _id)
    href_tags = html.find_all(href=True)
    href_tags = list(href_tags)
    hrefs = {tag.text: tag.get('href') for tag in href_tags}
    # Append relevant links to list
    player_links = {}
    for playerName, href in hrefs.items():
        if '/register/player.fcgi?id=' in href:
            player_links.update({playerName: href})
    # Finish by returning the player data as a single DataFrame
    player_data = []
    for playerName, link in player_links.items():
        player_data.append(find_data('https://www.baseball-reference.com' + link, includeKeywords, playerName))
    for df in player_data:
        numeric(df)
    player_data = pd.concat(player_data, axis=0)
    return player_data

def find_data(url, includeKeywords, playerName):
    print(playerName)
    df = pd.read_html(url)[0].dropna(axis=0, how='all')
    df = df[df.stack().str.contains('|'.join(includeKeywords)).any(level=0)]
    df['playerName'] = playerName
    return df

def numeric(frame):
    for i in frame.columns:
        try:
            frame[[i]] = frame[[i]].astype(float)
        except:
            pass

def sewp(url):
    r = requests.get(url, headers=headers)
    html = soup(r.text, features='lxml')
    return html

# dict of team ids for the desired league by year
nwl_team_id_dict_21 = {'kokomo': '08eb649f', 'pit-spitters': '6f0d2cd3', 'kingfish': '79c106fa',
                       'rivets': 'f61e0ce3', 'bombers': '3cbb765d', 'growlers': 'b0a9f9bc', 'spiders': '355b5892',
                       'mallards': '8eaa34fb', 'chinooks': '2ffd9848', 'booyah': '604e6d45', 'bucks': '2b013943',
                       'huskies': '95a9931c', 'loggers': '896f45b5', 'express': 'ee0f0409', 'mud-puppies': 'f46a2140',
                       'rox': '0b75a745', 'moondogs': '7027a89b', 'woodchucks': 'f22e21d1', 'rafters': '8f0328cd',
                       'stingers': '1884dcb1', 'honkers': '0b4021dc', 'larks': '5bed69fc'}

includeKeywords = ['NCAA', 'NWDS', 'NAIA']

results = []
for team, _id in nwl_team_id_dict_21.items():
    print('\n', team.title())
    player_data = crawl(_id)
    results.append(player_data)

results = pd.concat(results, axis=0)
Output:
print(results)
Year Age AgeDif Tm Lg ... H9 HR9 BB9 SO9 SO/W
0 2020.0 19.0 -2.2 Grand Canyon WAC ... NaN NaN NaN NaN NaN
1 2021.0 20.0 -1.8 Grand Canyon WAC ... NaN NaN NaN NaN NaN
2 2021.0 20.0 -0.5 Kokomo NWDS ... NaN NaN NaN NaN NaN
1 2020.0 21.0 -0.3 Embry-Riddle (FL) SSC ... NaN NaN NaN NaN NaN
3 2021.0 22.0 1.5 Kokomo NWDS ... NaN NaN NaN NaN NaN
.. ... ... ... ... ... ... .. ... ... ... ...
5 2022.0 22.0 -0.2 Montevallo GSC ... NaN NaN NaN NaN NaN
0 2020.0 22.0 1.1 Kansas State B12 ... NaN NaN NaN NaN NaN
2 2021.0 23.0 1.6 Kansas State B12 ... NaN NaN NaN NaN NaN
3 2021.0 23.0 2.5 Bismarck NWDS ... NaN NaN NaN NaN NaN
4 2022.0 24.0 2.3 New Mexico MWC ... NaN NaN NaN NaN NaN
[2378 rows x 52 columns]