Beautiful Soup 网络爬虫:尝试过滤我要解析的特定行

Beautiful Soup web crawler: Trying to filter specific rows I want to parse

我构建了一个网络爬虫,这是它爬取的其中一个页面的示例:

https://www.baseball-reference.com/register/player.fcgi?id=buckle002jos

我只想获取其中包含 'NCAA' 或 'NAIA' 或 'NWDS' 的行。目前以下代码获取页面上的所有行,我尝试过滤它并不完全有效。

这是爬虫的代码:

''' Crawling Function '''

# Must enter the 'id' number that is at the end of a teams baseball
# reference page link in order to use this function
def crawl(_id):
    bbref = 'https://www.baseball-reference.com/register/team.cgi?id='
    html = sewp(bbref + _id)
    href_tags = html.find_all(href = True)
    href_tags = list(href_tags)
    hrefs = [tag.get('href') for tag in href_tags]
    
    # Append relevant links to list
    player_links = []
    for href in hrefs:
        if '/register/player.fcgi?id=' in href:
            player_links.append(href)
    
    # finish by Returning the player data to a list
    player_data = []
    for link in player_links:
        player_data.append(find_data('https://www.baseball-reference.com' + link))
    for df in player_data:
        numeric(df)
    return player_data

这里是函数“find_data”的代码,我在试图过滤已解析行的爬虫中爬行:

def find_data(url):
    page = requests.get(url, headers=headers)
    text = soup(page.text, features='lxml')
    row = text.find_all('tr')

    ''' Attempting to parse row, but only if NCAA, NAIA, or NWDS in row '''
    data = []
    for r in row:
        if 'NCAA' in r or 'NWDS' in r or 'NAIA' in r:
            data.append(parse_row(r))

            # data = [parse_row(rows) for rows in row]
    df = pd.DataFrame(data)
    return df

其他功能:

def parse_row(rows):
    return [str(x.string)for x in rows.find_all('td')]

def numeric(frame):
    for i in frame.columns:
        try:
            frame[[i]] = frame[[i]].astype(float)
        except:
            pass
def sewp(url):
    r = requests.get(url, headers = headers)
    html = soup(r.text, features = 'lxml')
    return html

下面是我用来做抓取功能的函数

# dict of team id's for desired league by year
nwl_team_id_dict_21 = {'kokomo':'08eb649f', 'pit-spitters':'6f0d2cd3','kingfish':'79c106fa',
                       'rivets':'f61e0ce3','bombers':'3cbb765d','growlers':'b0a9f9bc','spiders':'355b5892',
                       'mallards':'8eaa34fb','chinooks':'2ffd9848','booyah':'604e6d45','bucks':'2b013943',
                       'huskies':'95a9931c','loggers':'896f45b5','express':'ee0f0409','mud-puppies':'f46a2140',
                       'rox':'0b75a745','moondogs':'7027a89b','woodchucks':'f22e21d1','rafters':'8f0328cd',
                       'stingers':'1884dcb1','honkers':'0b4021dc','larks':'5bed69fc'}

def get_league(league_dict):
    all_players = []
    for team in league_dict.values():
        all_players.append(crawl(team))
    return all_players

问题是因为你检查

'NCAA' in ["<a>NCAA</a>", "<span>OTHER</span>"]

这给出 False

它使用 'NCAA' == "<a>NCAA</a>"(给出 False)检查列表中的每个元素,而不是使用 'NCAA' in "<a>NCAA</a>"(给出 True

您必须先运行r = parse_row(r)检查

'NCAA' in ["NCAA", "OTHER"]

这会起作用


最少的工作代码

import requests
''' Crawling Function '''
from bs4 import BeautifulSoup as soup
import pandas as pd

headers = {}

def find_data(url):
        print(url)
        
        page = requests.get(url, headers=headers)
        text = soup(page.text, features='lxml')
        row = text.find_all('tr')
        
        ''' Attempting to parse row, but only if NCAA, NAIA, or NWDS in row '''
        data = []
        
        for r in row:
          r = parse_row(r)
          #print(r)
          if 'NCAA' in r or 'NWDS' in r or 'NAIA' in r:
              #r = parse_row(r)
              #print(r)
              data.append(r) 

        df = pd.DataFrame(data)

        return df
    
def parse_row(rows):
    return [x.string for x in rows.find_all('td')]
    #return [x.get_text() for x in rows.find_all('td')]

find_data('https://www.baseball-reference.com/register/player.fcgi?id=buckle002jos')

结果:

   0     1             2     3     4     5   6   ...    22  23    24 25 26 27    28
0  19  -2.2  Grand Canyon   WAC  NCAA  None   4  ...  .000   0     0  0  1  0  None
1  20  -1.8  Grand Canyon   WAC  NCAA  None  32  ...  .448  15  None  3  5  1  None
2  20  -0.5        Kokomo  NWDS   Smr  None  35  ...  .512  23     3  4  0  1  None

[3 rows x 29 columns]

您可以通过让 pandas 解析 table 和行然后向量化检查而不是遍历每一行来提高代码效率。

代码:

def find_data(url, includeKeywords):
    df = pd.read_html(url)[0]
    df = df[df.stack().str.contains('|'.join(includeKeywords)).any(level=0)]
    return df

includeKeywords = ['NCAA','NWDS','NAIA']
find_data('https://www.baseball-reference.com/register/player.fcgi?id=buckle002jos', includeKeywords)

输出:

   Year Age AgeDif            Tm    Lg   Lev  ...  TB  GDP  HBP  SH  SF  IBB
0  2020  19   -2.2  Grand Canyon   WAC  NCAA  ...   0  0.0    0   1   0  NaN
1  2021  20   -1.8  Grand Canyon   WAC  NCAA  ...  15  NaN    3   5   1  NaN
2  2021  20   -0.5        Kokomo  NWDS   Smr  ...  23  3.0    4   0   1  NaN

[3 rows x 30 columns]

完整代码:

import requests
from bs4 import BeautifulSoup as soup
import pandas as pd


headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

''' Crawling Function '''

# Must enter the 'id' number that is at the end of a teams baseball
# reference page link in order to use this function
def crawl(_id):
    bbref = 'https://www.baseball-reference.com/register/team.cgi?id='
    html = sewp(bbref + _id)
    href_tags = html.find_all(href = True)
    href_tags = list(href_tags)
    hrefs = {tag.text:tag.get('href') for tag in href_tags}
    
    # Append relevant links to list
    player_links = {}
    for playerName, href in hrefs.items():
        if '/register/player.fcgi?id=' in href:
            player_links.update({playerName:href})
    
    # finish by Returning the player data to a list
    player_data = []
    for playerName, link in player_links.items():
        player_data.append(find_data('https://www.baseball-reference.com' + link, includeKeywords, playerName))
    for df in player_data:
        numeric(df)
    player_data = pd.concat(player_data, axis=0)
    return player_data

def find_data(url, includeKeywords, playerName):
    print(playerName)
    df = pd.read_html(url)[0].dropna(axis=0, how='all')
    df = df[df.stack().str.contains('|'.join(includeKeywords)).any(level=0)]
    df['playerName'] = playerName
    return df

def numeric(frame):
    for i in frame.columns:
        try:
            frame[[i]] = frame[[i]].astype(float)
        except:
            pass
def sewp(url):
    r = requests.get(url, headers = headers)
    html = soup(r.text, features = 'lxml')
    return html

# dict of team id's for desired league by year
nwl_team_id_dict_21 = {'kokomo':'08eb649f', 'pit-spitters':'6f0d2cd3','kingfish':'79c106fa',
                       'rivets':'f61e0ce3','bombers':'3cbb765d','growlers':'b0a9f9bc','spiders':'355b5892',
                       'mallards':'8eaa34fb','chinooks':'2ffd9848','booyah':'604e6d45','bucks':'2b013943',
                       'huskies':'95a9931c','loggers':'896f45b5','express':'ee0f0409','mud-puppies':'f46a2140',
                       'rox':'0b75a745','moondogs':'7027a89b','woodchucks':'f22e21d1','rafters':'8f0328cd',
                       'stingers':'1884dcb1','honkers':'0b4021dc','larks':'5bed69fc'}



includeKeywords = ['NCAA','NWDS','NAIA']
results = []
for team, _id in nwl_team_id_dict_21.items():
    print('\n', team.title())
    player_data = crawl(_id)
    results.append(player_data)
results = pd.concat(results, axis=0)

输出:

print(results)
      Year   Age AgeDif                 Tm    Lg  ...  H9  HR9  BB9  SO9  SO/W
0   2020.0  19.0   -2.2       Grand Canyon   WAC  ... NaN  NaN  NaN  NaN   NaN
1   2021.0  20.0   -1.8       Grand Canyon   WAC  ... NaN  NaN  NaN  NaN   NaN
2   2021.0  20.0   -0.5             Kokomo  NWDS  ... NaN  NaN  NaN  NaN   NaN
1   2020.0  21.0   -0.3  Embry-Riddle (FL)   SSC  ... NaN  NaN  NaN  NaN   NaN
3   2021.0  22.0    1.5             Kokomo  NWDS  ... NaN  NaN  NaN  NaN   NaN
..     ...   ...    ...                ...   ...  ...  ..  ...  ...  ...   ...
5   2022.0  22.0   -0.2         Montevallo   GSC  ... NaN  NaN  NaN  NaN   NaN
0   2020.0  22.0    1.1       Kansas State   B12  ... NaN  NaN  NaN  NaN   NaN
2   2021.0  23.0    1.6       Kansas State   B12  ... NaN  NaN  NaN  NaN   NaN
3   2021.0  23.0    2.5           Bismarck  NWDS  ... NaN  NaN  NaN  NaN   NaN
4   2022.0  24.0    2.3         New Mexico   MWC  ... NaN  NaN  NaN  NaN   NaN

[2378 rows x 52 columns]