Dynamically scrape paginated table with BeautifulSoup and store results in csv?

The code runs but the dataframe ends up empty. In the URL below, both YEAR and PAGE are dynamic. I want to loop over both, grab the data in the table's td cells (and, if possible, the linked details under acc. date), and write the results for each year to a year.csv.

import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})


with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table tbody and extract the data under the 'td' tag
        for row in soup.select('table > tbody > tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If more than one page then iterate through all of them        
        if soup.select_one('div.pagenumbers > span.current + div.a'):
            url = soup.select_one('div.pagenumbers > span.current + div.a')['href']
        else:
            break

What happens?

First of all, always take a look at the soup; that is where the truth lies.

You are missing the headers in the request inside your while loop, which leads to a 403 error and the table not being selected correctly.
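
As a quick check (a minimal sketch reusing the URL from the question), print the status code and the result of the row selector to confirm the 403 before changing anything else:

import requests
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'

# Request without headers to reproduce the problem
html = requests.get(url)
print(html.status_code)                # 403 when no User-Agent is sent

soup = BeautifulSoup(html.text, 'html.parser')
print(soup.select('table tr.list'))    # empty list, because the real page never came back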

How to fix it?

Set the headers for the request inside your while loop:

html = requests.get(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

Select your rows more specifically, and note that there is no tbody in the HTML:

        # Go through the table rows and extract the data under the 'td' tag
        for row in soup.select('table tr.list'):

Also check the selector for the pagination:

# If more than one page then iterate through all of them        
if soup.select_one('div.pagenumbers span.current + a'):
    url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
else:
    break

Example

import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url , headers = headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table rows and extract the data under the 'td' tag
        for row in soup.select('table tr.list'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If more than one page then iterate through all of them        
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break

Just in case

An alternative solution with pandas.read_html() that iterates over all the years:

import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
url = 'https://aviation-safety.net/wikibase/'
req = requests.get(url , headers = headers)
soup = BeautifulSoup(req.text, 'html.parser')


data = []

for url in ['https://aviation-safety.net/'+a['href'] for a in soup.select('a[href*="/wikibase/dblist.php"]')]:
    while True:

        html = requests.get(url, headers = headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        data.append(pd.read_html(soup.prettify())[0])

        # If more than one page then iterate through all of them        
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break
        time.sleep(random.random())

df = pd.concat(data)
df.loc[:, ~df.columns.str.contains('^Unnamed')].to_csv('aviation-safety.csv', index=False)
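
Since the question asks for one file per year, a small follow-up sketch (run after the code above) could split the combined frame into per-year CSVs. It assumes the date column scraped from the site's table is labelled 'acc. date' and that each value contains the four-digit year:

# Assumption: the scraped table has a column named 'acc. date' whose values
# contain the four-digit year, e.g. something like '12-Mar-1916'
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df['year'] = df['acc. date'].astype(str).str.extract(r'(\d{4})', expand=False)

# Write one file per year, dropping the helper column again
for year, group in df.groupby('year'):
    group.drop(columns='year').to_csv(f'{year}_aviation-safety.csv', index=False)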

I made a few changes to your script that should make it easier to debug and maintain. It uses pandas to simplify writing the CSV and concurrent.futures to speed things up; let me know if you have any questions. Basically, each year is scraped concurrently: I request the first page to find out how many pages there are for that year, then loop through each page and parse the HTML. The key fields go into a dictionary that is appended to a list, which makes writing the CSV with pandas easy, since a list of dictionaries is essentially already a dataframe.

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):

    headers =   {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text,'html.parser')

    # Work out how many pages this year has; if no pagination links are
    # present, fall back to a single page
    page_container = soup.find('div',{'class':'pagenumbers'})
    page_links = page_container.find_all('a') if page_container else []
    pages = max([int(page['href'].split('=')[-1]) for page in page_links], default=1)

    info = []
    for page in range(1,pages+1):

        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)

        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')


        table = soup.find('table',{'class':'hp'})


        # Data rows have a class starting with 'list'; skip the first matched row
        regex = re.compile('list.*')
        for index,row in enumerate(table.find_all('tr',{'class':regex})):
            if index == 0:
                continue

            acc_link = 'https://aviation-safety.net/'+row.find('a')['href']
            # Dates can be incomplete (missing the day, or the day and month);
            # pad them so they still parse, and skip rows that cannot be parsed at all
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime("01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01-01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue

            acc_type = row.find_all('td')[1].text
            acc_reg = row.find_all('td')[2].text
            acc_operator = row.find_all('td')[3].text
            acc_fat = row.find_all('td')[4].text
            acc_location = row.find_all('td')[5].text
            acc_dmg = row.find_all('td')[7].text

            item = {
                'acc_link' : acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator' :acc_operator,
                'acc_fat':acc_fat,
                'acc_location':acc_location,
                'acc_dmg':acc_dmg
                }

            info.append(item)

    df= pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv',index=False)


if __name__ == "__main__":

    START = 1916
    STOP = 2022

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year,years)