Dynamically scrape paginated table with BeautifulSoup and store results in csv?

The code runs but the dataframe ends up empty. In the URL below, both YEAR and PAGE are dynamic. I want to loop over both, grab the data in the table's td cells (and, if possible, the linked details under acc. date), and write the results for each year to a year.csv.

import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})


with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table tbody and extract the data under the 'td' tag
        for row in soup.select('table > tbody > tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If more than one page then iterate through all of them        
        if soup.select_one('div.pagenumbers > span.current + div.a'):
            url = soup.select_one('div.pagenumbers > span.current + div.a')['href']
        else:
            break

What happens?

First of all, always take a look at the soup; that is where the truth lies.

You are missing the headers in the request inside your while loop, which leads to a 403 error and the table not being selected correctly.
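
As a quick check (a minimal sketch reusing the URL from the question), print the status code and the result of the row selector to confirm the 403 before changing anything else:

import requests
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'

# Request without headers to reproduce the problem
html = requests.get(url)
print(html.status_code)                # 403 when no User-Agent is sent

soup = BeautifulSoup(html.text, 'html.parser')
print(soup.select('table tr.list'))    # empty list, because the real page never came back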

How to fix it?

Set the headers for the request inside your while loop:

html = requests.get(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

Select your rows more specifically, and note that there is no tbody in the HTML:

        # Go through the table rows and extract the data under the 'td' tag
        for row in soup.select('table tr.list'):

Also check the selector for the pagination:

# If more than one page then iterate through all of them        
if soup.select_one('div.pagenumbers span.current + a'):
    url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
else:
    break

Example

import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url , headers = headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table rows and extract the data under the 'td' tag
        for row in soup.select('table tr.list'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If more than one page then iterate through all of them        
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break

Just in case

An alternative solution with pandas.read_html() that iterates over all the years:

import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
url = 'https://aviation-safety.net/wikibase/'
req = requests.get(url , headers = headers)
soup = BeautifulSoup(req.text, 'html.parser')


data = []

for url in ['https://aviation-safety.net/'+a['href'] for a in soup.select('a[href*="/wikibase/dblist.php"]')]:
    while True:

        html = requests.get(url, headers = headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        data.append(pd.read_html(soup.prettify())[0])

        # If more than one page then iterate through all of them        
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php'+soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break
        time.sleep(random.random())

df = pd.concat(data)
df.loc[:, ~df.columns.str.contains('^Unnamed')].to_csv('aviation-safety.csv', index=False)
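
Since the question asks for one file per year, a small follow-up sketch (run after the code above) could split the combined frame into per-year CSVs. It assumes the date column scraped from the site's table is labelled 'acc. date' and that each value contains the four-digit year:

# Assumption: the scraped table has a column named 'acc. date' whose values
# contain the four-digit year, e.g. something like '12-Mar-1916'
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df['year'] = df['acc. date'].astype(str).str.extract(r'(\d{4})', expand=False)

# Write one file per year, dropping the helper column again
for year, group in df.groupby('year'):
    group.drop(columns='year').to_csv(f'{year}_aviation-safety.csv', index=False)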

I made a few changes to your script that should make it easier to debug and maintain. It uses pandas to simplify writing the CSV and concurrent.futures to speed things up; let me know if you have any questions. Basically, each year is scraped concurrently: I request the first page to find out how many pages there are for that year, then loop through each page and parse the HTML. The key fields go into a dictionary that is appended to a list, which makes writing the CSV with pandas easy, since a list of dictionaries is essentially already a dataframe.

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):

    headers =   {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text,'html.parser')

    # Work out how many pages this year has; if no pagination links are
    # present, fall back to a single page
    page_container = soup.find('div',{'class':'pagenumbers'})
    page_links = page_container.find_all('a') if page_container else []
    pages = max([int(page['href'].split('=')[-1]) for page in page_links], default=1)

    info = []
    for page in range(1,pages+1):

        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)

        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')


        table = soup.find('table',{'class':'hp'})


        # Data rows have a class starting with 'list'; skip the first matched row
        regex = re.compile('list.*')
        for index,row in enumerate(table.find_all('tr',{'class':regex})):
            if index == 0:
                continue

            acc_link = 'https://aviation-safety.net/'+row.find('a')['href']
            # Dates can be incomplete (missing the day, or the day and month);
            # pad them so they still parse, and skip rows that cannot be parsed at all
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime("01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01-01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue

            acc_type = row.find_all('td')[1].text
            acc_reg = row.find_all('td')[2].text
            acc_operator = row.find_all('td')[3].text
            acc_fat = row.find_all('td')[4].text
            acc_location = row.find_all('td')[5].text
            acc_dmg = row.find_all('td')[7].text

            item = {
                'acc_link' : acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator' :acc_operator,
                'acc_fat':acc_fat,
                'acc_location':acc_location,
                'acc_dmg':acc_dmg
                }

            info.append(item)

    df= pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv',index=False)


if __name__ == "__main__":

    START = 1916
    STOP = 2022

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year,years)