Web scraping and pagination with Python, csv, BeautifulSoup and Pandas
The database at https://aviation-safety.net/wikibase/ spans the years 1902 to 2022. The code presented here captures some years but misses others: years before 1912 and after 2021 are not captured. I want to scrape all accidents for every type of aircraft, either for all years at once or year by year. The database starts at https://aviation-safety.net/wikibase/dblist.php?Year=1902 and should end at https://aviation-safety.net/wikibase/dblist.php?Year=2022. Currently the code dumps the results into .csv files, but it could also go into SQLite (a sketch of that follows the answer below).
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):
    # use a default-looking header in case the site blocks requests that
    # don't carry "accept" and "user-agent", which sometimes happens
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    page_container = soup.find('div', {'class': 'pagenumbers'})
    # get the last page number: take every pagination link at the bottom of
    # the page, split its href on '=', keep the last piece as an int, and
    # take the max of those ints
    pages = max(int(page['href'].split('=')[-1]) for page in page_container.find_all('a'))
    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')
        table = soup.find('table', {'class': 'hp'})
        regex = re.compile('list.*')
        for index, row in enumerate(table.find_all('tr', {'class': regex})):
            if index == 0:  # skip the header row
                continue
            acc_link = 'https://aviation-safety.net/' + row.find('a')['href']
            # normalize partial dates: try the full 'dd-Mon-YYYY' form first,
            # then fall back to month-only and year-only entries
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime('01' + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime('01-01' + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue
            cells = row.find_all('td')
            acc_type = cells[1].text
            acc_reg = cells[2].text
            acc_operator = cells[3].text
            acc_fat = cells[4].text
            acc_location = cells[5].text
            acc_dmg = cells[7].text
            item = {
                'acc_link': acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator': acc_operator,
                'acc_fat': acc_fat,
                'acc_location': acc_location,
                'acc_dmg': acc_dmg,
            }
            info.append(item)
    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', encoding='utf-8-sig', index=False)

if __name__ == "__main__":
    START = 1901
    STOP = 2023
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
        # note: final_list is never consumed, so any exception raised inside
        # scrape_year is silently discarded (see the sketch below)
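A side note on why the crash goes unnoticed in the script above: executor.map returns a lazy iterator, and a worker's exception is only re-raised when that worker's result is consumed. Since final_list is never iterated, the failing years simply vanish. A minimal self-contained sketch of that behavior (the boom function is a toy stand-in, not part of the scraper):

import concurrent.futures

def boom(n):
    if n == 2:
        raise ValueError('simulated scrape failure')
    return n * n

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = executor.map(boom, range(4))  # nothing raised yet: map is lazy
try:
    for r in results:
        print(r)  # the ValueError for n == 2 surfaces only here
except ValueError as e:
    print('surfaced only when consumed:', e)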
Lmao, I wrote that code for someone on this site before. I've edited it here so the missing years work: years with only a single page of results have no pagenumbers div, so the original pages lookup blew up before anything was scraped. The version below falls back to one page in that case, and returns the rows so they can be combined into a single file:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):
    try:
        headers = {
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }
        url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.text, 'html.parser')
        page_container = soup.find('div', {'class': 'pagenumbers'})
        try:
            pages = max(int(page['href'].split('=')[-1]) for page in page_container.find_all('a'))
        except (AttributeError, ValueError):
            # years with a single page of results have no pagenumbers div,
            # so default to one page -- this is the fix for the missing years
            pages = 1
        info = []
        for page in range(1, pages + 1):
            new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
            print(new_url)
            data = requests.get(new_url, headers=headers)
            soup = BeautifulSoup(data.text, 'html.parser')
            table = soup.find('table', {'class': 'hp'})
            regex = re.compile('list.*')
            for row in table.find_all('tr', {'class': regex}):
                acc_link = 'https://aviation-safety.net/' + row.find('a')['href']
                # normalize partial dates: full, then month-only, then year-only
                try:
                    acc_date = datetime.strptime(row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        # day missing (e.g. '-MAR-2022'): assume the 1st
                        acc_date = datetime.strptime('01' + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        try:
                            # day and month missing: assume 1 January; the original
                            # '01-01' prefix could never satisfy '%b' (a month name),
                            # so this assumes year-only dates render like '-2022'
                            acc_date = datetime.strptime('01-Jan' + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                        except ValueError:
                            continue
                cells = row.find_all('td')
                item = {
                    'acc_link': acc_link,
                    'acc_date': acc_date,
                    'acc_type': cells[1].text,
                    'acc_reg': cells[2].text,
                    'acc_operator': cells[3].text,
                    'acc_fat': cells[4].text,
                    'acc_location': cells[5].text,
                    'acc_dmg': cells[7].text,
                }
                info.append(item)
        return info
    except Exception as e:
        print(e, url)
        return []

if __name__ == "__main__":
    START = 2022   # widen to 1901/2023 to rebuild the whole database
    STOP = 2023
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
    # flatten the per-year lists into one big list of row dicts
    list_of_dicts = list(final_list)
    flat_list = [item for sublist in list_of_dicts for item in sublist]
    df = pd.DataFrame(flat_list)
    df.to_csv('all_years_aviation-safety.csv', index=False)
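As for the SQLite option mentioned in the question: pandas can write the same DataFrame to a SQLite table via DataFrame.to_sql. A minimal sketch, assuming the combined df built above (the database file name and table name here are hypothetical placeholders):

import sqlite3
import pandas as pd

def save_to_sqlite(df, db_path='aviation-safety.db', table='accidents'):
    # db_path and table are hypothetical names; pick your own
    conn = sqlite3.connect(db_path)
    try:
        # if_exists='replace' recreates the table on every run;
        # use 'append' to accumulate rows across runs instead
        df.to_sql(table, conn, if_exists='replace', index=False)
    finally:
        conn.close()

Calling save_to_sqlite(df) right after (or instead of) the final df.to_csv(...) line would keep both outputs in sync.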