Dynamically scrape paginated table with BeautifulSoup and store results in csv?
The code runs but the dataframe is empty.
In the URL below, both YEAR and PAGE are dynamic.
I want to iterate over both, grab the data in the table's td cells and (if possible) the linked details under acc. date, and write the results to a separate year.csv for each year.
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration", "operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through table > tbody and extract the data under the 'td' tag
        for row in soup.select('table > tbody > tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If more than one page then iterate through all of them
        if soup.select_one('div.pagenumbers > span.current + div.a'):
            url = soup.select_one('div.pagenumbers > span.current + div.a')['href']
        else:
            break
What is happening?
First of all, always take a look at your soup; that is where the truth lies.
The request inside your while loop is sent without headers, which leads to a 403 error, so the table is not selected correctly.
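To see this for yourself, here is a minimal debugging sketch (not part of the fix) that prints the status code and counts the rows found with the selector used later in this answer:

import requests
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'

# Without a User-Agent header the request is likely to be blocked (403),
# so the soup holds an error page and contains no table rows.
resp = requests.get(url)
print(resp.status_code)

soup = BeautifulSoup(resp.text, 'html.parser')
print(len(soup.select('table tr.list')))  # 0 rows -> nothing to write to the CSV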
How to fix it?
Set the headers for the request inside your while loop as well:
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
Select your rows more specifically - note that there is no tbody in the HTML:
# Go through the table and extract the data under the 'td' tag
for row in soup.select('table tr.list'):
Also check the selector for the pagination:
# If more than one page then iterate through all of them
if soup.select_one('div.pagenumbers span.current + a'):
    url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
else:
    break
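In case the adjacent-sibling selector looks unfamiliar, here is a small self-contained illustration; the HTML fragment is made up and only mimics what the pagination markup is assumed to look like:

from bs4 import BeautifulSoup

# Made-up fragment, only to show how "span.current + a" behaves;
# the real markup on aviation-safety.net may differ in detail.
html = '''
<div class="pagenumbers">
  <span class="current">1</span>
  <a href="?Year=1916&sorteer=datekey&page=2">2</a>
  <a href="?Year=1916&sorteer=datekey&page=3">3</a>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')

# "span.current + a" matches the <a> that directly follows the current page marker,
# i.e. the link to the next page; on the last page there is no such sibling and
# select_one() returns None, which is what ends the while loop.
nxt = soup.select_one('div.pagenumbers span.current + a')
print(nxt['href'] if nxt else 'last page')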
Example
import requests, csv
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration", "operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table and extract the data under the 'td' tag
        for row in soup.select('table tr.list'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If more than one page then iterate through all of them
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break
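Since you ultimately want one CSV per year, the same example can be wrapped in a loop over the Year parameter. A minimal sketch, reusing the selectors from the example above; the helper name and the year range are only illustrative:

import requests, csv
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

def scrape_year_to_csv(year):
    # Hypothetical helper: same logic as the example above, parameterised by year
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    with open(f'{year}_aviation-safety.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["acc. date", "Type", "Registration", "operator", "fat", "Location", " ", "dmg", " ", " "])
        while True:
            soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
            for row in soup.select('table tr.list'):
                writer.writerow([c.text if c.text else '' for c in row.select('td')])
            nxt = soup.select_one('div.pagenumbers span.current + a')
            if nxt:
                url = 'https://aviation-safety.net/wikibase/dblist.php' + nxt['href']
            else:
                break

for year in range(1916, 1919):  # extend the range as needed
    scrape_year_to_csv(year)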
Just in case, an alternative solution with pandas.read_html() that iterates over all the years:
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
url = 'https://aviation-safety.net/wikibase/'

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

data = []

# Iterate over the per-year list pages linked from the wikibase start page
for url in ['https://aviation-safety.net/' + a['href'] for a in soup.select('a[href*="/wikibase/dblist.php"]')]:
    while True:
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        data.append(pd.read_html(soup.prettify())[0])

        # If more than one page then iterate through all of them
        if soup.select_one('div.pagenumbers span.current + a'):
            url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
        else:
            break
    time.sleep(random.random())

df = pd.concat(data)
df.loc[:, ~df.columns.str.contains('^Unnamed')].to_csv('aviation-safety.csv', index=False)
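If you prefer one file per year from this combined frame instead, a rough follow-up sketch; it assumes the date column from the table is called 'acc. date' (the header used in the question), so adjust the name if the real header differs:

# Continues from the df built above; split the combined frame into one CSV per year.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# Pull the 4-digit year out of the date string, e.g. '12-JUN-1916' -> '1916'
df['year'] = df['acc. date'].astype(str).str.extract(r'(\d{4})', expand=False)
for year, group in df.dropna(subset=['year']).groupby('year'):
    group.drop(columns='year').to_csv(f'{year}_aviation-safety.csv', index=False)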
I made a few changes to your script that should make it easier to debug and maintain. It uses pandas to make writing the CSV easier and concurrent.futures to speed things up. Basically, each year is scraped concurrently: I request the first page to find out how many pages that year has, then loop through each page and parse the HTML. The key information of every row is put into a dictionary, which is appended to a list (a list of dictionaries is essentially already a dataframe, so writing the CSV with pandas is easy). Let me know if you have questions.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures


def scrape_year(year):
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')

    # Number of result pages for this year, taken from the pagination links
    page_container = soup.find('div', {'class': 'pagenumbers'})
    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])

    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')

        table = soup.find('table', {'class': 'hp'})
        regex = re.compile('list.*')
        for index, row in enumerate(table.find_all('tr', {'class': regex})):
            if index == 0:
                continue
            acc_link = 'https://aviation-safety.net/' + row.find('a')['href']
            # Try to parse the accident date, padding partial dates; skip rows whose date cannot be parsed
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime("01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01-01" + row.find('a').text.strip(), '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue
            acc_type = row.find_all('td')[1].text
            acc_reg = row.find_all('td')[2].text
            acc_operator = row.find_all('td')[3].text
            acc_fat = row.find_all('td')[4].text
            acc_location = row.find_all('td')[5].text
            acc_dmg = row.find_all('td')[7].text

            item = {
                'acc_link': acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator': acc_operator,
                'acc_fat': acc_fat,
                'acc_location': acc_location,
                'acc_dmg': acc_dmg
            }
            info.append(item)

    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', index=False)


if __name__ == "__main__":
    START = 1916
    STOP = 2022

    years = [year for year in range(START, STOP + 1)]
    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
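As a small follow-up, once the per-year files exist they can be stitched into a single CSV with pandas; the combined filename below is just an example:

import glob
import pandas as pd

# Assumes scrape_year() has already written the <year>_aviation-safety.csv files
frames = [pd.read_csv(path) for path in sorted(glob.glob('*_aviation-safety.csv'))]
pd.concat(frames, ignore_index=True).to_csv('aviation-safety_all_years.csv', index=False)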