Crawling data from a site having JSON
I want to scrape data from the page https://www.balticshipping.com/vessels, including ship info from its detail pages such as https://www.balticshipping.com/vessel/imo/9331713, and save this data to a CSV table. When I move to the next page, I see that the URL does not change, so I don't know how to get the data from all the pages. Is there a specific way to get all this data into one CSV file?
[Screenshot: Next button inspection view]
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.balticshipping.com/'
headers = {'User-Agent': 'Mozilla/5.0'}
productlinks = []

response = requests.get('https://www.balticshipping.com/vessels', headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
productlist = soup.find_all('div', id='search_results')

# loop to get every href from the result list
for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(baseurl + link['href'])

print(productlinks)
I tried this code to get all the links from the current page, but it gives me an empty result, probably at the line productlist = soup.find_all('div', id="search_results"), since it doesn't fetch any data when selecting by that id instead of a class.
You can access that data through the API. But keep in mind that you will be iterating through roughly 7,700+ pages of data.
The 'year_build' column is in epoch time, i.e. seconds since January 1, 1970. So just convert it to a timestamp, then pull the year out of that datetime.
For the type and country we just need to find the associated ids, then we can create a dictionary and map each id to its corresponding value. In this case we get those ids and names from a separate request; we only need to change the parameters in the POST form data. Both conversions are sketched below, followed by the full script.
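As a quick illustration (a minimal sketch on toy data: the epoch value and the two ids are taken from the output table further down, and the hard-coded lookup stands in for the one the full script builds from the API response):

import pandas as pd

# Epoch seconds -> year: 1293840000 is the value shown for RED DIAMOND below
print(pd.to_datetime(pd.Series([1293840000]), unit='s').dt.year)  # -> 2011

# id -> name lookup; the real dictionaries come from the API's 'dictionary' response
type_ids_dict = {9: 'Bulk carrier', 26: 'Crude oil tanker'}
print(pd.Series([9, 26]).map(type_ids_dict))  # -> Bulk carrier, Crude oil tanker

The full script: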
import requests
import pandas as pd

url = 'https://www.balticshipping.com/'

# Get country and type ids and create lookup dictionaries
payload = {
    'templates[]': ['modal_validation_errors:0',
                    'modal_email_verificate:0',
                    'r_vessel_types_multi:0',
                    'vessels_list:0',
                    'vessels:0'],
    'request[0][module]': 'top_stat',
    'request[0][action]': 'list',
    'request[0][id]': '0',
    'request[0][data]': '',
    'request[0][sort]': '',
    'request[0][limit]': '',
    'request[0][stamp]': '0',
    'dictionary[]': ['countrys:0',
                     'vessel_types:0']}

jsonData = requests.post(url, data=payload).json()

country_ids = pd.DataFrame(jsonData['data']['dictionary']['countrys']['array'])
country_ids_dict = dict(zip(country_ids['id'], country_ids['name']))

type_ids = pd.DataFrame(jsonData['data']['dictionary']['vessel_types']['array'])
type_ids_dict = dict(zip(type_ids['id'], type_ids['name']))

ships_found = True
page = 0
rows = []

# while ships_found:   # swap in this loop to walk all ~7,700 pages
for page in range(10):
    payload = {
        'request[0][module]': 'ships',
        'request[0][action]': 'list',
        'request[0][id]': '0',
        'request[0][data][0][name]': 'search_id',
        'request[0][data][0][value]': '0',
        'request[0][data][1][name]': 'name',
        'request[0][data][1][value]': '',
        'request[0][data][2][name]': 'imo',
        'request[0][data][2][value]': '',
        'request[0][data][3][name]': 'page',
        'request[0][data][3][value]': f'{page}',
        'request[0][sort]': '',
        'request[0][limit]': '27',
        'request[0][stamp]': '0',
        'request[1][module]': 'top_stat',
        'request[1][action]': 'list',
        'request[1][id]': '0',
        'request[1][data]': '',
        'request[1][sort]': '',
        'request[1][limit]': '',
        'request[1][stamp]': '0'}

    jsonData = requests.post(url, data=payload).json()
    if len(jsonData['data']['request'][0]['ships']) == 0:
        ships_found = False
        print('End of pages.')
    else:
        for each in jsonData['data']['request'][0]['ships']:
            rows.append(each['data'])
        page += 1   # only needed for the while-loop variant
        print(page)

df = pd.DataFrame(rows)

# Convert the epoch to a timestamp and pull out the year
df = df.rename(columns={'year_build': 'epoch_year_build'})
df['year_build'] = pd.to_datetime(df['epoch_year_build'], unit='s').dt.year

# Use the lookup dictionaries to map the ids to the corresponding names
df['country_name'] = df['flag_id'].map(country_ids_dict).fillna(df['flag_id'])
df['type_ship'] = df['type'].map(type_ids_dict).fillna(df['type'])

# Split the 'former_names' column (a list of dicts) into separate columns
formerNames = df['former_names']
formerNames_merge = pd.DataFrame()
for idx, row in formerNames.items():   # .iteritems() was removed in pandas 2.0
    try:
        temp = pd.json_normalize(row)
        temp.columns = ['formerNames_' + col for col in temp.columns]
        temp.index = [idx] * len(temp)
    except Exception:
        temp = pd.DataFrame()
    formerNames_merge = pd.concat([formerNames_merge, temp])   # DataFrame.append was removed in pandas 2.0

df = pd.merge(df, formerNames_merge, how='left', left_index=True, right_index=True)
Output:

Here I'm only showing the columns you mentioned needing to convert, but all the data that was there before is still there for you.
print(df[['id','name','type','type_ship','flag_id','country_name','epoch_year_build','year_build']].head(5).to_string())
id name type type_ship flag_id country_name epoch_year_build year_build
0 166434 RED DIAMOND 9 Bulk carrier 99 Liberia 1293840000 2011.0
1 190081 LEDRA 9 Bulk carrier 44 Cyprus 1356998400 2013.0
2 246691 MESTA 26 Crude oil tanker 111 Marshall Islands 1609459200 2021.0
3 246690 CHASELKA 26 Crude oil tanker 197 Hong Kong 1483228800 2017.0
4 172285 EVANS 9 Bulk carrier 111 Marshall Islands 1230768000 2009.0
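Since the question asks for the result as a CSV file, one extra line writes everything (including the converted columns) out once df is built; 'vessels.csv' is just an assumed file name:

# Write the full DataFrame to a single CSV file ('vessels.csv' is an assumed name)
df.to_csv('vessels.csv', index=False)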
@chitown88's solution works, but as far as I can tell the API does not provide complete data, so my solution is to query the ship details by IMO number:
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.balticshipping.com'

# Sample iteration from the first page to page 10
for pagenum in range(10):
    payload = {
        'request[0][module]': 'ships',
        'request[0][action]': 'list',
        'request[0][id]': 0,
        'request[0][data][0][name]': 'search_id',
        'request[0][data][0][value]': 0,
        'request[0][data][1][name]': 'name',
        'request[0][data][1][value]': '',
        'request[0][data][2][name]': 'imo',
        'request[0][data][2][value]': '',
        'request[0][data][3][name]': 'page',
        'request[0][data][3][value]': pagenum,
        'request[0][sort]': '',
        'request[0][limit]': 9,
        'request[0][stamp]': 0,
        'request[1][module]': 'top_stat',
        'request[1][action]': 'list',
        'request[1][id]': 0,
        'request[1][data]': '',
        'request[1][sort]': '',
        'request[1][limit]': '',
        'request[1][stamp]': 0
    }
    response = requests.post(baseurl, data=payload)
    try:
        jsondata = response.json()
    except requests.exceptions.JSONDecodeError:
        print('Error occurred while decoding JSON response.')
        continue  # skip this page instead of reusing a stale/unset jsondata

    ships = jsondata['data']['request'][0]['ships']
    for ship in ships:
        ship_imo = ship['data']['imo']
        print(ship_imo)
        # Now query the ship details by its IMO number,
        # e.g. https://www.balticshipping.com/vessel/imo/9331713
        # resp = requests.get('{baseurl}/vessel/imo/{ship_imo}'.format(baseurl=baseurl, ship_imo=ship_imo))
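A minimal sketch of that last step, assuming the detail pages render their fields as two-cell table rows (I have not verified the exact markup, so the parsing below is deliberately generic, and the output file name 'ship_details.csv' is made up):

import csv
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.balticshipping.com'
ship_imo = 9331713  # example IMO from the question

resp = requests.get('{0}/vessel/imo/{1}'.format(baseurl, ship_imo))
soup = BeautifulSoup(resp.content, 'html.parser')

# Collect every label/value pair found in two-cell table rows on the page
details = {}
for tr in soup.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all('td')]
    if len(cells) == 2:
        details[cells[0]] = cells[1]

# Append the row to a CSV file, writing the header only if the file is empty
with open('ship_details.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=list(details.keys()))
    if f.tell() == 0:
        writer.writeheader()
    writer.writerow(details)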