当第一页 link 与其他页不同时,使用 BeautifulSoup 抓取多个网页
Scraping multiple web pages with BeautifulSoup when the first page link is different from others
我正在为手头的项目尝试抓取这个页面。我想获取所有页面上每辆车的详细信息(价格、里程、变速箱和车龄)。下面的代码存在两个问题:
- 第一页的链接与其他页不同(第一页的 URL 中没有页码参数 &page=1)
- 汽车价格不在详情表格(table)中,需要点击进入每个广告页面后才能获取。
我想知道是否有人愿意帮我看看并给出建议,谢谢。
from bs4 import BeautifulSoup
import requests
import urllib.parse
import csv

# Scrape price, mileage, transmission and age for every Toyota Camry
# offer on olx.com.ng and save the rows to car_features.csv.
# The Toyota Camry model listing page is used as the entry point.
BASE_URL = "https://www.olx.com.ng/vehicles/cars/toyota/"
SEARCH_PARAMS = {"search[filter_enum_model][0]": "toyota/camry"}

r = requests.get(BASE_URL, params=SEARCH_PARAMS)
soup = BeautifulSoup(r.text, "html.parser")

carLinks = set()
data_set = []

# Derive the number of result pages from the pagination links.
# parse_qs returns a *list* of values per key, so the page number is
# element [0] of that list (the original code indexed [1], which
# raises IndexError for a single-valued parameter).
nbPage = 1
for a in soup.select('a'):
    href = a.get('href') or ""
    query = urllib.parse.urlparse(href).query
    pages = urllib.parse.parse_qs(query).get('page')
    if pages:
        nbPage = max(nbPage, int(pages[0]))
print("There are " + str(nbPage) + " web pages to process")

# For each web page that contains a grid of car offers.
# Page 1 is the base URL itself (it carries no &page=1 parameter), so
# the page we already fetched above is harvested first and the page
# parameter is only appended from page 2 onward.
for i in range(1, nbPage + 1):
    print("Processing web page: " + str(i))
    if i > 1:
        params = dict(SEARCH_PARAMS)
        params["page"] = str(i)
        r = requests.get(BASE_URL, params=params)
        soup = BeautifulSoup(r.text, "html.parser")
    # Each car offer link on the grid is saved into carLinks.
    for link in soup.select('#listContainer table.offers td.offer a.link'):
        href = link.get('href')
        if href:
            # Protocol-relative links ("//...") need a scheme prefix.
            carLinks.add(href.replace("//", "http://", 1))

# For each car link: load the car page and extract its features.
for carLink in carLinks:
    print("Processing car page: " + carLink)
    r = requests.get(carLink)
    soup = BeautifulSoup(r.text, "html.parser")
    km = 0
    transmission = ""
    age = 0
    price = 0
    # The price is NOT inside the details table — it sits in its own
    # .pricelabel element, so it is read separately. Keeping only the
    # digits strips the currency symbol and thousands separators.
    price_tag = soup.select_one('div.pricelabel strong')
    if price_tag:
        digits = "".join(ch for ch in price_tag.text if ch.isdigit())
        if digits:
            price = int(digits)
    # The remaining attributes live in the details table, one
    # <th> label / .value pair per row.
    for row in soup.select("table.details tr"):
        label = row.select_one('th')
        value = row.select_one('.value')
        if label is None or value is None:
            continue
        name = label.text.strip()
        if name == 'Mileage':
            digits = "".join(ch for ch in value.text if ch.isdigit())
            if digits:
                km = int(digits)
        elif name == 'Transmission':
            transmission = value.text.strip()
        elif name == 'Year':
            age = 2017 - int(value.text.strip())
    # Each car is a row of four features appended to the data_set.
    data_set.append([km, transmission, age, price])

# The data_set is saved into the CSV file. newline='' prevents blank
# rows on Windows; the context manager guarantees the file is closed
# even if a write fails.
with open('car_features.csv', 'w', newline='') as fl:
    writer = csv.writer(fl)
    writer.writerow(['km', 'transmission', 'age', 'price'])
    writer.writerows(data_set)
该网站的分页功能已严重损坏:在浏览器里不断点击"下一页",最终会陷入无限加载循环。就我的测试而言,直接在地址栏访问第 501 页会被重定向回第 500 页;而如果通过第 500 页上指向第 501 页的"下一页"链接访问,则会得到一个永不结束的加载循环。因此我们利用"请求下一页却被重定向回上一页"这一行为来终止抓取循环。
我用的是 lxml.html 搭配 cssselect;如果你更喜欢 bs4 也可以使用,逻辑完全相同,但我强烈建议使用 lxml。依赖项如上所述,另外你还需要 pip install cssselect。代码如下:
import requests
from lxml import html
from typing import Iterator
url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
def parse_data(node_: html.Element, price: str) -> dict:
    """Build a dict of attributes for one car from its details page.

    The price was already pulled from the listing page and is passed
    in; every other attribute is read from the page's "details" table,
    where each row holds a <th> description followed by a .value cell.
    """
    result = {"price": price.strip("₦ ")}
    details_table = node_.cssselect("table.details")[0]
    # cssselect returns cells in document order, alternating
    # description / value, so pairing even-indexed with odd-indexed
    # cells reconstructs each row.
    cells = details_table.cssselect("tr th, .value")
    for heading, value_cell in zip(cells[::2], cells[1::2]):
        # Lower-case and join the description: "Type of car" -> type_of_car.
        key = "_".join(heading.text.lower().split())
        result[key] = "".join(value_cell.xpath(".//text()")).strip()
    return result
def get_link_and_price(s: requests.Session, node_: html.Element) -> Iterator[dict]:
    """Yield a parsed details dict for every offer on a listing page."""
    for offer in node_.cssselect("table.offers td.offer"):
        link = offer.cssselect("a.link")[0].get("href")
        price = offer.cssselect(".price strong")[0].text
        details_page = html.fromstring(s.get(link).content)
        yield parse_data(details_page, price)
def start_request(url: str):
    """Generator: walk every listing page, yielding one dict per car.

    Uses a single requests.Session for connection reuse across all
    listing and details pages.
    """
    with requests.Session() as s:
        get_ = s.get(url)
        node = html.fromstring(get_.content)
        # Yield from subsequent iterators, i.e. a dict of details per car
        # on the first (page-number-less) listing page.
        yield from get_link_and_price(s, node)
        # The site is broken: you click the next page button and
        # eventually get stuck in a loading loop. At some stage the
        # "next" link should disappear (or only go back) but it is
        # wrongly implemented. This loop stops when requesting the
        # next page redirects back to the current url,
        # i.e. ?page=501 -> ?page=500.
        current_url = get_.url
        # The last a.pageNextPrev anchor on the page is the "next" link.
        next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
        get_next = s.get(next_page)
        node = html.fromstring(get_next.content)
        # Keep going through pages till the break condition is met:
        # get_next.url is the *final* url after redirects, so a
        # redirect back to current_url terminates the loop.
        while current_url != get_next.url:
            node = html.fromstring(get_next.content)
            yield from get_link_and_price(s, node)
            current_url = get_next.url
            next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
            get_next = s.get(next_page)
# Stream and print each car's details dict as it is scraped.
for dict_ in start_request(url):
    print(dict_)
输出片段:
{'price': '3,300,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '65000'}
{'price': '3,000,000', 'offer_from': 'Individual', 'year': '2007', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '86500'}
{'price': '4,200,000', 'offer_from': 'Individual', 'year': '2013', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '39011'}
{'price': '4,500,000', 'offer_from': 'Business', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '93000'}
{'price': '890,000', 'offer_from': 'Business', 'year': '2001', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '110000'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2005', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,500,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,150,000', 'offer_from': 'Individual', 'year': '2002', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '167'}
{'price': '2,200,000', 'offer_from': 'Individual', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '24689'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2004', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '92,000'}
我正在为手头的项目尝试抓取这个页面。我想获取所有页面上每辆车的详细信息(价格、里程、变速箱和车龄)。下面的代码存在两个问题:
- 第一页的链接与其他页不同(第一页的 URL 中没有页码参数 &page=1)
- 汽车价格不在详情表格(table)中,需要点击进入每个广告页面后才能获取。
我想知道是否有人愿意帮我看看并给出建议,谢谢。
from bs4 import BeautifulSoup
import requests
import urllib.parse
import csv

# Scrape price, mileage, transmission and age for every Toyota Camry
# offer on olx.com.ng and save the rows to car_features.csv.
# The Toyota Camry model listing page is used as the entry point.
BASE_URL = "https://www.olx.com.ng/vehicles/cars/toyota/"
SEARCH_PARAMS = {"search[filter_enum_model][0]": "toyota/camry"}

r = requests.get(BASE_URL, params=SEARCH_PARAMS)
soup = BeautifulSoup(r.text, "html.parser")

carLinks = set()
data_set = []

# Derive the number of result pages from the pagination links.
# parse_qs returns a *list* of values per key, so the page number is
# element [0] of that list (the original code indexed [1], which
# raises IndexError for a single-valued parameter).
nbPage = 1
for a in soup.select('a'):
    href = a.get('href') or ""
    query = urllib.parse.urlparse(href).query
    pages = urllib.parse.parse_qs(query).get('page')
    if pages:
        nbPage = max(nbPage, int(pages[0]))
print("There are " + str(nbPage) + " web pages to process")

# For each web page that contains a grid of car offers.
# Page 1 is the base URL itself (it carries no &page=1 parameter), so
# the page we already fetched above is harvested first and the page
# parameter is only appended from page 2 onward.
for i in range(1, nbPage + 1):
    print("Processing web page: " + str(i))
    if i > 1:
        params = dict(SEARCH_PARAMS)
        params["page"] = str(i)
        r = requests.get(BASE_URL, params=params)
        soup = BeautifulSoup(r.text, "html.parser")
    # Each car offer link on the grid is saved into carLinks.
    for link in soup.select('#listContainer table.offers td.offer a.link'):
        href = link.get('href')
        if href:
            # Protocol-relative links ("//...") need a scheme prefix.
            carLinks.add(href.replace("//", "http://", 1))

# For each car link: load the car page and extract its features.
for carLink in carLinks:
    print("Processing car page: " + carLink)
    r = requests.get(carLink)
    soup = BeautifulSoup(r.text, "html.parser")
    km = 0
    transmission = ""
    age = 0
    price = 0
    # The price is NOT inside the details table — it sits in its own
    # .pricelabel element, so it is read separately. Keeping only the
    # digits strips the currency symbol and thousands separators.
    price_tag = soup.select_one('div.pricelabel strong')
    if price_tag:
        digits = "".join(ch for ch in price_tag.text if ch.isdigit())
        if digits:
            price = int(digits)
    # The remaining attributes live in the details table, one
    # <th> label / .value pair per row.
    for row in soup.select("table.details tr"):
        label = row.select_one('th')
        value = row.select_one('.value')
        if label is None or value is None:
            continue
        name = label.text.strip()
        if name == 'Mileage':
            digits = "".join(ch for ch in value.text if ch.isdigit())
            if digits:
                km = int(digits)
        elif name == 'Transmission':
            transmission = value.text.strip()
        elif name == 'Year':
            age = 2017 - int(value.text.strip())
    # Each car is a row of four features appended to the data_set.
    data_set.append([km, transmission, age, price])

# The data_set is saved into the CSV file. newline='' prevents blank
# rows on Windows; the context manager guarantees the file is closed
# even if a write fails.
with open('car_features.csv', 'w', newline='') as fl:
    writer = csv.writer(fl)
    writer.writerow(['km', 'transmission', 'age', 'price'])
    writer.writerows(data_set)
该网站的分页功能已严重损坏:在浏览器里不断点击"下一页",最终会陷入无限加载循环。就我的测试而言,直接在地址栏访问第 501 页会被重定向回第 500 页;而如果通过第 500 页上指向第 501 页的"下一页"链接访问,则会得到一个永不结束的加载循环。因此我们利用"请求下一页却被重定向回上一页"这一行为来终止抓取循环。
我用的是 lxml.html 搭配 cssselect;如果你更喜欢 bs4 也可以使用,逻辑完全相同,但我强烈建议使用 lxml。依赖项如上所述,另外你还需要 pip install cssselect。代码如下:
import requests
from lxml import html
from typing import Iterator
url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
def parse_data(node_: html.Element, price: str) -> dict:
    """Build a dict of attributes for one car from its details page.

    The price was already pulled from the listing page and is passed
    in; every other attribute is read from the page's "details" table,
    where each row holds a <th> description followed by a .value cell.
    """
    result = {"price": price.strip("₦ ")}
    details_table = node_.cssselect("table.details")[0]
    # cssselect returns cells in document order, alternating
    # description / value, so pairing even-indexed with odd-indexed
    # cells reconstructs each row.
    cells = details_table.cssselect("tr th, .value")
    for heading, value_cell in zip(cells[::2], cells[1::2]):
        # Lower-case and join the description: "Type of car" -> type_of_car.
        key = "_".join(heading.text.lower().split())
        result[key] = "".join(value_cell.xpath(".//text()")).strip()
    return result
def get_link_and_price(s: requests.Session, node_: html.Element) -> Iterator[dict]:
    """Yield a parsed details dict for every offer on a listing page."""
    for offer in node_.cssselect("table.offers td.offer"):
        link = offer.cssselect("a.link")[0].get("href")
        price = offer.cssselect(".price strong")[0].text
        details_page = html.fromstring(s.get(link).content)
        yield parse_data(details_page, price)
def start_request(url: str):
    """Generator: walk every listing page, yielding one dict per car.

    Uses a single requests.Session for connection reuse across all
    listing and details pages.
    """
    with requests.Session() as s:
        get_ = s.get(url)
        node = html.fromstring(get_.content)
        # Yield from subsequent iterators, i.e. a dict of details per car
        # on the first (page-number-less) listing page.
        yield from get_link_and_price(s, node)
        # The site is broken: you click the next page button and
        # eventually get stuck in a loading loop. At some stage the
        # "next" link should disappear (or only go back) but it is
        # wrongly implemented. This loop stops when requesting the
        # next page redirects back to the current url,
        # i.e. ?page=501 -> ?page=500.
        current_url = get_.url
        # The last a.pageNextPrev anchor on the page is the "next" link.
        next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
        get_next = s.get(next_page)
        node = html.fromstring(get_next.content)
        # Keep going through pages till the break condition is met:
        # get_next.url is the *final* url after redirects, so a
        # redirect back to current_url terminates the loop.
        while current_url != get_next.url:
            node = html.fromstring(get_next.content)
            yield from get_link_and_price(s, node)
            current_url = get_next.url
            next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
            get_next = s.get(next_page)
# Stream and print each car's details dict as it is scraped.
for dict_ in start_request(url):
    print(dict_)
输出片段:
{'price': '3,300,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '65000'}
{'price': '3,000,000', 'offer_from': 'Individual', 'year': '2007', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '86500'}
{'price': '4,200,000', 'offer_from': 'Individual', 'year': '2013', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '39011'}
{'price': '4,500,000', 'offer_from': 'Business', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '93000'}
{'price': '890,000', 'offer_from': 'Business', 'year': '2001', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '110000'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2005', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,500,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,150,000', 'offer_from': 'Individual', 'year': '2002', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '167'}
{'price': '2,200,000', 'offer_from': 'Individual', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '24689'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2004', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '92,000'}