当第一页 link 与其他页不同时，使用 BeautifulSoup 抓取多个网页

Question

我正在尝试抓取这个 page！对于我正在从事的项目。我想获取所有页面上每辆车的详细信息（价格、里程、变速箱和车龄）。我在下面的代码中遇到的问题是：

第一页link与其他页不同（无页码1<&page=1>）
汽车价格不在table点击每个广告后获取详情。

我想知道是否有人愿意帮我调查一下并提供建议。谢谢

    from bs4 import BeautifulSoup
    import requests
    import urllib.parse
    import csv

    # the Toyota Camry model page is used
    url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"
    r = requests.get(url)
    data = r.text

    soup = BeautifulSoup(data, "html.parser")
    carLinks = set()
    pageLinks = set()
    data_set = []

    parsed = urllib.parse.urlparse(soup.select('a')[0].get('href'))
    nbPage = urllib.parse.parse_qs(parsed.query)['page'][1]
    print("There are " + str(nbPage) + " web pages to process")

    # for each web page that contains a grid of car offers
    for i in range(1, int(nbPage), 1):

    print("Processing web page: " + str(i))

    # each car offer link is saved into the carLinks
    for link in soup.select('#listContainer > div > section > div > tr > a'):
    carLinks.add(link.get('href').replace("//", "http://"))

    # the next url page is set
    url = "https://www.olx.com.ng/vehicles/cars/toyota/?
    search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry&page= + str(i) + "
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")

    #for each car link
    for carLink in carLinks:

    print("Processing car page: " + carLink)

    # we load the car page
    r = requests.get(carLink)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    km = 0
    transmission = ""
    age = 0
    price = 0

    # for each attribute of the car
    for info in soup.select("table.item tr div.pricelabel"):

    # we keep the ones that we need
    if info.select('.item')[0].text == u'Mileage':
        km = int(info.select('.value')[0].text.replace(" ", "").replace("KM", ""))
    if info.select('.item')[0].text == u'Transmission':
        transmission = info.select('.value')[0].text
    if info.select('.item')[0].text == u'Year':
        age = 2017 - int(info.select('.value')[0].text)
    if info.select('.pricelabel')[0].text == u'Price':
        price = int(info.select('.pricelabel')[0].text.replace(" ", "").replace(u"₦", ""))

    # each car is an array of four features added to the data_set
    data_set.append([km, transmission, age, price])

    # the data_set is save into the CSV file
    fl = open('car_features.csv', 'w')
    writer = csv.writer(fl)
    writer.writerow(['km', 'transmission', 'age', 'price'])
    for values in data_set:
    writer.writerow(values)

    fl.close()

Answer 1

网站严重损坏，如果你继续转到下一页，你最终会在浏览器上遇到加载循环，对我来说，如果我粘贴，页面 501 会返回到 500直接使用它，如果我在可以看到 501 的地方使用 500 中的下一个，我会得到一个永不结束的加载循环，我们使用重定向回到上一个来终止我们的循环。

我也用过lxml.html with cssselect, you can use bs4 if preferred, the logic is the same, but I would highly encourage using lxml, the deps are outlined here，你还需要pip install cssselect:

import requests
from lxml import html
from typing import Iterator

url = "https://www.olx.com.ng/vehicles/cars/toyota/?search%5Bfilter_enum_model%5D%5B0%5D=toyota%2Fcamry"


def parse_data(node_: html.Element, price: str) -> dict:
    """Parses the details section per individual car details page and returns a dict"""

    # Price we pulled from main page.
    details = {"price": price.strip("₦ ")}

    # Details are in a table.
    details_table = node_.cssselect("table.details")[0]

    # The th has the description, the anchor has the value.
    # we lower case and join the description, i.e "Type of car" -> type_of_car.
    data = iter(details_table.cssselect("tr th, .value"))
    details.update(("_".join(th.text.lower().split()), "".join(td.xpath(".//text()")).strip())
                   for th, td in (zip(data, data)))
    return details


def get_link_and_price(s: requests.Session, node_: html.Element) -> Iterator[dict]:
    """Gets the link and the associated price from each tr."""

    for child in node_.cssselect("table.offers td.offer"):
        link, price = child.cssselect("a.link")[0].get("href"), child.cssselect(".price strong")[0].text

        yield (parse_data(html.fromstring(s.get(link).content), price))


def start_request(url: str):
    with requests.Session() as s:
        get_ = s.get(url)
        node = html.fromstring(get_.content)

        # yield from subsequent iterators, i.e a dict of details.
        yield from get_link_and_price(s, node)

        # The site is broken, you click the next page button,
        #  and eventually you get stuck in a loading loop.
        # At some stage the next should disappear,
        # or only go back but it is wrongly implemented.
        # This will stop when we try a next page,
        #  and end up back at the current url i.e ?page=501 -> ?page=500.
        current_url = get_.url
        next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
        get_next = s.get(next_page)
        node = html.fromstring(get_next.content)

        # Keep going through pages till our break condition is met.
        while current_url != get_next.url:
            node = html.fromstring(get_next.content)
            yield from get_link_and_price(s, node)
            current_url = get_next.url
            next_page = node.cssselect("a.pageNextPrev")[-1].get("href")
            get_next = s.get(next_page)

for dict_ in start_request(url):
    print(dict_)

输出片段：

{'price': '3,300,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '65000'}
{'price': '3,000,000', 'offer_from': 'Individual', 'year': '2007', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '86500'}
{'price': '4,200,000', 'offer_from': 'Individual', 'year': '2013', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '39011'}
{'price': '4,500,000', 'offer_from': 'Business', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '93000'}
{'price': '890,000', 'offer_from': 'Business', 'year': '2001', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '110000'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2005', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,500,000', 'offer_from': 'Individual', 'year': '2008', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': 'Low'}
{'price': '1,150,000', 'offer_from': 'Individual', 'year': '2002', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '167'}
{'price': '2,200,000', 'offer_from': 'Individual', 'year': '2010', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '24689'}
{'price': '1,050,000', 'offer_from': 'Individual', 'year': '2004', 'transmission': 'Automatic', 'model': 'Camry', 'type_of_car': '4 door', 'mileage': '92,000'}

当第一页 link 与其他页不同时，使用 BeautifulSoup 抓取多个网页

Scraping multiple web pages with BeautifulSoup when the first page link is different from others

python

web-scraping

beautifulsoup

web-crawler

urllib