How to break the crawl when it is on the last page (requests, python)?

I built a crawler with requests, and I want it to stop once it reaches the last page. Where should I put the break statement so the loop terminates on the last page? Right now it runs, but it does not stop at the last page. I have attached the program below. Any help would be appreciated.

import requests
from lxml import html
from time import sleep
import csv

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

proxies = {
    'http': 'http://95.167.116.116:8080',
    'https': 'http://88.157.149.250:8080',
}
page_counter = 1
links = []
while True:
    try:
        url = "https://www.amazon.com/s/ref=sr_pg_{0}?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011%2Cn%3A11444072011%2Cn%3A11444086011%2Cn%3A2632268011&page={0}&bbn=11444086011&ie=UTF8&qid=1517650207".format(page_counter)
        response = requests.get(url, headers=headers, proxies=proxies, stream=True)
        if response.status_code == 200:
            source = html.fromstring(response.content)
            links.extend(source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
            page_counter += 1
        else:
            break
    except:
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        sleep(5)
        print("Current page ", page_counter)
        print("Was a nice sleep, now let me continue...")

csvfile = "products.csv"

# Assuming links is a flat list
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in links:
        writer.writerow([val])

Take this snippet as a starting point and extend it with your own custom functions:

from time import sleep
from urllib.parse import urljoin

import requests
from lxml import html

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

proxies = {
    'http': 'http://95.167.116.116:8080',
    'https': 'http://88.157.149.250:8080',
}

links = []
url = 'https://www.amazon.com/s/ref=sr_pg_1?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011%2Cn%3A11444072011%2Cn%3A11444086011%2Cn%3A2632268011&bbn=11444086011&ie=UTF8&qid=1517831374'

while True:
    try:
        print('Fetching url [%s]...' % url)
        response = requests.get(url, headers=headers, proxies=proxies, stream=True)
        if response.status_code == 200:
            source = html.fromstring(response.content)
            links.extend(source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
            try:
                # Amazon renders a "Next" link with id="pagnNextLink" on every
                # results page except the last one, so its absence marks the end.
                next_url = source.xpath('//*[@id="pagnNextLink"]/@href')[0]
                url = urljoin('https://www.amazon.com', next_url)
            except IndexError:
                # No next link found: this was the last page, so stop crawling.
                break
    except Exception:
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        sleep(5)
        print("Was a nice sleep, now let me continue...")

print(links)

In effect, it scrapes the current page for the link to the next page. If the next page's URL can be found, the crawl continues; if it cannot, the while loop breaks and the collected links list is printed.
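
If you want to pull that check into one of those custom functions, here is a minimal sketch. The helper name get_next_url is my own addition for illustration, not part of the answer above; it uses the same XPath and urljoin call as the loop:

from urllib.parse import urljoin


def get_next_url(tree):
    """Return the absolute URL of the next results page, or None on the last page."""
    # Same XPath as in the loop above: Amazon's "Next" pagination link
    # (id="pagnNextLink") is present on every results page except the last.
    hrefs = tree.xpath('//*[@id="pagnNextLink"]/@href')
    if not hrefs:
        return None
    return urljoin('https://www.amazon.com', hrefs[0])


# Inside the while loop, the break condition then reads:
#
#     next_url = get_next_url(source)
#     if next_url is None:
#         break          # last page reached, leave the loop
#     url = next_url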

Hope this helps.