How to break the crawl when it is on the last page (requests, python)?
I made a scraper with requests and I want it to stop when it reaches the last page. Where should I put the break statement so that the loop stops on the last page? Right now it runs, but it never stops at the last page. The program is attached below. I would appreciate any help.
import requests
from lxml import html
from time import sleep
import csv

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

proxies = {
    'http': 'http://95.167.116.116:8080',
    'https': 'http://88.157.149.250:8080',
}

page_counter = 1
links = []

while True:
    try:
        url = "https://www.amazon.com/s/ref=sr_pg_{0}?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011%2Cn%3A11444072011%2Cn%3A11444086011%2Cn%3A2632268011&page={0}&bbn=11444086011&ie=UTF8&qid=1517650207".format(
            page_counter)
        response = requests.get(url, headers=headers, proxies=proxies, stream=True)
        if response.status_code == 200:
            source = html.fromstring(response.content)
            links.extend(source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
            page_counter += 1
        else:
            break
    except:
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        sleep(5)
        print("Current page ", page_counter)
        print("Was a nice sleep, now let me continue...")

csvfile = "products.csv"

# Assuming res is a flat list
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in links:
        writer.writerow([val])
Take this snippet as a starting point and extend it with your custom functions:
from time import sleep
from urllib.parse import urljoin

import requests
from lxml import html

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

proxies = {
    'http': 'http://95.167.116.116:8080',
    'https': 'http://88.157.149.250:8080',
}

links = []
url = 'https://www.amazon.com/s/ref=sr_pg_1?fst=as%3Aoff&rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011%2Cn%3A11444072011%2Cn%3A11444086011%2Cn%3A2632268011&bbn=11444086011&ie=UTF8&qid=1517831374'

while True:
    try:
        print('Fetching url [%s]...' % url)
        response = requests.get(url, headers=headers, stream=True)
        if response.status_code == 200:
            source = html.fromstring(response.content)
            links.extend(source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
            try:
                next_url = source.xpath('//*[@id="pagnNextLink"]/@href')[0]
                url = urljoin('https://www.amazon.com', next_url)
            except IndexError:
                break
    except Exception:
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        sleep(5)
        print("Was a nice sleep, now let me continue...")

print(links)
Essentially, it scrapes the current page for the link to the next page. If the next-page URL can be found, the crawl continues; if it cannot, the while loop is broken and the collected links list is printed.
Hope this helps.