How to paginate and parse multiple pages concurrently in Scrapy
I am trying to scrape multiple pages from a real-estate website. I have successfully scraped the first page of the URL, but I cannot get pagination to work. My approach is to find the tag whose class contains 'red' (the current page) and select its next sibling; I believe this fetches the response for the next page and then keeps doing so over and over. I have read examples where people wrote code that parses multiple pages at the same time.
Is parallel/concurrent parsing possible? I want to parse all 90 pages as fast as possible, but I don't know how to implement it. Any and all help is greatly appreciated. Thank you.
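(Background on "concurrent" in Scrapy, since it is the crux of the question: Scrapy is built on Twisted and downloads every scheduled request asynchronously, so concurrency is the default behavior; the degree of parallelism is governed by settings. A minimal sketch of raising those settings per spider; the values here are illustrative, not recommendations:)

import scrapy

class UneguiApartments(scrapy.Spider):
    name = "unegui_apts"
    custom_settings = {
        # Scrapy defaults: 16 requests in flight overall, 8 per domain
        "CONCURRENT_REQUESTS": 32,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 16,
        # a small delay keeps the load on the target site reasonable
        "DOWNLOAD_DELAY": 0.25,
    }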
Progress update 1:
I figured out why my CSV printed UTF-8 (Cyrillic characters) correctly in the PyCharm IDE but showed ?? placeholders when I opened it in Excel. I was able to work around the issue by importing the CSV file through Excel's Data > From Text/CSV.
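(An alternative to the manual import, assuming Scrapy >= 2.1: the feed itself can declare an encoding, and 'utf-8-sig' writes the byte-order mark that Excel uses to detect UTF-8, so the file opens with correct Cyrillic text directly:)

custom_settings = {
    "FEEDS": {
        f'{file_name}.csv': {
            'format': 'csv',
            # the BOM written by utf-8-sig lets Excel auto-detect UTF-8
            'encoding': 'utf-8-sig',
        }
    }
}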
Progress update 2: I know I could put a for loop in my start_requests function and loop over pages (1, 90) or even (1, 120), but that is not what I want, since I assumed it would make my code parse page by page rather than concurrently.
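(For what it's worth, such a loop would not actually be sequential: every Request yielded from start_requests goes into Scrapy's scheduler, and the downloader fetches up to CONCURRENT_REQUESTS of them in parallel. A sketch, assuming the cities=1&page=N URL pattern visible in the HTML snippet below:)

def start_requests(self):
    base = 'https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/'
    yield Request(base + '?cities=1', self.parse)  # page 1 carries no page parameter
    for page in range(2, 91):
        # all 89 remaining requests are queued at once and downloaded in parallel
        yield Request(f'{base}?cities=1&page={page}', self.parse)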
HTML snippet:
<ul class="number-list">
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1" class="page-number js-page-filter red" data-page="1">1</a>
  </li>
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=2" class="page-number js-page-filter " data-page="2">2</a>
  </li>
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=3" class="page-number js-page-filter " data-page="3">3</a>
  </li>
  <li><span class="page-number">...</span></li>
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=89" class="page-number js-page-filter " data-page="89">89</a>
  </li>
  <li>
    <a href="/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/?cities=1&page=90" class="page-number js-page-filter " data-page="90">90</a>
  </li>
  <div class="clear"></div>
</ul>
Pagination snippet:
# handling pagination
next_page = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
if next_page:
    yield response.follow(next_page, callback=self.parse)
Full code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import unicodecsv as csv
from datetime import datetime
from scrapy.crawler import CrawlerProcess

dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'

# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {
        "FEEDS": {f'{file_name}.csv': {'format': 'csv'}}
    }

    def start_requests(self):
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        cards = response.xpath("//li[contains(@class,'announcement-container')]")
        # parse details
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
            rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
            date = date_block[0].strip()
            city = date_block[1].strip()
            item = {
                'name': name,
                'date': date,
                'rooms': rooms,
                'price': price,
                'city': city,
            }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})

    def parse_item(self, response):
        # retrieve previously scraped item between callbacks
        item = response.meta['item']
        # parse additional details
        list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
        list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()
        # get additional details from list of <span> tags, element by element
        floor_type = list_span[0].strip()
        num_balcony = list_span[1].strip()
        garage = list_span[2].strip()
        window_type = list_span[3].strip()
        door_type = list_span[4].strip()
        num_window = list_span[5].strip()
        # get additional details from list of <a> tags, element by element
        commission_year = list_a[0].strip()
        num_floors = list_a[1].strip()
        area_sqm = list_a[2].strip()
        floor = list_a[3].strip()
        leasing = list_a[4].strip()
        district = list_a[5].strip()
        address = list_a[6].strip()
        # update item with newly parsed data
        item.update({
            'district': district,
            'address': address,
            'area_sqm': area_sqm,
            'floor': floor,
            'commission_year': commission_year,
            'num_floors': num_floors,
            'num_windows': num_window,
            'num_balcony': num_balcony,
            'floor_type': floor_type,
            'window_type': window_type,
            'door_type': door_type,
            'garage': garage,
            'leasing': leasing,
        })
        yield item
        # handling pagination
        next_page = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()
If I understand you correctly, you need to move the 'next page' handling into the parse function. I also just grab the 'next page' button's href and follow it.
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import unicodecsv as csv
from datetime import datetime
from scrapy.crawler import CrawlerProcess

dt_today = datetime.now().strftime('%Y%m%d')
file_name = dt_today + ' HPI Data'

# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = "unegui_apts"
    allowed_domains = ["www.unegui.mn"]
    custom_settings = {
        "FEEDS": {f'{file_name}.csv': {'format': 'csv'}}
    }

    def start_requests(self):
        urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/ulan-bator/']
        for url in urls:
            yield Request(url, self.parse)

    def parse(self, response, **kwargs):
        cards = response.xpath("//li[contains(@class,'announcement-container')]")
        # parse details
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
            rooms = card.xpath(".//div[contains(@class,'announcement-block__breadcrumbs')]/text()").extract_first().split('»')[0].strip()
            link = card.xpath(".//a[@itemprop='url']/@href").extract_first()
            date_block = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first().split(',')
            date = date_block[0].strip()
            city = date_block[1].strip()
            item = {
                'name': name,
                'date': date,
                'rooms': rooms,
                'price': price,
                'city': city,
            }
            # follow absolute link to scrape deeper level
            yield response.follow(link, callback=self.parse_item, meta={'item': item})

        # handling pagination
        next_page = response.xpath('//a[contains(@class, "number-list-next js-page-filter number-list-line")]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_item(self, response):
        # retrieve previously scraped item between callbacks
        item = response.meta['item']
        # parse additional details
        list_span = response.xpath(".//span[contains(@class,'value-chars')]//text()").extract()
        list_a = response.xpath(".//a[contains(@class, 'value-chars')]//text()").extract()
        # get additional details from list of <span> tags, element by element
        floor_type = list_span[0].strip()
        num_balcony = list_span[1].strip()
        garage = list_span[2].strip()
        window_type = list_span[3].strip()
        door_type = list_span[4].strip()
        num_window = list_span[5].strip()
        # get additional details from list of <a> tags, element by element
        commission_year = list_a[0].strip()
        num_floors = list_a[1].strip()
        area_sqm = list_a[2].strip()
        floor = list_a[3].strip()
        leasing = list_a[4].strip()
        district = list_a[5].strip()
        address = list_a[6].strip()
        # update item with newly parsed data
        item.update({
            'district': district,
            'address': address,
            'area_sqm': area_sqm,
            'floor': floor,
            'commission_year': commission_year,
            'num_floors': num_floors,
            'num_windows': num_window,
            'num_balcony': num_balcony,
            'floor_type': floor_type,
            'window_type': window_type,
            'door_type': door_type,
            'garage': garage,
            'leasing': leasing,
        })
        yield item

# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()
This should work.
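One more note on the concurrency part of the question: instead of following a single next-page link at a time, you could yield every link from the number list in one pass; Scrapy's default dupefilter (RFPDupeFilter) drops URLs it has already requested, so re-extracting the list on every page is harmless. A sketch against the pagination markup shown above:

# inside parse(), after the per-card yields
for href in response.xpath("//ul[@class='number-list']//a/@href").extract():
    # already-requested page URLs are filtered out automatically
    yield response.follow(href, callback=self.parse)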