Scrapy-Selenium Pagination
Can anyone help me? I'm practicing and I can't figure out what I'm doing wrong with pagination! It only returns the first page, and sometimes an error shows up. When it does work, it still only returns the first page.
"The source list for Content Security Policy directive 'frame-src' contains an invalid source '*trackcmp.net'. It will be ignored." Source: https://naturaldaterra.com.br/hortifruti.html?page=2
import scrapy
from scrapy_selenium import SeleniumRequest


class ComputerdealsSpider(scrapy.Spider):
    name = 'produtos'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            wait_time=3,
            callback=self.parse
        )

    def parse(self, response):
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()
        if next_page:
            absolute_url = f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}"
            yield SeleniumRequest(
                url=absolute_url,
                wait_time=3,
                callback=self.parse
            )
The problem is that your XPath selector returns None instead of the next page number. Consider changing it from

next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()

to

next_page = response.xpath("//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
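To see why, here is a minimal sketch using parsel (the selector library Scrapy uses internally). The HTML below is an assumption about the site's pagination markup, not a verbatim copy of it; the extra "tile-btn" class on the inactive buttons is hypothetical and only there to illustrate the exact-match pitfall.

from parsel import Selector

# Assumed pagination markup (hypothetical): the active page's button carries
# both classes, the inactive buttons carry extra classes of their own.
html = """
<div>
  <button class="tile-root-1uO tile-btn">1</button>
  <button class="tile-root_active-TUl tile-root-1uO">2</button>
  <button class="tile-root-1uO tile-btn">3</button>
</div>
"""
sel = Selector(text=html)

# @class='...' tests the whole class attribute for exact equality, so any
# extra class makes the original selector match nothing and return None.
print(sel.xpath("//button[@class='tile-root-1uO'][1]/text()").get())  # None

# Anchoring on the active page's button and stepping to its next sibling
# yields the following page number regardless of the siblings' classes.
print(sel.xpath(
    "//button[@class='tile-root_active-TUl tile-root-1uO']"
    "/following-sibling::button[1]/text()").get())  # '3'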
For your future projects, consider using scrapy-playwright to scrape JS-rendered websites. It is faster and simpler to use. Here is a sample implementation of your scraper using scrapy-playwright:
import scrapy
from scrapy.crawler import CrawlerProcess


class ComputerdealsSpider(scrapy.Spider):
    name = 'produtos'

    def start_requests(self):
        yield scrapy.Request(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            meta={"playwright": True}
        )

    def parse(self, response):
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        # scrape next page; stop once there is no button after the active one
        next_page = response.xpath(
            "//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
        if next_page:
            yield scrapy.Request(
                url='https://naturaldaterra.com.br/hortifruti.html?page=' + next_page,
                meta={"playwright": True}
            )


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    })
    process.crawl(ComputerdealsSpider)
    process.start()
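Note that besides installing the package (pip install scrapy-playwright) you also need to download the browser binaries once with playwright install, and the asyncio-based Twisted reactor shown in the settings above is required for scrapy-playwright to work.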