How to make Pagination using scrapy-splash

Goal

I want to scrape https://www.livecoinwatch.com using scrapy + splash (I don't want to use selenium), but I don't know how to implement pagination. I can only scrape the first page.

Here is my spider code:

import scrapy

from scrapy_splash import SplashRequest
from coins.items import CoinsItem


class CoinsSpiderSpider(scrapy.Spider):
    name = 'coins_spider'
    allowed_domains = ['livecoinwatch.com']
    start_urls = ['https://www.livecoinwatch.com']

    Pages = 3

    lua_script = '''
        function main(splash, args)
            splash.private_mode_enabled = false
            url = args.url

            headers = {
                ['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'
            }
            splash:set_custom_headers(headers)
            assert(splash:go(url))
            assert(splash:wait(1))
            assert(splash:wait(5))
            splash:set_viewport_full()

            return splash:html()
        end
    '''


    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, endpoint='execute', args={
                'lua_source': self.lua_script
            })

    def parse(self, response):
        # 50 results in the first page
        rows = response.xpath('//tr[@class="table-row filter-row"]')
        for row in rows:
            item = CoinsItem()

            item['coin'] = row.xpath('./td[2]//div[@class="item-name ml10"]/div/text()').extract_first()
            item['price'] = row.xpath('./td[3]').extract_first()
            item['marketCap'] = row.xpath('./td[4]/text()').extract_first()
            item['volumn24h'] = row.xpath('./td[5]/text()').extract_first()
            item['Liquidity'] = row.xpath('./td[6]/text()').extract_first()
            item['allTimeHigh'] = row.xpath('./td[7]/text()').extract_first()
            item['hour1_value'] = row.xpath('./td[8]/span/text()').extract_first()
            item['hour1_class'] = row.xpath('./td[8]/@class').extract_first()
            item['hour24_value'] = row.xpath('./td[9]/span/text()').extract_first()
            item['hour24_class'] = row.xpath('./td[9]/@class').extract_first()

            yield item

        # next page
        # do not know how to code!!!

It's easier to scrape this site by requesting the API directly than by using scrapy_splash. Inspect the XHR requests made when you click the page navigation at the bottom and you'll notice a request to https://http-api.livecoinwatch.com/coins?offset=50&limit=50&sort=rank&order=ascending&currency=USD, which returns a JSON response. Adjust the offset and limit parameters to control how much data is returned per request.
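To get a feel for the payload before writing the spider, you can fetch one page of that endpoint by hand. A minimal sketch using the requests library, assuming the endpoint accepts plain unauthenticated GET requests (the 'code' and 'price' keys match the ones used in the spider below):

import requests

# Same endpoint the page's XHR calls hit; ask for the first 5 coins.
# A browser-like User-Agent is included in case the default one is blocked.
url = ('https://http-api.livecoinwatch.com/coins'
       '?offset=0&limit=5&sort=rank&order=ascending&currency=USD')
headers = {'User-Agent': 'Mozilla/5.0'}
data = requests.get(url, headers=headers).json()

# The coins live under the 'data' key; print a couple of fields.
for coin in data['data']:
    print(coin.get('code'), coin.get('price'))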

See the example implementation below:

import scrapy
from coins.items import CoinsItem

class CoinsSpiderSpider(scrapy.Spider):
    name = 'coins_spider'
    allowed_domains = ['livecoinwatch.com']
    # start from the first item and fetch 500 items per request; adjust to suit your needs
    offset = 0
    limit = 500
    start_urls = [f'https://http-api.livecoinwatch.com/coins?offset={offset}&limit={limit}&sort=rank&order=ascending&currency=USD'] 

    def parse(self, response):
        data = response.json()

        for coin in data['data']:
            item = CoinsItem()
            item['coin'] = coin.get('code')
            item['price'] = coin.get('price')
            item['marketCap'] = coin.get('cap')
            item['volumn24h'] = coin.get('volume')
            # ... check the json response and add the other fields you need

            yield item

        # yield the next request, but stop once the API returns fewer
        # results than requested, i.e. we've reached the last page
        if len(data['data']) == self.limit:
            self.offset += self.limit
            next_url = f'https://http-api.livecoinwatch.com/coins?offset={self.offset}&limit={self.limit}&sort=rank&order=ascending&currency=USD'
            yield scrapy.Request(next_url, callback=self.parse)
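Both spiders assume a CoinsItem with matching fields in coins/items.py. If you don't have it yet, a minimal sketch covering the fields used above (the volumn24h spelling is taken from the question's spider; anything beyond these fields is up to you):

import scrapy

class CoinsItem(scrapy.Item):
    # Fields referenced by the spiders above; add more as needed.
    coin = scrapy.Field()
    price = scrapy.Field()
    marketCap = scrapy.Field()
    volumn24h = scrapy.Field()
    Liquidity = scrapy.Field()
    allTimeHigh = scrapy.Field()
    hour1_value = scrapy.Field()
    hour1_class = scrapy.Field()
    hour24_value = scrapy.Field()
    hour24_class = scrapy.Field()

Then run the spider as usual, e.g. scrapy crawl coins_spider -o coins.json.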