How to make pagination using scrapy-splash
Goal
I want to scrape https://www.livecoinwatch.com with scrapy + splash (I don't want to use selenium).
But I don't know how to do the pagination; I can only scrape the first page.
- I would like to know how to do pagination in Splash (Lua).
- Is it possible?
- The URL does not change when the next-page button is clicked.
Here is my spider code:
import scrapy
from scrapy_splash import SplashRequest
from coins.items import CoinsItem

class CoinsSpiderSpider(scrapy.Spider):
    name = 'coins_spider'
    allowed_domains = ['livecoinwatch.com']
    start_urls = ['https://www.livecoinwatch.com']
    Pages = 3

    lua_script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        url = args.url
        headers = {
            ['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'
        }
        splash:set_custom_headers(headers)
        assert(splash:go(url))
        assert(splash:wait(1))
        assert(splash:wait(5))
        splash:set_viewport_full()
        return splash:html()
    end
    '''

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, endpoint='execute', args={
                'lua_source': self.lua_script
            })

    def parse(self, response):
        # 50 results on the first page
        rows = response.xpath('//tr[@class="table-row filter-row"]')
        for row in rows:
            item = CoinsItem()
            item['coin'] = row.xpath('./td[2]//div[@class="item-name ml10"]/div/text()').extract_first()
            item['price'] = row.xpath('./td[3]').extract_first()
            item['marketCap'] = row.xpath('./td[4]/text()').extract_first()
            item['volumn24h'] = row.xpath('./td[5]/text()').extract_first()
            item['Liquidity'] = row.xpath('./td[6]/text()').extract_first()
            item['allTimeHigh'] = row.xpath('./td[7]/text()').extract_first()
            item['hour1_value'] = row.xpath('./td[8]/span/text()').extract_first()
            item['hour1_class'] = row.xpath('./td[8]/@class').extract_first()
            item['hour24_value'] = row.xpath('./td[9]/span/text()').extract_first()
            item['hour24_class'] = row.xpath('./td[9]/@class').extract_first()
            yield item
        # next page
        # do not know how to code!!!
It is easier to scrape this site by requesting its API directly than by using scrapy_splash. If you inspect the XHR requests when you click the page navigation at the bottom, you will notice a request to https://http-api.livecoinwatch.com/coins?offset=50&limit=50&sort=rank&order=ascending&currency=USD, which returns a JSON response. Adjust the offset and limit parameters to control how much data is returned.
See the implementation example below.
import scrapy
from coins.items import CoinsItem

class CoinsSpiderSpider(scrapy.Spider):
    name = 'coins_spider'
    allowed_domains = ['livecoinwatch.com']
    # start from the first item and fetch 500 items per request; modify as suits you
    offset = 0
    limit = 500
    start_urls = [f'https://http-api.livecoinwatch.com/coins?offset={offset}&limit={limit}&sort=rank&order=ascending&currency=USD']

    def parse(self, response):
        data = response.json()
        # stop paginating once the API returns no more coins
        if not data.get('data'):
            return
        for coin in data['data']:
            item = CoinsItem()
            item['coin'] = coin.get('code')
            item['price'] = coin.get('price')
            item['marketCap'] = coin.get('cap')
            item['volumn24h'] = coin.get('volume')
            # ... check the json response and add the other fields you need
            yield item
        # yield the next request
        self.offset += self.limit
        next_url = f'https://http-api.livecoinwatch.com/coins?offset={self.offset}&limit={self.limit}&sort=rank&order=ascending&currency=USD'
        yield scrapy.Request(next_url)
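The question also asks whether the pagination can be driven from the Splash Lua script itself. It can: click the next-page button in the rendered page and snapshot the HTML between clicks. Below is a minimal, untested sketch of that approach; the CSS selector '.pagination .next' for the next-page button is an assumption (inspect the live page and adjust it), as are the wait times. The direct API approach above remains the lighter option, since it skips browser rendering entirely.

import scrapy
from scrapy_splash import SplashRequest

class CoinsLuaSpider(scrapy.Spider):
    name = 'coins_lua_spider'
    allowed_domains = ['livecoinwatch.com']
    start_urls = ['https://www.livecoinwatch.com']

    # Lua script that clicks "next" a few times and returns one HTML snapshot per page
    lua_script = '''
    treat = require("treat")
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(5))
        local pages = {}
        for i = 1, args.pages do
            pages[i] = splash:html()  -- snapshot the current page
            -- '.pagination .next' is a guessed selector for the next-page button
            local next_button = splash:select('.pagination .next')
            if not next_button then
                break
            end
            next_button:mouse_click()
            assert(splash:wait(3))  -- let the table re-render
        end
        return treat.as_array(pages)
    end
    '''

    def start_requests(self):
        yield SplashRequest(
            self.start_urls[0],
            callback=self.parse,
            endpoint='execute',
            args={'lua_source': self.lua_script, 'pages': 3},
        )

    def parse(self, response):
        # response.data holds the Lua return value: a list of HTML strings, one per page
        for html in response.data:
            page = scrapy.Selector(text=html)
            for row in page.xpath('//tr[@class="table-row filter-row"]'):
                yield {
                    'coin': row.xpath('./td[2]//div[@class="item-name ml10"]/div/text()').get(),
                }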