How to do pagination in splash/scrapy?
I am trying to scrape item data from a website whose pagination has no next-page href. I only get one page instead of all of them. How can I modify my code, and why doesn't it collect all the items? Can anyone explain?
import scrapy
from scrapy_splash import SplashRequest


class Hepsips4Spider(scrapy.Spider):
    # you can automate the page transitions
    name = 'hepsips4'
    allowed_domains = ['www.hepsiburada.com']

    script = '''
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(2))
        return {
            html = splash:html(),
        }
    end
    '''

    def start_requests(self):
        yield SplashRequest(
            url="https://www.hepsiburada.com/playstation-4-oyunlari-c-60003893?siralama=enyeni&sayfa=1",
            callback=self.parse,
            endpoint="execute",
            args={'lua_source': self.script},
        )

    def parse(self, response):
        for i in range(1, 25):
            yield SplashRequest(
                url=f"https://www.hepsiburada.com/playstation-4-oyunlari-c-60003893?siralama=enyeni&sayfa={i}",
                callback=self.parse_item,
                endpoint="execute",
                args={'lua_source': self.script},
            )

    def parse_item(self, response):
        for row in response.xpath("//ul[@class='product-list results-container do-flex list']/li/div"):
            name = row.xpath(".//a/div/h3/@title").get()
            img = row.xpath(".//a/figure/div/img/@data-src").get()
            company = "hepsiburada"
            link = row.xpath(".//a/@href").get()
            fulllink = f"https://www.hepsiburada.com{link}"
            price = row.xpath(".//a/div/div[2]/span/text()").get()
            # try to fix the cart discount price
            platform = 'ps4'
            yield {
                'name': name,
                'image': img,
                'company': company,
                'full_link': fulllink,
                'price': price,
                'platform': platform,
            }
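As an aside, for either spider here to reach Splash at all, scrapy-splash has to be wired into the project's settings.py. The snippet below follows the scrapy-splash README; the localhost:8050 URL assumes a Splash instance running locally (e.g. via its Docker image):

# settings.py, per the scrapy-splash README
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Splash-aware dedup: identical URL + args count as the same request
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'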
script = '''
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(2))
    return {
        html = splash:html(),
    }
end
'''
Setting the wait time to 2 solved my problem.
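For what it's worth, the delay doesn't have to be hard-coded in the Lua source: with the execute endpoint, whatever you put in the SplashRequest args dict is exposed to the script as the args table, so the wait can be tuned per request. A minimal sketch (the 2.0 fallback is my assumption, not from the original code):

script = '''
function main(splash, args)
    assert(splash:go(args.url))
    -- args.wait is whatever the SplashRequest passed in its args dict
    assert(splash:wait(args.wait or 2.0))
    return {
        html = splash:html(),
    }
end
'''

def parse(self, response):
    for i in range(1, 25):
        yield SplashRequest(
            url=f"https://www.hepsiburada.com/playstation-4-oyunlari-c-60003893?siralama=enyeni&sayfa={i}",
            callback=self.parse_item,
            endpoint="execute",
            # 'wait' rides along with lua_source and appears as args.wait in Lua
            args={'lua_source': self.script, 'wait': 2.0},
        )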
import scrapy
from scrapy_splash import SplashRequest


class JbPracujSpider(scrapy.Spider):
    name = 'jb_pracuj'
    allowed_domains = ['www.pracuj.pl']

    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
        'Sec-Fetch-User': '?1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(5))
        return {
            html = splash:html(),
        }
    end
    '''

    # NOTE: headers is captured as a default argument but never attached to the requests
    def start_requests(self, headers=headers):
        yield SplashRequest(
            url='https://www.pracuj.pl/praca?rd=30&cc=5015%2c5016&pn=1',
            callback=self.parse,
            endpoint='execute',
            args={'lua_source': self.script, 'wait': 1},
        )

    def parse(self, response):
        for i in range(1, 10):
            yield SplashRequest(
                url=f'https://www.pracuj.pl/praca?rd=30&cc=5015%2c5016&pn={i}',
                callback=self.parse_item,
                endpoint='execute',
                args={'lua_source': self.script, 'wait': 1},
            )

    def parse_item(self, response):
        for row in response.xpath("//li[@class='results__list-container-item']"):
            yield {
                'title': row.xpath(
                    ".//div[@class='offer-details__text']/h2/a[@class='offer-details__title-link']/text()"
                    "|//div[@class='offer-details__text']/h2/button[@class='offer-details__title-link']/text()"
                ).get()
            }
After running this code, many of my yielded 'title' values are None, but not all of them.
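One thing worth checking here (my reading of the XPath, not a confirmed fix): the second branch of the union starts with // rather than .//, so it is evaluated from the document root instead of relative to row, which can make button-based titles resolve to the wrong node or nothing. A version with both branches relative to the current row would look like this:

def parse_item(self, response):
    for row in response.xpath("//li[@class='results__list-container-item']"):
        yield {
            # both union branches relative to the current row (.// instead of //)
            'title': row.xpath(
                ".//div[@class='offer-details__text']/h2/a[@class='offer-details__title-link']/text()"
                " | .//div[@class='offer-details__text']/h2/button[@class='offer-details__title-link']/text()"
            ).get()
        }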