Scraping an ASP.NET page with several / no subpages: yield in an if-else statement
Here is the file spyder.py:
import scrapy
from scrapy_spider.items import JobsItem


class JobSpider(scrapy.Spider):
    name = 'burzarada'
    start_urls = ['https://burzarada.hzz.hr/Posloprimac_RadnaMjesta.aspx']
    download_delay = 1.5

    def parse(self, response):
        # Each category link is an ASP.NET postback; strip the
        # javascript:__doPostBack(...) wrapper to get the event target.
        for href in response.css('div.NKZbox > div.KategorijeBox > a ::attr(href)').extract():
            eventTarget = href.replace("javascript:__doPostBack('", "").replace("','')", "")
            # Hidden state fields that must be echoed back with every postback.
            eventArgument = response.css('#__EVENTARGUMENT::attr(value)').extract()
            lastFocus = response.css('#__LASTFOCUS::attr(value)').extract()
            viewState = response.css('#__VIEWSTATE::attr(value)').extract()
            viewStateGenerator = response.css('#__VIEWSTATEGENERATOR::attr(value)').extract()
            viewStateEncrypted = response.css('#__VIEWSTATEENCRYPTED::attr(value)').extract()
            yield scrapy.FormRequest(
                'https://burzarada.hzz.hr/Posloprimac_RadnaMjesta.aspx',
                formdata = {
                    '__EVENTTARGET': eventTarget,
                    '__EVENTARGUMENT': eventArgument,
                    '__LASTFOCUS': lastFocus,
                    '__VIEWSTATE': viewState,
                    '__VIEWSTATEGENERATOR': viewStateGenerator,
                    '__VIEWSTATEENCRYPTED': viewStateEncrypted,
                },
                callback=self.parse_category
            )

    def parse_category(self, response):
        # Change the page size to 75 through the ddlPageSize dropdown postback.
        href = response.xpath('//select[@id="ctl00_MainContent_ddlPageSize"]').extract()
        eventTarget = "ctl00$MainContent$ddlPageSize"
        eventArgument = response.css('#__EVENTARGUMENT::attr(value)').extract()
        lastFocus = response.css('#__LASTFOCUS::attr(value)').extract()
        viewState = response.css('#__VIEWSTATE::attr(value)').extract()
        viewStateGenerator = response.css('#__VIEWSTATEGENERATOR::attr(value)').extract()
        viewStateEncrypted = response.css('#__VIEWSTATEENCRYPTED::attr(value)').extract()
        pageSize = '75'
        sort = '0'
        yield scrapy.FormRequest(
            'https://burzarada.hzz.hr/Posloprimac_RadnaMjesta.aspx',
            formdata = {
                '__EVENTTARGET': eventTarget,
                '__EVENTARGUMENT': eventArgument,
                '__LASTFOCUS': lastFocus,
                '__VIEWSTATE': viewState,
                '__VIEWSTATEGENERATOR': viewStateGenerator,
                '__VIEWSTATEENCRYPTED': viewStateEncrypted,
                'ctl00$MainContent$ddlPageSize': pageSize,
                'ctl00$MainContent$ddlSort': sort,
            },
            callback=self.parse_multiple_pages
        )

    def parse_multiple_pages(self, response):
        # Pagination links, present only when the category spans several pages.
        hrefs = response.xpath('//*[@id="ctl00_MainContent_gwSearch"]//tr[last()]//li/a/@href').extract()
        ##################################
        # Here is the part of the problem
        if len(hrefs) != 0: # yield statement
            for href in hrefs:
                eventTarget = href.replace("javascript:__doPostBack('", "").replace("','')", "")
                eventArgument = response.css('#__EVENTARGUMENT::attr(value)').extract()
                lastFocus = response.css('#__LASTFOCUS::attr(value)').extract()
                viewState = response.css('#__VIEWSTATE::attr(value)').extract()
                viewStateGenerator = response.css('#__VIEWSTATEGENERATOR::attr(value)').extract()
                viewStateEncrypted = response.css('#__VIEWSTATEENCRYPTED::attr(value)').extract()
                pageSize = '75'
                sort = '0'
                print(eventTarget)
                yield scrapy.FormRequest(
                    'https://burzarada.hzz.hr/Posloprimac_RadnaMjesta.aspx',
                    formdata = {
                        '__EVENTTARGET': eventTarget,
                        '__EVENTARGUMENT': eventArgument,
                        '__LASTFOCUS': lastFocus,
                        '__VIEWSTATE': viewState,
                        '__VIEWSTATEGENERATOR': viewStateGenerator,
                        '__VIEWSTATEENCRYPTED': viewStateEncrypted,
                        'ctl00$MainContent$ddlPageSize': pageSize,
                        'ctl00$MainContent$ddlSort': sort,
                    },
                    callback=self.parse_links
                )
        else: # another yield
            for link in links:
                link = 'https://burzarada.hzz.hr/' + link
                yield scrapy.Request(url=link, callback=self.parse_job)
        ##########################################

    def parse_links(self, response):
        links = response.xpath('//a[@class="TitleLink"]/@href').extract()
        for link in links:
            link = 'https://burzarada.hzz.hr/' + link
            yield scrapy.Request(url=link, callback=self.parse_job)

    def parse_job(self, response):
        # Fields left as empty placeholders here.
        item = JobsItem()
        item['url'] = ''
        item['title'] = ''
        item['workplace'] = ''
        item['required_workers'] = ''
        item['type_of_employment'] = ''
        item['working_hours'] = ''
        item['mode_of_operation'] = ''
        item['accomodation'] = ''
        item['transportation_fee'] = ''
        item['start_date'] = ''
        item['end_date'] = ''
        item['education_level'] = ''
        item['work_experience'] = ''
        item['other_information'] = ''
        item['employer'] = ''
        item['contact'] = ''
        item['driving_test'] = ''
        yield item
As you can see, the page structure is not very complex.

Here is the link to the page I am scraping:

https://burzarada.hzz.hr/Posloprimac_RadnaMjesta.aspx

There are 16 hyperlinks on the page, and each one issues a POST request that returns a different number of job listings. The first link has fewer than 10 items, while the second has more than 1000. The page size of the job list defaults to 25, so the first link has no subpages, while the second link has 10+ subpages.

I managed to change the page size to 75 so I don't have to deal with so many subpages. The problem comes in the next part.

The problem is that I can't get any items from the first link (the one with no subpages). Scraping only starts with the second link (the one with 10+ subpages). I tried to follow the flow with several print() calls in the code (removed for brevity and clarity), and I found that it never reaches the else: part!

If I try only the first link (limiting the for loop in parse() to run once), it works fine.

I've struggled with this for hours but couldn't find any useful answer.

My guess is that it's because the first link has no subpages. If it had some, I wouldn't have to add the if-else there!

Can anyone help me?
I've launched the code, and it seems to mostly run as expected.

You say "Scraping only starts with the second link", but the spider does in fact try the first category. The problem is that links is not defined there, so the spider fails with the exception NameError: name 'links' is not defined. Scrapy fails while parsing that one page, but this does not stop the whole crawl, so Scrapy continues with the pages that do have pagination.
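For reference, a minimal sketch of one way to repair that branch: define links from the current response before looping, reusing the TitleLink selector that parse_links already uses (an untested rewrite of just this branch, not the author's own fix):

        else: # no subpages: the job links are already on this page
            links = response.xpath('//a[@class="TitleLink"]/@href').extract()
            for link in links:
                link = 'https://burzarada.hzz.hr/' + link
                yield scrapy.Request(url=link, callback=self.parse_job)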
You can also include the page size and sorting in the spider's very first request. In that case you can simplify the spider by removing parse_category altogether (see the sketch at the end).
Also, this selector:
hrefs = response.xpath('//*[@id="ctl00_MainContent_gwSearch"]//tr[last()]//li/a/@href').extract()
can be made simpler:
hrefs = response.xpath('//ul[contains(@class, "pagination")]//a/@href').extract()
Putting all of that together, the code can be a bit simpler.
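As an illustration only, here is a rough, untested sketch of such a simplified spider: page size and sort are submitted with the very first postback (so parse_category disappears), pagination uses the simpler selector, and the asp_net_postback() helper and parse_results() name are my own additions, not part of the original code:

import scrapy
from scrapy_spider.items import JobsItem

SEARCH_URL = 'https://burzarada.hzz.hr/Posloprimac_RadnaMjesta.aspx'


class JobSpider(scrapy.Spider):
    name = 'burzarada'
    start_urls = [SEARCH_URL]
    download_delay = 1.5

    def asp_net_postback(self, response, event_target, callback):
        # Build a postback request carrying the hidden ASP.NET state fields
        # plus the page size and sort values, all in one place.
        formdata = {
            '__EVENTTARGET': event_target,
            'ctl00$MainContent$ddlPageSize': '75',
            'ctl00$MainContent$ddlSort': '0',
        }
        for field in ('__EVENTARGUMENT', '__LASTFOCUS', '__VIEWSTATE',
                      '__VIEWSTATEGENERATOR', '__VIEWSTATEENCRYPTED'):
            formdata[field] = response.css('#%s::attr(value)' % field).extract_first('')
        return scrapy.FormRequest(SEARCH_URL, formdata=formdata, callback=callback)

    def parse(self, response):
        # One postback per category, with the page size set right away,
        # so a separate parse_category step is unnecessary.
        for href in response.css('div.NKZbox > div.KategorijeBox > a ::attr(href)').extract():
            event_target = href.replace("javascript:__doPostBack('", "").replace("','')", "")
            yield self.asp_net_postback(response, event_target, self.parse_results)

    def parse_results(self, response):
        # Scrape the job links on this page, whether or not pagination exists.
        yield from self.parse_links(response)
        # Queue the remaining pages, if any (the simpler pagination selector).
        for href in response.xpath('//ul[contains(@class, "pagination")]//a/@href').extract():
            event_target = href.replace("javascript:__doPostBack('", "").replace("','')", "")
            yield self.asp_net_postback(response, event_target, self.parse_links)

    def parse_links(self, response):
        for link in response.xpath('//a[@class="TitleLink"]/@href').extract():
            yield scrapy.Request(url='https://burzarada.hzz.hr/' + link,
                                 callback=self.parse_job)

    def parse_job(self, response):
        item = JobsItem()
        # ... fill the JobsItem fields exactly as in the original parse_job ...
        yield item

The category, pagination, and TitleLink selectors are taken verbatim from the question; only the structure around them is rearranged.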