我怎样才能增加 link
How can I increment the link
我有一个 link : https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP1.htm
我想像这样增加 link:https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP2.htm
然后是 3,4,5....
我的代码是:
# -*- coding: utf-8 -*-
import scrapy
class GlassdoorSpider(scrapy.Spider):
    """Crawl Glassdoor job-search result pages and yield job titles.

    Pagination fix: the original code yielded a constant ``next_page_url``
    with no page number appended, so the spider never advanced past page 1.
    The current page number is now carried in ``Request.meta`` and
    incremented on each pass through :meth:`parse`.
    """
    name = 'glassdoor'
    # The page number is interpolated into the "_IP<n>" slot of the URL.
    page_url_template = "https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP%d.htm"
    start_urls = [page_url_template % 1]

    def parse(self, response):
        """Yield one detail request per job listing, then request the next page."""
        urls = response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract()
        for url in urls:
            # Listing hrefs are site-relative; prefix the domain.
            yield scrapy.Request(url="https://www.glassdoor.ca" + url,
                                 callback=self.parse_details)
        # Only continue while the current page actually had listings;
        # otherwise the spider would request ever-higher page numbers forever.
        if urls:
            next_page = response.meta.get('page', 1) + 1
            yield scrapy.Request(url=self.page_url_template % next_page,
                                 callback=self.parse,
                                 meta={'page': next_page})

    def parse_details(self, response):
        """Extract the job title from a single job-detail page."""
        yield {
            'Job_Title': response.css('div.header.cell.info > h2::text').extract()
        }
        self.log("reached22: " + response.url)
我想在变量 next_page_url 中增加它。
您需要这种方式的 XPath 表达式
urls = response.xpath('//*[contains(@class,"next")]//@href')
试试吧,应该可以的。
您说得对,在页面源代码中无法在检查元素时的同一位置找到它。但是,您可以在页面源代码的 <head> 中看到它,如下所示:
<link rel="next" href="https://www.monster.ca/jobs/search/?q=data-analyst&page=2" />
您可以使用以下代码提取它:
next_page_link = response.xpath('//head/link[@rel="next"]/@href').extract_first()
要获取第二页,你可以这样做:
import requests

# Browser-like headers so monster.ca serves the normal HTML page
# instead of a bot-blocked response.
headers = {
    'Pragma': 'no-cache',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'https://www.monster.ca/jobs/search/?q=data-analyst',
    'Connection': 'keep-alive',
    'Cache-Control': 'no-cache',
}
# For the other pages, change the 'page' query parameter.
params = (
    ('q', 'data-analyst'),
    ('page', '2'),
)
r = requests.get('https://www.monster.ca/jobs/search/', headers=headers, params=params)
# print(...) works on both Python 2 and 3; the original Python-2-only
# statement form `print r.text` is a SyntaxError on Python 3.
print(r.text)
获取所有页面,你应该获取最后一页的页码,
# NOTE(review): `xrange` is Python 2 only; on Python 3 use `range`.
# NOTE(review): the upper bound of (x)range is exclusive, so this stops
# BEFORE `last_page` — use `last_page + 1` if the final page is wanted too.
for page_number in xrange(2, last_page):
#put page_number in params
更新 1
另一个解决方案
def start_requests(self):
    """Kick off the crawl by fetching the first search-results page."""
    # Yield the seed request directly rather than binding it to a name first.
    yield Request("https://www.monster.ca/jobs/search/?q=data-analyst",
                  callback=self.get_lastPage)
def get_lastPage(self, response):
    """Read the total page count from page 1 and request pages 2..last.

    Fixes vs. the original:
    - ``xrange(2, last_page)`` skipped the final page (the upper bound is
      exclusive); the loop now runs through ``last_page`` inclusive.
    - The loop variable shadowed ``last_page``; renamed to ``page_number``.
    - ``extract_first()`` can return None; guard before calling ``int()``.
    - ``range`` instead of Python-2-only ``xrange`` (works on both).
    """
    # Browser-like headers so monster.ca serves the normal HTML page.
    headers = {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://www.monster.ca/jobs/search/?q=data-analyst',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
    }
    total_pages = response.css('input#totalPages::attr("value")').extract_first()
    if not total_pages:
        # Page count not found on the page; nothing further to schedule.
        return
    for page_number in range(2, int(total_pages) + 1):
        link = "https://www.monster.ca/jobs/search/?q=data-analyst&page=" + str(page_number)
        yield Request(link,
                      headers=headers,
                      callback=self.parse_product)
我有一个 link : https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP1.htm
我想像这样增加 link:https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP2.htm
然后是 3,4,5.... 我的代码是:
# -*- coding: utf-8 -*-
import scrapy
class GlassdoorSpider(scrapy.Spider):
    """Crawl Glassdoor job-search result pages and yield job titles.

    Pagination fix: the original code yielded a constant ``next_page_url``
    with no page number appended, so the spider never advanced past page 1.
    The current page number is now carried in ``Request.meta`` and
    incremented on each pass through :meth:`parse`.
    """
    name = 'glassdoor'
    # The page number is interpolated into the "_IP<n>" slot of the URL.
    page_url_template = "https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP%d.htm"
    start_urls = [page_url_template % 1]

    def parse(self, response):
        """Yield one detail request per job listing, then request the next page."""
        urls = response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract()
        for url in urls:
            # Listing hrefs are site-relative; prefix the domain.
            yield scrapy.Request(url="https://www.glassdoor.ca" + url,
                                 callback=self.parse_details)
        # Only continue while the current page actually had listings;
        # otherwise the spider would request ever-higher page numbers forever.
        if urls:
            next_page = response.meta.get('page', 1) + 1
            yield scrapy.Request(url=self.page_url_template % next_page,
                                 callback=self.parse,
                                 meta={'page': next_page})

    def parse_details(self, response):
        """Extract the job title from a single job-detail page."""
        yield {
            'Job_Title': response.css('div.header.cell.info > h2::text').extract()
        }
        self.log("reached22: " + response.url)
我想在变量 next_page_url 中增加它。
您需要这种方式的 XPath 表达式
urls = response.xpath('//*[contains(@class,"next")]//@href')
试试吧,应该可以的。
您说得对,在页面源代码中无法在检查元素时的同一位置找到它。但是,您可以在页面源代码的 <head> 中看到它,如下所示:
<link rel="next" href="https://www.monster.ca/jobs/search/?q=data-analyst&page=2" />
您可以使用以下代码提取它:
next_page_link = response.xpath('//head/link[@rel="next"]/@href').extract_first()
要获取第二页,你可以这样做:
import requests

# Browser-like headers so monster.ca serves the normal HTML page
# instead of a bot-blocked response.
headers = {
    'Pragma': 'no-cache',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'https://www.monster.ca/jobs/search/?q=data-analyst',
    'Connection': 'keep-alive',
    'Cache-Control': 'no-cache',
}
# For the other pages, change the 'page' query parameter.
params = (
    ('q', 'data-analyst'),
    ('page', '2'),
)
r = requests.get('https://www.monster.ca/jobs/search/', headers=headers, params=params)
# print(...) works on both Python 2 and 3; the original Python-2-only
# statement form `print r.text` is a SyntaxError on Python 3.
print(r.text)
获取所有页面,你应该获取最后一页的页码,
# NOTE(review): `xrange` is Python 2 only; on Python 3 use `range`.
# NOTE(review): the upper bound of (x)range is exclusive, so this stops
# BEFORE `last_page` — use `last_page + 1` if the final page is wanted too.
for page_number in xrange(2, last_page):
#put page_number in params
更新 1
另一个解决方案
def start_requests(self):
    """Kick off the crawl by fetching the first search-results page."""
    # Yield the seed request directly rather than binding it to a name first.
    yield Request("https://www.monster.ca/jobs/search/?q=data-analyst",
                  callback=self.get_lastPage)
def get_lastPage(self, response):
    """Read the total page count from page 1 and request pages 2..last.

    Fixes vs. the original:
    - ``xrange(2, last_page)`` skipped the final page (the upper bound is
      exclusive); the loop now runs through ``last_page`` inclusive.
    - The loop variable shadowed ``last_page``; renamed to ``page_number``.
    - ``extract_first()`` can return None; guard before calling ``int()``.
    - ``range`` instead of Python-2-only ``xrange`` (works on both).
    """
    # Browser-like headers so monster.ca serves the normal HTML page.
    headers = {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://www.monster.ca/jobs/search/?q=data-analyst',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
    }
    total_pages = response.css('input#totalPages::attr("value")').extract_first()
    if not total_pages:
        # Page count not found on the page; nothing further to schedule.
        return
    for page_number in range(2, int(total_pages) + 1):
        link = "https://www.monster.ca/jobs/search/?q=data-analyst&page=" + str(page_number)
        yield Request(link,
                      headers=headers,
                      callback=self.parse_product)