通过 Scrapy-Splash 将真实的 URL 传递给字典
Passing real URL through Scrapy-Splash to dictionary
当尝试通过 ('url' : response.request.url) 在字典中保存 URLs 时,Scrapy 从 Scrapy-Splash 中保存 URLs 都是一样的 ( http://localhost:8050/render.html)
我试过添加额外的参数,这些参数会传递真实的 URL 但无济于事。
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
from scrapy import Request
import scrapy
from scrapy_splash import SplashRequest
class QuotesJSSpider(scrapy.Spider):
    """Log in to Facebook, then render pages through Splash and scrape titles.

    Splash proxies every request through http://localhost:8050, so
    ``response.request.url`` points at the Splash render endpoint. To record
    the real page URL, use ``response.url``: scrapy-splash remaps it back to
    the originally requested URL.
    """

    name = 'quotesjs'
    start_urls = ('https://www.facebook.com/login',)
    custom_settings = {
        'SPLASH_URL': 'http://localhost:8050',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    }

    def parse(self, response):
        """Submit the login form with the credentials and the page token."""
        token = response.xpath('//*[@id="u_0_a"]').extract_first()
        return FormRequest.from_response(
            response,
            formdata={'lgndim': token,
                      'pass': 'xxx',
                      'email': 'xxxx'},
            callback=self.load_sites)

    def load_sites(self, response):
        """Queue the target 'about' pages for rendering via Splash."""
        urls = [
            'https://www.facebook.com/page1/about',
            'https://www.facebook.com/page2/about',
        ]
        for url in urls:
            yield SplashRequest(url=url, callback=self.scrape_pages)

    def scrape_pages(self, response):
        """Yield one item per rendered page.

        Fix: ``response.url`` (restored by scrapy-splash to the real page
        URL) instead of ``response.request.url``, which would always be
        http://localhost:8050/render.html.
        """
        shops = {
            'company_name': response.css('title::text').extract(),
            'url': response.url,
        }
        yield shops
结果应该是这样的:
'url': 'https://www.facebook.com/page1/about'
而不是这个:
'url': 'http://localhost:8050/render.html'
原始请求 url 可在此处获得:response.request._original_url。
为避免必须访问内部属性,您还可以尝试:
- 传递元中的 url:
def load_sites(self, response):
    """Schedule the target 'about' pages for Splash rendering, carrying the
    real URL along in ``meta`` so the callback can record it."""
    target_pages = (
        'https://www.facebook.com/page1/about',
        'https://www.facebook.com/page2/about',
    )
    for page_url in target_pages:
        yield SplashRequest(url=page_url, callback=self.scrape_pages, meta={'original_url': page_url})
def scrape_pages(self, response):
    """Yield one item per rendered page.

    Fix: the original ``def`` line was missing its trailing colon (a
    syntax error). The real page URL is read from ``response.meta``,
    where ``load_sites`` stored it, instead of the Splash endpoint URL.
    """
    shops = {
        'company_name': response.css('title::text').extract(),
        'url': response.meta['original_url'],
    }
    yield shops
- 使用响应中的 url:
def scrape_pages(self, response):
    """Yield one item per rendered page.

    Fix: the original built ``shops`` but never yielded it, so the item
    was silently discarded. ``response.url`` is remapped by scrapy-splash
    to the originally requested URL, not the Splash endpoint.
    """
    shops = {
        'company_name': response.css('title::text').extract(),
        'url': response.url,
    }
    yield shops
当尝试通过 ('url' : response.request.url) 在字典中保存 URLs 时,Scrapy 从 Scrapy-Splash 中保存 URLs 都是一样的 ( http://localhost:8050/render.html)
我试过添加额外的参数,这些参数会传递真实的 URL 但无济于事。
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
from scrapy import Request
import scrapy
from scrapy_splash import SplashRequest
class QuotesJSSpider(scrapy.Spider):
    """Log in to Facebook, then render pages through Splash and scrape titles.

    Splash proxies every request through http://localhost:8050, so
    ``response.request.url`` points at the Splash render endpoint. To record
    the real page URL, use ``response.url``: scrapy-splash remaps it back to
    the originally requested URL.
    """

    name = 'quotesjs'
    start_urls = ('https://www.facebook.com/login',)
    custom_settings = {
        'SPLASH_URL': 'http://localhost:8050',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    }

    def parse(self, response):
        """Submit the login form with the credentials and the page token."""
        token = response.xpath('//*[@id="u_0_a"]').extract_first()
        return FormRequest.from_response(
            response,
            formdata={'lgndim': token,
                      'pass': 'xxx',
                      'email': 'xxxx'},
            callback=self.load_sites)

    def load_sites(self, response):
        """Queue the target 'about' pages for rendering via Splash."""
        urls = [
            'https://www.facebook.com/page1/about',
            'https://www.facebook.com/page2/about',
        ]
        for url in urls:
            yield SplashRequest(url=url, callback=self.scrape_pages)

    def scrape_pages(self, response):
        """Yield one item per rendered page.

        Fix: ``response.url`` (restored by scrapy-splash to the real page
        URL) instead of ``response.request.url``, which would always be
        http://localhost:8050/render.html.
        """
        shops = {
            'company_name': response.css('title::text').extract(),
            'url': response.url,
        }
        yield shops
结果应该是这样的: 'url': 'https://www.facebook.com/page1/about'
而不是这个: 'url': 'http://localhost:8050/render.html'
原始请求 url 可在此处获得:response.request._original_url。
为避免必须访问内部属性,您还可以尝试:
- 传递元中的 url:
def load_sites(self, response):
    """Schedule the target 'about' pages for Splash rendering, carrying the
    real URL along in ``meta`` so the callback can record it."""
    target_pages = (
        'https://www.facebook.com/page1/about',
        'https://www.facebook.com/page2/about',
    )
    for page_url in target_pages:
        yield SplashRequest(url=page_url, callback=self.scrape_pages, meta={'original_url': page_url})
def scrape_pages(self, response):
    """Yield one item per rendered page.

    Fix: the original ``def`` line was missing its trailing colon (a
    syntax error). The real page URL is read from ``response.meta``,
    where ``load_sites`` stored it, instead of the Splash endpoint URL.
    """
    shops = {
        'company_name': response.css('title::text').extract(),
        'url': response.meta['original_url'],
    }
    yield shops
- 使用响应中的 url:
def scrape_pages(self, response):
    """Yield one item per rendered page.

    Fix: the original built ``shops`` but never yielded it, so the item
    was silently discarded. ``response.url`` is remapped by scrapy-splash
    to the originally requested URL, not the Splash endpoint.
    """
    shops = {
        'company_name': response.css('title::text').extract(),
        'url': response.url,
    }
    yield shops