Scrapy does not fetch markup on response.css

I built a simple Scrapy spider that runs on Scrapinghub:

import scrapy
from scrapy_splash import SplashRequest


class ExtractionSpider(scrapy.Spider):
    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def parse(self, response):
        urls = response.css('a.offer-details__title-link::attr(href)').extract()

        print(urls)
        for url in urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
        print(multiple_locs_urls)        
        for url in multiple_locs_urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield SplashRequest(url=next_page_url, callback=self.parse)

    def parse_details(self, response): 
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first()
        }

The problem I am facing is that the multiple_locs_urls response.css call returns an empty array, even though I can see that markup in the browser.

I checked with scrapy shell, and scrapy shell does not see the markup either. I suppose this is because the markup is rendered by JavaScript when the page loads.
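
For reference, the check in scrapy shell looks roughly like this (using the pracuj.pl listing page referenced in the answer below; output abbreviated):

scrapy shell "https://www.pracuj.pl/praca/polska;ct,1"
>>> response.css('a.offer-regions__label::attr(href)').extract()
[]
>>> # the raw, non-rendered HTML simply does not contain that class
>>> 'offer-regions__label' in response.text
False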

I added Splash, but it does not seem to be applied to the response. How do I make Scrapy wait with the query until the page has finished loading?
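
For completeness, this is roughly how Splash is wired up on my side, and how I tell it to wait before returning the rendered HTML. This is only a sketch; SPLASH_URL and the wait value are specific to my setup:

# settings.py -- minimal scrapy-splash wiring (SPLASH_URL points to my Splash instance)
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# in the spider: ask Splash to wait before returning the rendered page
yield SplashRequest(url=url, callback=self.parse_details, args={'wait': 2})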

Look at the page source: view-source:pracuj.pl/praca/polska;ct,1 . There is no element with the class "offer-regions__label" in the HTML code.

This code will always return an empty list:

multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')
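
You can verify this without Scrapy at all. A quick sketch using the requests library, which (like Scrapy) fetches the raw HTML without executing any JavaScript:

import requests

# Fetch the page the way Scrapy does: plain HTTP, no JavaScript execution.
raw_html = requests.get(
    'https://www.pracuj.pl/praca/polska;ct,1',
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=10,
).text

# If this prints False, the class is only injected by JavaScript after the
# page loads, which matches what the page source above shows.
print('offer-regions__label' in raw_html)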

But as described here:

Many times when crawling we run into problems where content that is rendered on the page is generated with Javascript and therefore scrapy is unable to crawl for it.

In this case you can use Selenium. I changed your code and checked it, and it works:

import scrapy
from scrapy_splash import SplashRequest
from selenium import webdriver


class ExtractionSpider(scrapy.Spider):
    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def __init__( self, **kwargs ):

        super().__init__( **kwargs )

        profile = webdriver.FirefoxProfile( "pathToFirefoxProfile" )
        firefox_binary = "pathToFirefoxBinary"  # Must be the developer edition!!!
        # self.driver = webdriver.Firefox()
        self.driver = webdriver.Firefox( profile, firefox_binary = firefox_binary )

    def parse(self, response):

        self.driver.get( response.url )

        # Selenium sees the JavaScript-rendered DOM, so this selector now matches
        elements = self.driver.find_elements_by_css_selector( "a.offer-details__title-link" )
        for element in elements:
            print( "****" )
            print( str( element.get_attribute( "href" ) ) )
            print( str( element.text ) )

        # your old code below

        urls = response.css('a.offer-details__title-link::attr(href)').extract()

        print(urls)
        for url in urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
        print(multiple_locs_urls)        
        for url in multiple_locs_urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield SplashRequest(url=next_page_url, callback=self.parse)

    def parse_details(self, response): 
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first()
        }
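
One small addition (not in the code above): when the spider finishes, the Firefox instance started in __init__ should be shut down, otherwise it keeps running in the background. Scrapy calls a spider's closed() method at the end of the crawl, so a sketch like this inside the class is enough:

    def closed(self, reason):
        # Called by Scrapy when the spider finishes; shut the Selenium browser down.
        self.driver.quit()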