Scrapy: Crawl angular ng-href links?

I'm using selenium-webdriver to render JavaScript for a Scrapy crawler, but it looks like the AngularJS 'ng-href' links are not being crawled. Does Scrapy crawl 'ng-href' links? If not, how can I get it to crawl them?

from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from CAP.items import Website
from scrapy.mail import MailSender

from scrapy.http import Request
from selenium import webdriver
import time
from scrapy.http import TextResponse

class HomeSpider(CrawlSpider):
    name = "capseleniums"
    allowed_domains = ["www.ecommerce.com", "learn.ecommerce.com", "health.ecommerce.com", "wm15.ecommerce.com", "wm13.ecommerce.com", "wm12.ecommerce.com" ]
    handle_httpstatus_list = [500, 502, 503, 504, 400, 408, 404]

    def start_requests(self):
        start_urls = reversed( [
            'http://wm12.ecommerce.com/health-wellness-center/',
            'http://wm13.ecommerce.com/Cook/',
            'http://wm15.ecommerce.com/electronics-resource-center/',
            'http://health.ecommerce.com/vitamins-wellness-center/',
            'http://learn.ecommerce.com/Tips-Ideas/',
            ] )
        return [ Request(url = start_url) for start_url in start_urls ]

    def trim(link_text):
        return link_text.strip(' \t\n\r')

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                process_value=trim,
            ),
            callback="parse_items",
            follow=False,
        ),
    )

    def __init__(self, category=None, *args, **kwargs):
        self.driver = webdriver.PhantomJS(service_args=['--load-images=no'])
        super(HomeSpider, self).__init__(*args, **kwargs)

    def __del__(self):
        self.driver.quit()

    def parse_items(self, response):
        hxs = self.driver
        hxs.get(response.url)
        time.sleep(1)
        body = hxs.page_source
        sel_response = TextResponse(url=response.url, body=body, encoding = 'utf-8')
        hxs = Selector(sel_response)
        sites = hxs.xpath('//html')
        items = []

        if response.status == 404:
            for site in sites:
                item = Website()
                item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                item['referer'] = response.request.headers.get('Referer')
                item['status'] = response.status
                items.append(item)

            return items

        if hxs.xpath('/html/head/title/text()[contains(.,"invalid")]'):
            for site in sites:
                item = Website()
                item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                item['referer'] = response.request.headers.get('Referer')
                item['status'] = response.status
                items.append(item)

            return items

        elif hxs.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]'):
            for site in sites:
                item = Website()
                item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                item['referer'] = response.request.headers.get('Referer')
                item['status'] = response.status
                items.append(item)

            return items

        else:
            if hxs.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]'):
                for site in sites:
                    item = Website()
                    item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                    item['referer'] = response.request.headers.get('Referer')
                    item['status'] = response.status
                    items.append(item)

                return items

By default, it only looks for links in the href attribute of a and area tags.
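
For reference, the extractor in the spider above relies on those defaults, so it behaves roughly as if it were written like this (tags and attrs spelled out; the values shown are LinkExtractor's documented defaults):

LinkExtractor(
    allow=(),
    deny=(),
    tags=('a', 'area'),   # only <a> and <area> elements are inspected by default
    attrs=('href',),      # and only their href attribute is read for links
    process_value=trim,
)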

You just need to additionally configure the attrs argument and include the ng-href attribute as well (note that callback and follow are arguments of the Rule, not of the LinkExtractor):

Rule(
    LinkExtractor(attrs=['href', 'ng-href']),
    callback="parse_items",
    follow=False,
),
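
If you want to sanity-check the extraction before running a full crawl, you can try the extractor against a page fetched with scrapy shell; a minimal sketch using the same import path as the spider above (response is the object scrapy shell provides):

from scrapy.contrib.linkextractors import LinkExtractor

# inside `scrapy shell <url>`, where `response` is already defined
extractor = LinkExtractor(attrs=['href', 'ng-href'])
for link in extractor.extract_links(response):
    print(link.url)   # ng-href targets should now appear alongside plain href links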