Scrapy:抓取 angular ng-href 链接?
Scrapy: Crawl angular ng-href links?
我正在使用 selenium-webdriver 为 scrapy 爬虫呈现 javascript,但看起来 angularjs 'ng-href' 链接未被抓取。 scrapy 是否抓取 'ng-href' 链接?如果没有,我怎样才能让它抓取 'ng-href' 链接?
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from CAP.items import Website
from scrapy.mail import MailSender
from scrapy.http import Request
from selenium import webdriver
import time
from scrapy.http import TextResponse
class HomeSpider(CrawlSpider):
    """Crawl the ecommerce.com resource centers and report broken pages.

    Every link extracted by ``rules`` is re-loaded in a PhantomJS browser so
    that JavaScript-generated markup is present, and a ``Website`` item is
    emitted for each page that is a 404, is flagged as invalid by its title
    or canonical link, or shows an empty result list.
    """

    name = "capseleniums"
    allowed_domains = [
        "www.ecommerce.com",
        "learn.ecommerce.com",
        "health.ecommerce.com",
        "wm15.ecommerce.com",
        "wm13.ecommerce.com",
        "wm12.ecommerce.com",
    ]
    # Responses with these status codes are handed to the spider's callbacks
    # (Scrapy's ``handle_httpstatus_list`` spider attribute).
    handle_httpstatus_list = [500, 502, 503, 504, 400, 408, 404]

    def start_requests(self):
        """Return the initial requests, one per seed URL, in reversed order."""
        start_urls = reversed([
            'http://wm12.ecommerce.com/health-wellness-center/',
            'http://wm13.ecommerce.com/Cook/',
            'http://wm15.ecommerce.com/electronics-resource-center/',
            'http://health.ecommerce.com/vitamins-wellness-center/',
            'http://learn.ecommerce.com/Tips-Ideas/',
        ])
        return [Request(url=start_url) for start_url in start_urls]

    # Plain function (no ``self``): it is consumed below, while the class
    # body is still executing, as the link extractor's ``process_value``.
    def trim(link_text):
        """Strip surrounding spaces, tabs and line breaks from a link value."""
        return link_text.strip(' \t\n\r')

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                process_value=trim,
            ),
            callback="parse_items",
            follow=False,
        ),
    )

    def __init__(self, category=None, *args, **kwargs):
        """Start the PhantomJS browser (images disabled) and set up the spider."""
        self.driver = webdriver.PhantomJS(service_args=['--load-images=no'])
        super(HomeSpider, self).__init__(*args, **kwargs)

    def __del__(self):
        """Shut down the PhantomJS browser when the spider is collected."""
        # WebDriver has no ``stop()``; ``quit()`` is the shutdown call.
        # ``driver`` is missing when ``__init__`` failed before assigning it.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _build_items(self, response, sites):
        """Return one ``Website`` item per element of *sites* for *response*."""
        items = []
        for _ in sites:
            item = Website()
            # Report the originally requested URL when redirects happened.
            item['url'] = response.meta.get('redirect_urls', [response.url])[0]
            item['referer'] = response.request.headers.get('Referer')
            item['status'] = response.status
            items.append(item)
        return items

    def parse_items(self, response):
        """Render *response*'s URL in the browser and report it if it is broken.

        Returns a list of ``Website`` items (one per ``<html>`` node of the
        rendered page) when the page is a 404, has "invalid" in its title,
        has a canonical link containing "invalid-category-id", or shows
        "Showing 0 of" in its result summary; otherwise an empty list.
        """
        driver = self.driver
        driver.get(response.url)
        # NOTE: fixed pause to give page scripts time to run; the crawl is
        # blocked while waiting.
        time.sleep(1)
        rendered = TextResponse(url=response.url, body=driver.page_source, encoding='utf-8')
        selector = Selector(rendered)
        sites = selector.xpath('//html')

        # ``or`` short-circuits, so a 404 is reported without running the
        # XPath checks, matching the order of the individual conditions.
        is_broken = (
            response.status == 404
            or selector.xpath('/html/head/title/text()[contains(.,"invalid")]')
            or selector.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]')
            or selector.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]')
        )
        if is_broken:
            return self._build_items(response, sites)
        return []
By default,它会在 a
和 area
标签的 href
属性中查找链接。
您只需要额外配置 attrs
参数并包含 ng-href
属性:
# ``callback`` and ``follow`` belong to Rule; LinkExtractor only takes the
# extraction options, so the extractor is wrapped in a Rule here.
Rule(LinkExtractor(attrs=['href', 'ng-href']), callback="parse_items", follow=False),
我正在使用 selenium-webdriver 为 scrapy 爬虫呈现 javascript,但看起来 angularjs 'ng-href' 链接未被抓取。 scrapy 是否抓取 'ng-href' 链接?如果没有,我怎样才能让它抓取 'ng-href' 链接?
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from CAP.items import Website
from scrapy.mail import MailSender
from scrapy.http import Request
from selenium import webdriver
import time
from scrapy.http import TextResponse
class HomeSpider(CrawlSpider):
    """Crawl the ecommerce.com resource centers and report broken pages.

    Every link extracted by ``rules`` is re-loaded in a PhantomJS browser so
    that JavaScript-generated markup is present, and a ``Website`` item is
    emitted for each page that is a 404, is flagged as invalid by its title
    or canonical link, or shows an empty result list.
    """

    name = "capseleniums"
    allowed_domains = [
        "www.ecommerce.com",
        "learn.ecommerce.com",
        "health.ecommerce.com",
        "wm15.ecommerce.com",
        "wm13.ecommerce.com",
        "wm12.ecommerce.com",
    ]
    # Responses with these status codes are handed to the spider's callbacks
    # (Scrapy's ``handle_httpstatus_list`` spider attribute).
    handle_httpstatus_list = [500, 502, 503, 504, 400, 408, 404]

    def start_requests(self):
        """Return the initial requests, one per seed URL, in reversed order."""
        start_urls = reversed([
            'http://wm12.ecommerce.com/health-wellness-center/',
            'http://wm13.ecommerce.com/Cook/',
            'http://wm15.ecommerce.com/electronics-resource-center/',
            'http://health.ecommerce.com/vitamins-wellness-center/',
            'http://learn.ecommerce.com/Tips-Ideas/',
        ])
        return [Request(url=start_url) for start_url in start_urls]

    # Plain function (no ``self``): it is consumed below, while the class
    # body is still executing, as the link extractor's ``process_value``.
    def trim(link_text):
        """Strip surrounding spaces, tabs and line breaks from a link value."""
        return link_text.strip(' \t\n\r')

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                process_value=trim,
            ),
            callback="parse_items",
            follow=False,
        ),
    )

    def __init__(self, category=None, *args, **kwargs):
        """Start the PhantomJS browser (images disabled) and set up the spider."""
        self.driver = webdriver.PhantomJS(service_args=['--load-images=no'])
        super(HomeSpider, self).__init__(*args, **kwargs)

    def __del__(self):
        """Shut down the PhantomJS browser when the spider is collected."""
        # WebDriver has no ``stop()``; ``quit()`` is the shutdown call.
        # ``driver`` is missing when ``__init__`` failed before assigning it.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _build_items(self, response, sites):
        """Return one ``Website`` item per element of *sites* for *response*."""
        items = []
        for _ in sites:
            item = Website()
            # Report the originally requested URL when redirects happened.
            item['url'] = response.meta.get('redirect_urls', [response.url])[0]
            item['referer'] = response.request.headers.get('Referer')
            item['status'] = response.status
            items.append(item)
        return items

    def parse_items(self, response):
        """Render *response*'s URL in the browser and report it if it is broken.

        Returns a list of ``Website`` items (one per ``<html>`` node of the
        rendered page) when the page is a 404, has "invalid" in its title,
        has a canonical link containing "invalid-category-id", or shows
        "Showing 0 of" in its result summary; otherwise an empty list.
        """
        driver = self.driver
        driver.get(response.url)
        # NOTE: fixed pause to give page scripts time to run; the crawl is
        # blocked while waiting.
        time.sleep(1)
        rendered = TextResponse(url=response.url, body=driver.page_source, encoding='utf-8')
        selector = Selector(rendered)
        sites = selector.xpath('//html')

        # ``or`` short-circuits, so a 404 is reported without running the
        # XPath checks, matching the order of the individual conditions.
        is_broken = (
            response.status == 404
            or selector.xpath('/html/head/title/text()[contains(.,"invalid")]')
            or selector.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]')
            or selector.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]')
        )
        if is_broken:
            return self._build_items(response, sites)
        return []
By default,它会在 a
和 area
标签的 href
属性中查找链接。
您只需要额外配置 attrs
参数并包含 ng-href
属性:
# ``callback`` and ``follow`` belong to Rule; LinkExtractor only takes the
# extraction options, so the extractor is wrapped in a Rule here.
Rule(LinkExtractor(attrs=['href', 'ng-href']), callback="parse_items", follow=False),