Extracting data with Scrapy by looping through subpages

On my site I have a page containing a list of staff members. Each staff member's name links to their own individual page.

I want to output a CSV file listing each staff member's name and titles, so the spider needs to loop through each of the links on the staff list page and extract the name and titles from each person's page.

So far the code pulls out the name and titles fine for a single page. The problem I'm having is getting it to loop through each person's page to build the full list.

How do I get this loop to work?

import scrapy

from myproject.items import scrapeItem  # hypothetical items module path; adjust to your project


class scrapeSpider(scrapy.Spider):
    name = "scrape"
    allowed_domains = ["example.com", "example.co.uk"]
    start_urls = [
        'http://example.com/stafflist/',
    ]

    def parse(self, response):
        # follow each staff member's link from the list page
        for href in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), " span8 ")]//a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_SCRAPE)

    def parse_SCRAPE(self, response):
        # extract the name and titles from an individual person's page
        items = []
        for sel in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), " span9 ")]'):
            item = scrapeItem()
            item['name'] = sel.xpath('h1/text()').extract()
            item['titles'] = sel.xpath('h2/text()').extract()
            print(item['name'], item['titles'])
            items.append(item)
        return items
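
For reference, the scrapeItem class used above isn't shown in the question. A minimal items.py matching those fields would presumably look something like this (the module path is whatever your project uses):

import scrapy

class scrapeItem(scrapy.Item):
    # fields matching the assignments in parse_SCRAPE
    name = scrapy.Field()
    titles = scrapy.Field()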

Use CrawlSpider. For example:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from myspider.items import PersonItem

from pyquery import PyQuery as pq  # PyQuery is awesome!

class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/stafflist/']

    rules = (
        # if there is a paginator, this Rule extracts and follows its links
        Rule(LinkExtractor(
             restrict_xpaths=('//div[@class="paging"]//a[last()]')),
             follow=True),
        # restrict crawler to look for links only inside restrict_xpaths
        # and then process those links with 'parse_item'
        Rule(LinkExtractor(
             restrict_xpaths=('//div[contains(concat(" ",normalize-space(@class)," "), " span8 ")]//a')),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        """
        process a person's page
        """
        # parse the page with PyQuery; use a local variable rather than
        # storing state on the spider, since callbacks run concurrently
        doc = pq(response.body)

        i = PersonItem()
        i["name"] = doc("h1").text()
        i["titles"] = doc("h2").text()
        ...

        return i
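
Here PersonItem is assumed to be defined analogously to scrapeItem above, with name and titles fields. Note that neither spider needs any extra code for the CSV the question asks for: Scrapy's built-in feed exports can write the scraped items straight to a file from the command line, e.g.

scrapy crawl myspider -o staff.csv

Each scraped item becomes a row in the CSV, with columns for the item's fields.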