使用循环子页面的 Scrapy 提取数据
Extracting data with Scrapy which loops subpages
我的网站上有一个包含工作人员列表的页面。每个工作人员姓名都链接到他们自己的个人页面。
我想输出一个 csv 文件,其中列出了每位员工的姓名和职位,因此蜘蛛程序需要遍历员工列表页面上的每个链接,提取姓名和职位。
至此,这个码字只把姓氏和职称拉出来就行了。我遇到的问题是让它通过每个人的页面以获得完整列表。
如何让这个循环工作?
class scrapeSpider(scrapy.Spider):
name = "scrape"
allowed_domains = ["example.com", "example.co.uk"]
start_urls = [
'http://example.com/stafflist/',
]
def parse(self, response):
for href in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), "span8")]//a/@href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_SCRAPE)
def parse_SCRAPE(self, response):
items = []
for sel in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), "span9")]'):
item = scrapeItem()
item['name'] = sel.xpath('h1/text()').extract()
item['titles'] = sel.xpath('h2/text()').extract()
print item['name'], item['titles']
items.append(item)
return items
使用CrawlSpider。例如
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from myspider.items import PersonItem
from pyquery import PyQuery as pq # PyQuery is awesome!
from urlparse import urlparse, parse_qs
class MySpider(CrawlSpider):
name = 'myspider'
allowed_domains = ['example.si']
start_urls = ['http://example.com/stafflist/']
rules = (
# if you have paginator this Rule will extract links
Rule(LinkExtractor(
restrict_xpaths=('//div[@class="paging"]//a[last()]')),
follow=True),
# restrict crawler to look for links only inside restrict_xpaths
# and then process those links with 'parse_item'
Rule(LinkExtractor(
restrict_xpaths=('//div[contains(concat(" ",normalize-space(@class)," "), "span8")]//a/@href')),
callback='parse_item',
follow=False),
)
def parse_item(self, response):
"""
process persons page
"""
self.response = response
self.doc = pq(self.response.body)
i = PersonItem()
i["name"] = self.doc("h1").text()
i["titles"] = self.doc("h2").text()
...
return i
我的网站上有一个包含工作人员列表的页面。每个工作人员姓名都链接到他们自己的个人页面。
我想输出一个 csv 文件,其中列出了每位员工的姓名和职位,因此蜘蛛程序需要遍历员工列表页面上的每个链接,提取姓名和职位。
至此,这个码字只把姓氏和职称拉出来就行了。我遇到的问题是让它通过每个人的页面以获得完整列表。
如何让这个循环工作?
class scrapeSpider(scrapy.Spider):
name = "scrape"
allowed_domains = ["example.com", "example.co.uk"]
start_urls = [
'http://example.com/stafflist/',
]
def parse(self, response):
for href in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), "span8")]//a/@href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_SCRAPE)
def parse_SCRAPE(self, response):
items = []
for sel in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), "span9")]'):
item = scrapeItem()
item['name'] = sel.xpath('h1/text()').extract()
item['titles'] = sel.xpath('h2/text()').extract()
print item['name'], item['titles']
items.append(item)
return items
使用CrawlSpider。例如
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from myspider.items import PersonItem
from pyquery import PyQuery as pq # PyQuery is awesome!
from urlparse import urlparse, parse_qs
class MySpider(CrawlSpider):
name = 'myspider'
allowed_domains = ['example.si']
start_urls = ['http://example.com/stafflist/']
rules = (
# if you have paginator this Rule will extract links
Rule(LinkExtractor(
restrict_xpaths=('//div[@class="paging"]//a[last()]')),
follow=True),
# restrict crawler to look for links only inside restrict_xpaths
# and then process those links with 'parse_item'
Rule(LinkExtractor(
restrict_xpaths=('//div[contains(concat(" ",normalize-space(@class)," "), "span8")]//a/@href')),
callback='parse_item',
follow=False),
)
def parse_item(self, response):
"""
process persons page
"""
self.response = response
self.doc = pq(self.response.body)
i = PersonItem()
i["name"] = self.doc("h1").text()
i["titles"] = self.doc("h2").text()
...
return i