1-level down crawling using scrapy

I am very new to Python. I am trying to print (and save) all the blog posts on a website using Scrapy, and I want the spider to crawl only within the main content section. Here is my code:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from people.items import PeopleCommentItem

class people(CrawlSpider):
    name = "people"
    allowed_domains = ["http://blog.sina.com.cn/"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]
    rules = [
        Rule(SgmlLinkExtractor(allow=("http://blog.sina.com.cn/",)), callback='parse_item', follow=True),
        # restrict the crawling to the articalContent section only
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="articalContent   "]//a/@href'))),
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        print hxs.select('//div[@class="articalContent   "]//a/text()').extract()

Nothing gets printed after this:

DEBUG: Crawled (200) <GET http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html> (referer: None)
ScrapyDeprecationWarning: scrapy.selector.HtmlXPathSelector is deprecated, instantiate scrapy.Selector instead.
  hxs=HtmlXPathSelector(response)
ScrapyDeprecationWarning: Call to deprecated function select. Use .xpath() instead.
  titles= hxs.select('//div[@class="articalContent   "]//a/text()').extract()
2015-03-09 15:46:47-0700 [people] INFO: Closing spider (finished)

Can someone tell me where this is going wrong?

Thanks!!
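
A side note on those deprecation warnings: they already name the replacements, scrapy.Selector and .xpath(). A minimal sketch of an equivalent parse method, assuming the same response object; contains() is used here because the class attribute carries trailing spaces:

from scrapy.selector import Selector

def parse(self, response):
    # Selector + .xpath() are the non-deprecated equivalents of
    # HtmlXPathSelector + .select()
    sel = Selector(response)
    for text in sel.xpath('//div[contains(@class, "articalContent")]//a/text()').extract():
        print text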

I had some success with this:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request

class people(CrawlSpider):
    name = "people"
    allowed_domains = ["blog.sina.com.cn"]  # bare domain name; scheme and path do not belong in allowed_domains
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("http://blog.sina.com.cn/",)), callback='parse_item', follow=True),
        # restrict the crawling to the articalContent section only
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[contains(@class, "articalContent")]',))),
    )

    def parse(self, response):
        links = Selector(text=response.body).xpath('//div[contains(@class, "articalContent")]//a//text()')
        for link in links:
            print link.extract()
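
One caveat worth adding, because it explains the behavior in both snippets: the Scrapy docs warn that CrawlSpider uses the parse method internally, so defining your own parse silently disables the rules, which is why only the start page is ever processed. A minimal sketch of the conventional layout, with the callback renamed to parse_item so the rules can actually fire; this is an outline under those assumptions, not tested against the live site:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

class people(CrawlSpider):
    name = "people"
    allowed_domains = ["blog.sina.com.cn"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]
    rules = (
        # follow only links found inside the article body and hand
        # every matching page to parse_item
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[contains(@class, "articalContent")]',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # never name a CrawlSpider callback "parse"
        for text in Selector(response).xpath('//div[contains(@class, "articalContent")]//a/text()').extract():
            print text

With callback and follow=True on the same rule, every article page is both scraped and mined for further links, which matches the one-level-down intent of the question.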