1-level down crawling using Scrapy
I am very new to Python. I am trying to use Scrapy to print (and save) all the blog posts on a website, and I want the spider to crawl only within the main content section. Here is my code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from people.items import PeopleCommentItem

class people(CrawlSpider):
    name = "people"
    allowed_domains = ["http://blog.sina.com.cn/"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]

    rules = [
        Rule(SgmlLinkExtractor(allow=("http://blog.sina.com.cn/",)), callback='parse_item', follow=True),
        # restrict the crawling in the articalContent section only
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="articalContent "]//a/@href'))),
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        print hxs.select('//div[@class="articalContent "]//a/text()').extract()
Nothing gets printed after this:
DEBUG: Crawled (200) <GET http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html> (referer: None)
ScrapyDeprecationWarning: scrapy.selector.HtmlXPathSelector is deprecated, instantiate scrapy.Selector instead.
hxs=HtmlXPathSelector(response)
ScrapyDeprecationWarning: Call to deprecated function select. Use .xpath() instead.
titles= hxs.select('//div[@class="articalContent "]//a/text()').extract()
2015-03-09 15:46:47-0700 [people] INFO: Closing spider (finished)
Can someone tell me where it went wrong?
Thanks!!
I had some success with this:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request

class people(CrawlSpider):
    name = "people"
    allowed_domains = ["http://blog.sina.com.cn/"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("http://blog.sina.com.cn/",)), callback='parse_item', follow=True),
        # restrict the crawling in the articalContent section only
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[contains(@class, "articalContent")]'))),
    )

    def parse(self, response):
        links = Selector(text=response.body).xpath('//div[contains(@class, "articalContent")]//a//text()')
        for link in links:
            print link.extract()
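For reference, the deprecation warnings in the log point at the newer Scrapy API. Below is a minimal sketch of the same idea written against current imports (scrapy.spiders, scrapy.linkextractors, response.xpath, Python 3 print). The class name PeopleSpider and the parse_item callback are illustrative choices, and the two rules above are merged into one; treat it as an illustration, not a drop-in replacement for the answer's code.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class PeopleSpider(CrawlSpider):
    name = "people"
    # bare domain, not a full URL, so offsite filtering works as intended
    allowed_domains = ["blog.sina.com.cn"]
    start_urls = ["http://blog.sina.com.cn/s/blog_53d7b5ce0100e7y0.html"]

    # single rule: only follow links found inside the article-content div and
    # send every fetched page to parse_item (not parse, which CrawlSpider
    # reserves for its own rule handling)
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[contains(@class, "articalContent")]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # response.xpath() replaces the deprecated HtmlXPathSelector/.select()
        for text in response.xpath('//div[contains(@class, "articalContent")]//a//text()').extract():
            print(text)

Two details worth noting with CrawlSpider: allowed_domains should hold bare domains (e.g. blog.sina.com.cn) rather than full URLs, and overriding parse disables the rules because CrawlSpider uses parse internally, which is why the callback here is named parse_item.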