Scrapy isn't extracting the title correctly
In this code I want to scrape the title, subtitle and data inside each link, but there is a problem on pages other than 1 and 2: only one item gets scraped. I also only want to extract the entries whose title mentions Delhivery.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin

from delhivery.items import DelhiveryItem


class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = DelhiveryItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span[@style="background-color:yellow"]/text()').extract()[0]
            #item['title'] = site.xpath('.//td[@class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
            item['subtitle'] = site.xpath('.//td[@class="compl-text"]/div/b[1]/text()').extract()[0]
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[0].strip()
            item['username'] = site.xpath('.//td[@class="small"]/a[2]/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@style="padding-bottom:15px"]/div/text()').extract()[0]
        yield old_item
You need to change item['title'] to:
item['title'] = ''.join(site.xpath('.//span[text()="Delhivery"]/parent::*//text()').extract())
Also edit sites so that you only extract the links you need (the ones that contain Delhivery):
sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
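Putting the two changes together, a minimal sketch of the adjusted parse loop might look like this (the remaining fields stay exactly as in the question; note the title XPath is now relative, so each iteration searches its own block instead of the whole document, and extract() without [0] joins all text nodes so the highlighted span no longer splits the title into fragments):

def parse(self, response):
    # Keep only the result blocks that actually mention Delhivery
    sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
    for site in sites:
        item = DelhiveryItem()
        # Join every text node under the link, so the highlighted
        # <span> no longer fragments the title
        item['title'] = ''.join(
            site.xpath('.//span[text()="Delhivery"]/parent::*//text()').extract())
        # ... populate subtitle, date, username and link as before ...
        yield item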
EDIT:

So I now understand that you need to add a pagination rule to your code. It should be something like this: you just need to add the imports and write the new XPaths from the item's link itself, for example this one:
class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        # Extract pagination links, allowing only links with page=number
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]',), allow=('page=\d+',), unique=True), follow=True),
        # Extract the item links on each page the spider reaches via the first rule
        Rule(SgmlLinkExtractor(restrict_xpaths=('//td[@class="complaint"]',)), callback='parse_item'),
    )

    def parse_item(self, response):
        item = DelhiveryItem()
        # Populate the item object here the same way you did; this function
        # will be called for each item link.
        # This means that you'll be extracting data from pages like this one:
        # http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
        item['title'] = response.xpath('<write xpath>').extract()[0]
        item['subtitle'] = response.xpath('<write xpath>').extract()[0]
        item['date'] = response.xpath('<write xpath>').extract()[0].strip()
        item['username'] = response.xpath('<write xpath>').extract()[0]
        item['link'] = response.url
        item['data'] = response.xpath('<write xpath>').extract()[0]
        yield item
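As a side note, one plausible reason only a single item comes out on some pages is that one of the extract()[0] calls hits an XPath with no match and raises IndexError, which aborts the whole loop. A small helper keeps one bad row from killing the rest (first_or_default is a hypothetical name, not a Scrapy API; on Scrapy 1.0+ the built-in extract_first() serves the same purpose):

def first_or_default(selector, xpath, default=''):
    # Hypothetical helper: return the first XPath match, or a default
    # instead of raising IndexError on an empty result list.
    results = selector.xpath(xpath).extract()
    return results[0] if results else default

# Usage inside parse_item:
# item['date'] = first_or_default(response, '//td[@class="small"]/text()').strip()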
I also suggest that when writing XPaths you avoid styling attributes: try to use @class or @id, and fall back to @width, @style or any other styling attribute only when there is no other way.
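To illustrate, here is the data field selected both ways; the first selector comes from the question and breaks as soon as the inline CSS changes, while the second uses a class name taken from the question's subtitle selector as a stand-in (whether that class wraps the same cell is an assumption to verify against the page):

# Fragile: tied to presentation, breaks if the inline style changes
data = response.xpath('//td[@style="padding-bottom:15px"]/div/text()').extract()

# Sturdier: tied to semantic markup (class name assumed from the question)
data = response.xpath('//td[@class="compl-text"]/div/text()').extract()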