Using Scrapy to extract data inside links
I have been trying to scrape consumercomplaints.in: the complaint titles on the search results pages and the data behind each title's link. I wrote the code below, but I cannot follow the links and extract the data on those pages, and I also cannot collect all of the related links. Please advise.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem

class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    #start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)), callback="parse_health", follow=True),
    )

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.select('.//td[@class="complaint"]/a/span/text()').extract()
            item['link'] = site.select('.//td[@class="complaint"]/a/@href').extract()
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield Request(item['link'],
                              meta={'item': item},
                              callback=self.anchor_page)
            # item['intro'] = site.select('.//td[@class="small"]//a[2]/text()').extract()
            # item['heading'] = site.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            # item['date'] = site.select('.//td[@class="small"]/text()[2]').extract()
            # item['complaint'] = site.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)

    def anchor_page(self, response):
        hxs = Selector(response)
        old_item = response.request.meta['item']  # receiving the item from parse() that was passed in Request meta
        # parse some more values and place them in old_item, e.g.:
        old_item['data'] = hxs.select('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
Are you using an old version of Scrapy? In the latest stable release you no longer need to do hxs = Selector(response) or use the hxs.select() method; you can do the same thing with response.xpath().
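As a minimal illustration of the difference (assuming this runs inside a spider callback that receives a response, and noting that the older form may only emit a deprecation warning rather than fail on recent versions), both of these return the same strings:

    # Older API: wrap the response in a Selector and call .select()
    from scrapy.selector import Selector
    hxs = Selector(response)
    titles = hxs.select('//td[@class="complaint"]/a/span/text()').extract()

    # Newer API: call .xpath() directly on the response
    titles = response.xpath('//td[@class="complaint"]/a/span/text()').extract()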
I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do something like this:

    link = site.select('.//td[@class="complaint"]/a/@href').extract()
    if link:
        item['link'] = link[0]

You probably want to do something similar for the title as well.
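If you are on a reasonably recent Scrapy release, the selector list also has an extract_first() helper that returns None when nothing matches, which makes this a bit shorter; a sketch, assuming that method is available in your version:

    # extract_first() returns the first match, or None instead of raising IndexError
    link = site.xpath('.//td[@class="complaint"]/a/@href').extract_first()
    if link:
        item['link'] = link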
Edit: I made a few changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin

class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()

class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
            callback="parse",
            follow=True),
    )

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                # follow the complaint link and pass the partially filled item along
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        # pick up the item created in parse() and add the complaint text from the detail page
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
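To try the spider, assuming it lives inside a normal Scrapy project and keeps the spider name "comp" from the code above, you can run it from the project directory and dump the scraped items to a file, for example with scrapy crawl comp -o complaints.json.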