Scrapy :: CSV 导出问题
Scrapy :: Issues with CSV exporting
我正在尝试使用 Scrapy 将抓取到的项目(item)导出为 CSV 文件,并让每个字段都用双引号括起来。目前 CSV 可以正确导出,但当我修改项目字段、手动在值两侧加上双引号时,生成的 CSV 中该字段会被三重双引号包围。下面是我所做尝试的一个例子:
Scrapy 代码
import scrapy
from tutorial.items import StoreItem
class SecilSpider(scrapy.Spider):
    """Crawl secilstore.com listings and emit one item per product colour.

    ``parse`` is used as the callback for both the start (listing) requests
    and the product links discovered on each page.
    """

    name = "secil"
    allowed_domains = ["secilstore.com"]

    def start_requests(self):
        """Return the initial requests for the three listing categories.

        Only page 1 of each category is requested (``xrange(1, 2)``); the
        combined URL list is reversed before the requests are built.
        """
        start_urls = reversed(
            ["http://www.secilstore.com/yeni_liste/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
            ["http://www.secilstore.com/yeni_liste/Magaza/Aksesuar_32/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
            ["http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33/Sayfa/{0}".format(page) for page in xrange(1, 2)]
        )
        return [scrapy.Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Follow product links, then yield one ``StoreItem`` per colour.

        Yields:
            ``scrapy.Request`` objects for every product link on the page,
            followed by one populated ``StoreItem`` for each colour found.
        """
        for url in response.xpath('//div[@class="image"]/a/@href').extract():
            yield scrapy.Request("http://www.secilstore.com" + url, callback=self.parse)

        baseUrl = response.request.headers.get('Referer', None)
        # NOTE(review): the original indentation was lost; it is assumed that
        # item extraction only runs when a Referer header is present -- confirm.
        if baseUrl is None:
            return
        # Keep only the part of the referring listing URL before 'Sayfa'.
        baseUrl = baseUrl.split('Sayfa')[0]

        colors = response.xpath('//a[@class="renk"]/text()').extract()
        if not colors:
            # No colours means no items; skip the page-level lookups below,
            # which index [0] and would raise on pages lacking those nodes.
            return

        # Page-level fields are identical for every colour, so query them once.
        imageUrl = "http://www.secilstore.com" + response.xpath('//img[@id="productMainImage"]/@src').extract()[0]
        price = response.xpath('//span[@class="price cufonHover"]/text()').extract()[0] + "TL"
        title = response.xpath('//h2[@class="cufon"]/text()').extract()
        brand = response.xpath('//h3[@class="slogan cufonSemi"]/text()').extract()[0]
        size = '|'.join(s.strip() for s in response.xpath('//a[@class="inStock"]/text()').extract())
        oldPrice = response.xpath('//div[@class="indirimFiyat"]/text()').extract()

        for c in colors:
            # A fresh item per colour: re-yielding one shared instance would
            # let later iterations overwrite items already handed downstream.
            item = StoreItem()
            item['url'] = baseUrl
            item['productUrl'] = response.url
            item['imageUrl'] = imageUrl
            item['color'] = c
            item['price'] = price
            item['title'] = title
            item['brand'] = brand
            # -1 marks "no sizes in stock" / "no previous price".
            item['size'] = size if size else -1
            item['oldPrice'] = (oldPrice[0] + "TL") if oldPrice else -1
            # The original ``items.append(item)`` referenced an undefined
            # name and was removed; yielding the item is sufficient.
            yield item
我的 CSV 项目管道
class CSVPipeline(object):
    """Item pipeline that writes scraped items to a per-spider CSV file.

    The enclosing module must have ``signals`` and ``CsvItemExporter`` in
    scope; their imports are not part of this snippet.
    """

    def __init__(self):
        # Maps each open spider to the file object created for it.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect it to the spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file for *spider* and start the exporter."""
        # Named ``csv_file`` to avoid shadowing the Python 2 builtin ``file``.
        csv_file = open('/home/ali/%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = csv_file
        # NOTE(review): in Scrapy's documented signature the third positional
        # parameter is ``join_multivalued``, not a quote character -- confirm
        # that passing '"' here is intended.
        self.exporter = CsvItemExporter(csv_file, False, '"')
        self.exporter.fields_to_export = ['url', 'productUrl', 'title', 'brand', 'imageUrl', 'price', 'oldPrice', 'color', 'size']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish exporting and close the file that belongs to *spider*."""
        self.exporter.finish_exporting()
        csv_file = self.files.pop(spider)
        csv_file.close()

    def process_item(self, item, spider):
        """Export *item* and return it unchanged for later pipelines."""
        self.exporter.export_item(item)
        return item
因此,当我尝试在爬虫(spider)中修改某个字段,并像下面这样手动添加双引号时(例如,对于 item['url']):
# Wraps the URL value in literal double-quote characters before it is exported.
item['url'] = '"%s"' % baseUrl
生成的 CSV 打印出以下内容:
"""http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33""",http://www.secilstore.com/urun/5905b5c6b858458df3f4851d477eec1b/Secil-Kilit-Aksesuarli-Kisa-Sapli-Canta,Kilit Aksesuarlı Kısa Saplı Çanta,Seçil,http://www.secilstore.com/_docs/i400x500/a/a1894cadeb_Kilit-Aksesuarli-Kisa-Sapli-canta.jpg,"69,90TL","159,90TL",Ekru,-1
如您所见,第一个字段被三重双引号包围,而不是只有一对。另外有趣的是,价格字段已经被双引号引起来了。如何让每个字段都只用一对双引号括起来?
谢谢!
我通过修改 CSVItemPipeline 找到了解决办法:
# quoting=csv.QUOTE_ALL asks the CSV writer to quote every field.
self.exporter = CsvItemExporter(open(spider.name+".csv", "w"), False,
fields_to_export=self.fields_to_export, quoting=csv.QUOTE_ALL)
这让我可以生成一个 CSV 文件,其中的字段用双引号括起来。
我正在尝试使用 Scrapy 将抓取到的项目(item)导出为 CSV 文件,并让每个字段都用双引号括起来。目前 CSV 可以正确导出,但当我修改项目字段、手动在值两侧加上双引号时,生成的 CSV 中该字段会被三重双引号包围。下面是我所做尝试的一个例子:
Scrapy 代码
import scrapy
from tutorial.items import StoreItem
class SecilSpider(scrapy.Spider):
    """Crawl secilstore.com listings and emit one item per product colour.

    ``parse`` is used as the callback for both the start (listing) requests
    and the product links discovered on each page.
    """

    name = "secil"
    allowed_domains = ["secilstore.com"]

    def start_requests(self):
        """Return the initial requests for the three listing categories.

        Only page 1 of each category is requested (``xrange(1, 2)``); the
        combined URL list is reversed before the requests are built.
        """
        start_urls = reversed(
            ["http://www.secilstore.com/yeni_liste/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
            ["http://www.secilstore.com/yeni_liste/Magaza/Aksesuar_32/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
            ["http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33/Sayfa/{0}".format(page) for page in xrange(1, 2)]
        )
        return [scrapy.Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Follow product links, then yield one ``StoreItem`` per colour.

        Yields:
            ``scrapy.Request`` objects for every product link on the page,
            followed by one populated ``StoreItem`` for each colour found.
        """
        for url in response.xpath('//div[@class="image"]/a/@href').extract():
            yield scrapy.Request("http://www.secilstore.com" + url, callback=self.parse)

        baseUrl = response.request.headers.get('Referer', None)
        # NOTE(review): the original indentation was lost; it is assumed that
        # item extraction only runs when a Referer header is present -- confirm.
        if baseUrl is None:
            return
        # Keep only the part of the referring listing URL before 'Sayfa'.
        baseUrl = baseUrl.split('Sayfa')[0]

        colors = response.xpath('//a[@class="renk"]/text()').extract()
        if not colors:
            # No colours means no items; skip the page-level lookups below,
            # which index [0] and would raise on pages lacking those nodes.
            return

        # Page-level fields are identical for every colour, so query them once.
        imageUrl = "http://www.secilstore.com" + response.xpath('//img[@id="productMainImage"]/@src').extract()[0]
        price = response.xpath('//span[@class="price cufonHover"]/text()').extract()[0] + "TL"
        title = response.xpath('//h2[@class="cufon"]/text()').extract()
        brand = response.xpath('//h3[@class="slogan cufonSemi"]/text()').extract()[0]
        size = '|'.join(s.strip() for s in response.xpath('//a[@class="inStock"]/text()').extract())
        oldPrice = response.xpath('//div[@class="indirimFiyat"]/text()').extract()

        for c in colors:
            # A fresh item per colour: re-yielding one shared instance would
            # let later iterations overwrite items already handed downstream.
            item = StoreItem()
            item['url'] = baseUrl
            item['productUrl'] = response.url
            item['imageUrl'] = imageUrl
            item['color'] = c
            item['price'] = price
            item['title'] = title
            item['brand'] = brand
            # -1 marks "no sizes in stock" / "no previous price".
            item['size'] = size if size else -1
            item['oldPrice'] = (oldPrice[0] + "TL") if oldPrice else -1
            # The original ``items.append(item)`` referenced an undefined
            # name and was removed; yielding the item is sufficient.
            yield item
我的 CSV 项目管道
class CSVPipeline(object):
    """Item pipeline that writes scraped items to a per-spider CSV file.

    The enclosing module must have ``signals`` and ``CsvItemExporter`` in
    scope; their imports are not part of this snippet.
    """

    def __init__(self):
        # Maps each open spider to the file object created for it.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect it to the spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file for *spider* and start the exporter."""
        # Named ``csv_file`` to avoid shadowing the Python 2 builtin ``file``.
        csv_file = open('/home/ali/%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = csv_file
        # NOTE(review): in Scrapy's documented signature the third positional
        # parameter is ``join_multivalued``, not a quote character -- confirm
        # that passing '"' here is intended.
        self.exporter = CsvItemExporter(csv_file, False, '"')
        self.exporter.fields_to_export = ['url', 'productUrl', 'title', 'brand', 'imageUrl', 'price', 'oldPrice', 'color', 'size']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish exporting and close the file that belongs to *spider*."""
        self.exporter.finish_exporting()
        csv_file = self.files.pop(spider)
        csv_file.close()

    def process_item(self, item, spider):
        """Export *item* and return it unchanged for later pipelines."""
        self.exporter.export_item(item)
        return item
因此,当我尝试在爬虫(spider)中修改某个字段,并像下面这样手动添加双引号时(例如,对于 item['url']):
# Wraps the URL value in literal double-quote characters before it is exported.
item['url'] = '"%s"' % baseUrl
生成的 CSV 打印出以下内容:
"""http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33""",http://www.secilstore.com/urun/5905b5c6b858458df3f4851d477eec1b/Secil-Kilit-Aksesuarli-Kisa-Sapli-Canta,Kilit Aksesuarlı Kısa Saplı Çanta,Seçil,http://www.secilstore.com/_docs/i400x500/a/a1894cadeb_Kilit-Aksesuarli-Kisa-Sapli-canta.jpg,"69,90TL","159,90TL",Ekru,-1
如您所见,第一个字段被三重双引号包围,而不是只有一对。另外有趣的是,价格字段已经被双引号引起来了。如何让每个字段都只用一对双引号括起来?
谢谢!
我通过修改 CSVItemPipeline 找到了解决办法:
# quoting=csv.QUOTE_ALL asks the CSV writer to quote every field.
self.exporter = CsvItemExporter(open(spider.name+".csv", "w"), False,
fields_to_export=self.fields_to_export, quoting=csv.QUOTE_ALL)
这让我可以生成一个 CSV 文件,其中的字段用双引号括起来。