Scrapy: export parsed data into multiple files
I want to parse pages and then export some items to one CSV file and other items to another file.
Using feed exports here, I managed to do it for a single file as follows:
Settings:
FEED_EXPORT_FIELDS = (
    'url',
    'group_url',
    'name',
    'streetAddress',
    'city',
    'addressCountry',
)
FEED_FORMAT = 'csv'
FEED_URI = 'output/%(name)s_%(time)s.csv'
But as I said, the above only exports to one CSV file. I'd like to be able to scrape the other fields into a second file:
FEED_EXPORT_FIELDS = (
    'employee',
    'employee_group',
)
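(Side note for later readers: Scrapy 2.1+ replaces FEED_FORMAT/FEED_URI with the FEEDS dict, and Scrapy 2.6+ adds per-feed item filtering, so the two outputs can be declared directly in settings. A minimal sketch, assuming Scrapy >= 2.6, the two item classes from the answer below, and a hypothetical myproject.items module path:

# settings.py: one feed per item class (assumes Scrapy >= 2.6)
FEEDS = {
    'output/products_%(time)s.csv': {
        'format': 'csv',
        'fields': ['url', 'group_url', 'name', 'streetAddress', 'city', 'addressCountry'],
        'item_classes': ['myproject.items.ProductItemLoader'],  # hypothetical module path
    },
    'output/employees_%(time)s.csv': {
        'format': 'csv',
        'fields': ['employee', 'employee_group'],
        'item_classes': ['myproject.items.EmployeeLoader'],  # hypothetical module path
    },
}
)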
My spider's parse method:
def parse(self, response):
    l = BasicItemLoader(item=ProductItemLoader(), response=response)
    l.default_input_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    # l.default_output_processor = Compose(TakeFirst())
    l.add_value('url', response.request.url)
    l.add_value('group_url', response.meta.get('section', ''))
    l.add_css('name', 'h1[itemprop="name"]::text')
    l.add_css('streetAddress', 'div[itemprop="address"] [itemprop="streetAddress"]::text')
    l.add_css('city', 'div[itemprop="address"]>p::text')
    l.add_css('addressCountry', 'div[itemprop="address"] [itemprop="addressCountry"]::text')
    l.add_css('phone', ".phoneCompany>input[id*='freePhone']::attr(value)", TakeFirst())
    l.add_css('summary', 'span[itemprop="description"]::text')
    l.add_xpath('year', "//td[contains(text(),'Year established')]/following-sibling::td/text()")
    l.add_xpath('registry', "//td[contains(text(),'Registry of commerce')]/following-sibling::td/text()")
    l.add_xpath('legal_form', "//td[contains(text(),'Legal form')]/following-sibling::td/text()")
    l.add_xpath('vat', "//td[contains(text(),'VAT')]/following-sibling::td/text()")
    l.add_xpath('fax', "//td[contains(text(),'Fax')]/following-sibling::td/text()")
    l.add_css('website', "[id*='webSite_presentation_']::text")
    l.add_css('brands', "#company-tradenames .tradeName::text")
    l.add_xpath('banks', "//h3[contains(text(),'Banks')]/following-sibling::div//strong/text()")
    l.add_css('export_area', "#exportZones>span:nth-of-type(2)::text")
    l.add_css('import_area', "#importZones>span:nth-of-type(2)::text")
    l.add_css('export_countries', "#exportCountries>span:nth-of-type(2)::text")
    l.add_css('import_countries', "#importCountries>span:nth-of-type(2)::text")
    l.add_css('employees', ".employees.bloc .effectif p::text")
    l.add_css('turn_over', ".turnover.bloc li:nth-of-type(1)>p:nth-of-type(2)::text")
    return l.load_item()
And the item definition:
class ProductItemLoader(scrapy.Item):
    url = scrapy.Field()
    group_url = scrapy.Field()
    name = scrapy.Field()
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()
    phone = scrapy.Field()
    summary = scrapy.Field()
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()
    fax = scrapy.Field()
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()
    employees = scrapy.Field()
    turn_over = scrapy.Field()
You'll have to split your item definition in two, so that each field set can be saved to its own CSV file, and then route each item type to its own exporter in a pipeline.
items.py:
import scrapy

class ProductItemLoader(scrapy.Item):
    url = scrapy.Field()
    group_url = scrapy.Field()
    name = scrapy.Field()
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()
    phone = scrapy.Field()
    summary = scrapy.Field()
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()

class EmployeeLoader(scrapy.Item):
    fax = scrapy.Field()
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()
    employees = scrapy.Field()
    turn_over = scrapy.Field()
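With the fields split across two item classes, the spider's parse callback has to yield one item of each type instead of returning a single combined item. A minimal sketch reusing the loader setup from the question (the elided add_css/add_xpath calls stay exactly as they were):

def parse(self, response):
    # One loader per item class; the field calls are the same as in the question.
    product = BasicItemLoader(item=ProductItemLoader(), response=response)
    product.add_value('url', response.request.url)
    product.add_css('name', 'h1[itemprop="name"]::text')
    # ... the remaining ProductItemLoader fields ...
    yield product.load_item()

    employee = BasicItemLoader(item=EmployeeLoader(), response=response)
    employee.add_xpath('fax', "//td[contains(text(),'Fax')]/following-sibling::td/text()")
    employee.add_css('employees', ".employees.bloc .effectif p::text")
    # ... the remaining EmployeeLoader fields ...
    yield employee.load_item()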
pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

def item_type(item):
    # Return the item's class name; it is also used as the CSV file name.
    return type(item).__name__

class YourSitePipelineHere(object):
    # For simplicity, the CSV file names match the item class names
    # defined in items.py.
    fileNamesCsv = ['ProductItemLoader', 'EmployeeLoader']

    def __init__(self):
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = {name: open('project_name_' + name + '.csv', 'wb') for name in self.fileNamesCsv}
        for name in self.fileNamesCsv:
            self.exporters[name] = CsvItemExporter(self.files[name])
            if name == 'ProductItemLoader':
                self.exporters[name].fields_to_export = ['url', 'group_url', 'name', 'streetAddress', 'addressCountry', 'city', 'phone', 'summary', 'year', 'registry', 'legal_form', 'vat']
            if name == 'EmployeeLoader':
                self.exporters[name].fields_to_export = ['fax', 'website', 'brands', 'banks', 'import_area', 'import_countries', 'export_area', 'export_countries', 'employees', 'turn_over']
            self.exporters[name].start_exporting()

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files.values():
            f.close()

    def process_item(self, item, spider):
        type_name = item_type(item)
        if type_name in self.fileNamesCsv:
            self.exporters[type_name].export_item(item)
        return item
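The pipeline also has to be enabled in settings.py; the module path below is a placeholder for your own project:

# settings.py: register the custom pipeline (module path is a placeholder)
ITEM_PIPELINES = {'yoursite.pipelines.YourSitePipelineHere': 300}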
Neil
#items.py
import scrapy

class JnuItem(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
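For context, a minimal spider that would produce these items might look like the sketch below; the start URL and CSS selectors are hypothetical placeholders, only the item fields come from this answer:

import scrapy
from seminar.items import JnuItem

class SeminarSpider(scrapy.Spider):
    name = 'seminar'
    start_urls = ['https://example.com/events']  # hypothetical URL

    def parse(self, response):
        for row in response.css('.event'):  # hypothetical selector
            item = JnuItem()
            item['date'] = row.css('.date::text').get()    # hypothetical selector
            item['title'] = row.css('.title::text').get()  # hypothetical selector
            item['link'] = row.css('a::attr(href)').get()
            yield item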
#pipelines.py
from itemadapter import ItemAdapter
from scrapy.exporters import CsvItemExporter

class SeminarPipeline:
    def open_spider(self, spider):
        self.files = {}
        self.exporters = {}

    def close_spider(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files.values():
            f.close()

    def _exporter_for_item(self, item):
        # Lazily create one exporter per category and reuse it for later items,
        # instead of reopening the file and restarting the export on every item.
        adapter = ItemAdapter(item)
        title = str(adapter['title']).lower()
        for category in ('webinar', 'workshop', 'conference'):
            if category in title:
                if category not in self.exporters:
                    self.files[category] = open(category + '7.csv', 'ab')
                    exporter = CsvItemExporter(self.files[category], include_headers_line=False)
                    exporter.fields_to_export = ['date', 'title', 'link']
                    exporter.start_exporting()
                    self.exporters[category] = exporter
                return self.exporters[category]
        return None  # title matched no category

    def process_item(self, item, spider):
        exporter = self._exporter_for_item(item)
        if exporter is not None:
            exporter.export_item(item)
        return item
#settings.py
ITEM_PIPELINES = {'seminar.pipelines.SeminarPipeline': 300,}