Global ItemLoader - Share among multiple spiders
I'm new to Scrapy/Python.
I want to crawl multiple websites, but from each one I only need three items, "date", "cota" and "name", which are updated daily and always have the same XPath.
After scraping them all, I want to export everything to a single CSV file, but with my code I get the following format,
when what I want is something like this.
I asked specifically about sharing the same ItemLoader among multiple spiders because that's the approach that occurred to me, but I'm open to other alternatives.
This is the script I have so far, for two websites; I'll add more spiders later.
By the way, with code like this, is it possible for values to get mixed up, given that Scrapy is asynchronous?
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader


class fundo(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    cota = scrapy.Field()
    date = scrapy.Field()


class ModalSpider(scrapy.Spider):
    name = 'modal'
    allowed_domains = ['modalasset.com.br']
    start_urls = ['http://modalasset.com.br/cotas-diarias/']

    def parse(self, response):
        l = ItemLoader(item=fundo(), response=response)
        name = response.xpath("//tr[@class='row-6 even']/td/a/text()").extract_first()
        date = response.xpath("//tr[@class='row-6 even']/td/text()")[0].extract()
        cota = response.xpath("//tr[@class='row-6 even']/td/text()")[1].extract()
        l.add_value('name', name)
        l.add_value('date', date)
        l.add_value('cota', cota)
        return l.load_item()


class KapitaloSpider(scrapy.Spider):
    name = 'kapitalo'
    allowed_domains = ['kapitalo.com.br/relatorios.']
    start_urls = ['http://kapitalo.com.br/relatorios.html']

    def parse(self, response):
        l = ItemLoader(item=fundo(), response=response)
        name = response.xpath("//tr[@class='odd']")[1].xpath("td//text()")[0].extract()
        date = response.xpath("//*[@class='event layout_full block bygone']/h2/text()")[0].extract()
        date = date.replace(' Cotas do Dia: ', '')
        cota = response.xpath("//tr[@class='odd']")[1].xpath("td//text()")[1].extract()
        l.add_value('name', name)
        l.add_value('date', date)
        l.add_value('cota', cota)
        return l.load_item()


process = CrawlerProcess({
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'result.csv'
})
process.crawl(ModalSpider)
process.crawl(KapitaloSpider)
process.start()  # the script will block here until all crawling jobs are finished
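As an aside on the "share among multiple spiders" idea: since every site yields the same three fields, the ItemLoader boilerplate could be factored into a base spider, with each subclass declaring only its own XPaths. Below is a minimal sketch under that assumption; FundoBaseSpider and the *_xpath attributes are my own naming, not part of the original script, and a site like Kapitalo that needs extra date cleanup would still override parse:

import scrapy
from scrapy.loader import ItemLoader


class fundo(scrapy.Item):
    name = scrapy.Field()
    cota = scrapy.Field()
    date = scrapy.Field()


class FundoBaseSpider(scrapy.Spider):
    # Hypothetical base class: subclasses only fill in their XPaths.
    name_xpath = None
    date_xpath = None
    cota_xpath = None

    def parse(self, response):
        # One loader is created per response, so concurrently running
        # spiders cannot mix values into each other's items.
        l = ItemLoader(item=fundo(), response=response)
        l.add_xpath('name', self.name_xpath)
        l.add_xpath('date', self.date_xpath)
        l.add_xpath('cota', self.cota_xpath)
        yield l.load_item()


class ModalSpider(FundoBaseSpider):
    name = 'modal'
    allowed_domains = ['modalasset.com.br']
    start_urls = ['http://modalasset.com.br/cotas-diarias/']
    name_xpath = "//tr[@class='row-6 even']/td/a/text()"
    date_xpath = "(//tr[@class='row-6 even']/td/text())[1]"  # XPath is 1-indexed
    cota_xpath = "(//tr[@class='row-6 even']/td/text())[2]"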
Another approach I tried was the code below, but add_value kept replacing the old values in the ItemLoader and I couldn't figure out why, so it only returned the values from the last website.
I'd rather use the first version, since it lets me use different kinds of spiders, and for one of the websites I may need Selenium.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader
from scrapy.http import Request


class FundoItem(scrapy.Item):
    name = scrapy.Field()
    date = scrapy.Field()
    cota = scrapy.Field()


class RankingSpider(scrapy.Spider):
    name = 'Ranking'
    allowed_domains = ['modalasset.com.br',
                       'kapitalo.com.br'
                       ]
    start_urls = ['http://modalasset.com.br/cotas-diarias/']

    def parse(self, response):
        l = ItemLoader(item=FundoItem(), response=response)
        name = response.xpath("//tr[@class='row-6 even']/td/a/text()").extract_first()
        date = response.xpath("//tr[@class='row-6 even']/td/text()")[0].extract()
        cota = response.xpath("//tr[@class='row-6 even']/td/text()")[1].extract()
        # item['name'] = name
        # item['date'] = date
        # item['cota'] = cota
        l.add_value('name', name)
        l.add_value('date', date)
        l.add_value('cota', cota)
        yield Request(url="http://kapitalo.com.br/relatorios.html",
                      callback=self.parse_2,
                      meta={'item': l.load_item()})

    def parse_2(self, response):
        name = response.xpath("//tr[@class='odd']")[1].xpath("td//text()")[0].extract()
        date = response.xpath("//*[@class='event layout_full block bygone']/h2/text()")[0].extract()
        date = date.replace(' Cotas do Dia: ', '')
        cota = response.xpath("//tr[@class='odd']")[1].xpath("td//text()")[1].extract()
        l = ItemLoader(item=response.meta['item'])
        l.add_value('name', name)
        l.add_value('date', date)
        l.add_value('cota', cota)
        return l.load_item()
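A side note on the add_value observation: ItemLoader.add_value() is documented to collect values rather than overwrite them, so with no output processors declared, each field comes out of load_item() as a list of everything that was added. A standalone sketch of that behaviour, reusing the FundoItem above:

from scrapy.loader import ItemLoader

l = ItemLoader(item=FundoItem())
l.add_value('name', 'first site')
l.add_value('name', 'second site')
print(l.load_item())
# expected: {'name': ['first site', 'second site']}

If a single value per field is wanted instead, declaring a TakeFirst() output processor on the fields is the usual approach; either way, the overwriting seen here most likely comes from the surrounding loader/meta handling rather than from add_value itself.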
It turned out to be a problem with how Windows interprets newlines in CSV files.
I solved it by using the following code instead of Scrapy's feed export (the FEED_FORMAT/FEED_URI settings above):
import csv

with open('test.csv', mode='a', newline='\n') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([name, date, cota, url])
The newline='\n' argument is what solved the problem.
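For reference, the csv module documentation recommends opening the file with newline='' when using csv.writer. newline='\n' happens to work as well because either value disables newline translation, so the writer's own '\r\n' row terminator is no longer expanded to '\r\r\n' on Windows, which is what produced the blank lines. The documented form would be (same name, date, cota and url variables as in the snippet above):

import csv

with open('test.csv', mode='a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([name, date, cota, url])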
The final code became:
# Scrapy framework
# CrawlerProcess to run multiple spiders
# csv to export
# sys, inspect to find all the classes (spiders) in this script
import scrapy
from scrapy.crawler import CrawlerProcess
import csv
import sys, inspect
import datetime


# SPIDER DEFINITIONS
class ModalSpider(scrapy.Spider):
    name = 'modal'
    allowed_domains = ['modalasset.com.br']
    start_urls = ['http://modalasset.com.br/cotas-diarias/']

    def parse(self, response):
        name = response.xpath("//tr[@class='row-6 even']/td/a/text()").extract_first()
        date = response.xpath("//tr[@class='row-6 even']/td/text()")[0].extract()
        cota = response.xpath("//tr[@class='row-6 even']/td/text()")[2].extract()
        write(name, date, cota, response.request.url)


class KapitaloSpider(scrapy.Spider):
    name = 'kapitalo'
    allowed_domains = ['kapitalo.com.br/relatorios.']
    start_urls = ['http://kapitalo.com.br/relatorios.html']

    def parse(self, response):
        # Zeta FIQ FIM
        name = response.xpath("//tr[@class='odd']")[1].xpath("td//text()")[0].extract()
        date = response.xpath("//*[@class='event layout_full block bygone']/h2/text()")[0].extract()
        date = date.replace(' Cotas do Dia: ', '')
        date = date.replace('.', '/')
        cota = response.xpath("//tr[@class='odd']")[1].xpath("td//text()")[1].extract()
        write(name, date, cota, response.request.url)
        # Kappa FIN FIQ FIM
        name = response.xpath("//tr[@class='odd']")[0].xpath("td//text()")[0].extract()
        cota = response.xpath("//tr[@class='odd']")[0].xpath("td//text()")[1].extract()
        write(name, date, cota, response.request.url)


# write to csv file
# newline='\n' so it won't jump any lines between the entries
def write(name, date, cota, url):
    with open('test.csv', mode='a', newline='\n') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([name, date, cota, url])


def crawl():
    # create the columns in the csv file
    with open('test.csv', mode='w', newline='\n') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Nome do Fundo', 'Data', 'Cota do dia', 'URL'])
    # get all the members from the script;
    # if it's a class and it's inside __main__ (i.e. it's a spider),
    # then crawl it
    process = CrawlerProcess()
    for name, obj in inspect.getmembers(sys.modules[__name__]):
        if inspect.isclass(obj):
            if obj.__module__ == '__main__':
                process.crawl(obj)
    process.start()


crawl()
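One last note: the FEED_FORMAT and FEED_URI settings used in the first script were later superseded by the FEEDS setting (Scrapy 2.1+). If the feed exporter is ever worth revisiting instead of the manual csv.writer, the modern equivalent of that configuration looks like the sketch below; whether it avoids the Windows blank-line issue depends on the Scrapy version, so this is only the configuration, not a guaranteed fix:

from scrapy.crawler import CrawlerProcess

# Modern equivalent of FEED_FORMAT/FEED_URI, assuming Scrapy >= 2.1
process = CrawlerProcess(settings={
    'FEEDS': {
        'result.csv': {'format': 'csv'},
    },
})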