How to pipeline different scrapy spider results into different tables
I have created web spiders in the same file that scrape data from different websites. When I run each spider on its own, I can successfully pipe the scraped data through my models.py file into a PostgreSQL table. But when I run both spiders at the same time using the API, the table for each spider gets created, yet for some reason no data is written into them. I think it may have to do with how the defined pipeline class is invoked each time a spider runs, but it is hard to tell because the documentation doesn't really show the internals of the implementation. Given the code I have provided, do I have the correct setup to pipe data into two different tables?
pipelines.py (labeled "Models.py" originally, but this is the pipeline module; it imports from models)
from sqlalchemy.orm import sessionmaker
from models import Tickets, Tickets3, db_connect, create_vs_tickets_table, create_tc_tickets_table


class ComparatorPipeline(object):
    """Price comparison pipeline for storing scraped items in the database."""

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates the tickets tables.
        """
        engine = db_connect()
        create_vs_tickets_table(engine)
        create_tc_tickets_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Save tickets in the database.

        This method is called for every item pipeline component.
        """
        if spider.name == "comparator":
            session = self.Session()
            ticket = Tickets(**item)
            try:
                session.add(ticket)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
            return item
        elif spider.name == "comparator3":
            session = self.Session()
            ticket3 = Tickets3(**item)
            try:
                session.add(ticket3)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
            return item
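models.py itself is not shown in the question. For context, here is a minimal sketch of what the imported helpers could look like with declarative SQLAlchemy; the table names and columns below are hypothetical and only illustrate the shape the pipeline code above expects:

# Hypothetical sketch of models.py -- the real file is not shown above.
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

DeclarativeBase = declarative_base()


def db_connect():
    # Connection string is an assumption; use your own PostgreSQL credentials.
    return create_engine("postgresql://user:password@localhost/tickets_db")


def create_vs_tickets_table(engine):
    DeclarativeBase.metadata.create_all(engine, tables=[Tickets.__table__])


def create_tc_tickets_table(engine):
    DeclarativeBase.metadata.create_all(engine, tables=[Tickets3.__table__])


class Tickets(DeclarativeBase):
    """Vividseats tickets table (hypothetical columns)."""
    __tablename__ = "vs_tickets"
    id = Column(Integer, primary_key=True)
    eventName = Column(String)
    eventLocation = Column(String)
    ticketsLink = Column(String)
    eventDate = Column(String)
    eventCity = Column(String)
    eventState = Column(String)
    eventTime = Column(String)
    ticketPrice = Column(String)


class Tickets3(DeclarativeBase):
    """TicketCity tickets table (hypothetical columns)."""
    __tablename__ = "tc_tickets"
    id = Column(Integer, primary_key=True)
    eventName = Column(String)
    eventLocation = Column(String)
    ticketsLink = Column(String)
    eventDate = Column(String)
    eventCity = Column(String)
    eventState = Column(String)
    ticketPrice = Column(String)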
Spider definitions
import scrapy
import re
import json
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import ComparatorItem, ComparatorItem3
from urlparse import urljoin
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging

bandname = raw_input("Enter a bandname \n")
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"


class MySpider(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('tickets')
        price_list = [i.get('p') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price(self, response):
        loader = response.meta['loader']
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id = "".join(json_id_list)
        json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionId=" + json_id
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json, dont_filter=True)

    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName', './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation', './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('ticketsLink', './/*/a[@class = "btn btn-primary"]/@href')
            loader.add_xpath('eventDate', './/*[@class = "productionsDate"]/text()')
            loader.add_xpath('eventCity', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()')
            loader.add_xpath('eventState', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()')
            loader.add_xpath('eventTime', './/*[@class = "productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)


class MySpider3(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator3'
    allowed_domains = ["www.ticketcity.com"]
    start_urls = [tc_url]
    tickets_list_xpath = './/div[@class = "vevent"]'

    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('B')
        price_list = [i.get('P') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price(self, response):
        print "parse price function entered \n"
        loader = response.meta['loader']
        event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
        eventCity = ''.join(event_City)
        loader.add_value('eventCity', eventCity)
        event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
        eventState = ''.join(event_State)
        loader.add_value('eventState', eventState)
        event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
        eventDate = ''.join(event_Date)
        loader.add_value('eventDate', eventDate)
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id = "".join(json_id_list)
        json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json, dont_filter=True)

    def parse(self, response):
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName', './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventLocation', './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketsLink', './/a[@class="divEventDetails url"]/@href')
            #loader.add_xpath('eventDateTime', '//div[@id="divEventDate"]/@title')  # datetime type
            #loader.add_xpath('eventTime', './/*[@class = "productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)


configure_logging()
runner = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    yield runner.crawl(MySpider)
    yield runner.crawl(MySpider3)
    reactor.stop()


crawl()
reactor.run()
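One thing worth noting about the crawl() function above: with @defer.inlineCallbacks, the second yield runner.crawl(MySpider3) only starts once the first crawl's deferred has fired, so the spiders actually run one after the other. To start both truly simultaneously, the documented Scrapy pattern for running multiple spiders in the same process can be used instead. A minimal sketch, assuming a Scrapy version where CrawlerRunner.join() is available:

# Sketch: launch both crawls at once; join() returns a Deferred that
# fires when every scheduled crawl has finished.
configure_logging()
runner = CrawlerRunner()
runner.crawl(MySpider)
runner.crawl(MySpider3)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()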
I don't quite understand your question, but as far as I can tell, you have two tables, two different classes that each represent a database entry (Tickets and Tickets3), and a single pipeline that saves them to the database. And this single pipeline is called with the results from both spiders. Why not use the spider variable in the process_item function to distinguish between the items? Spiders have a name, so you can check which spider sent the item to be processed.
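Your pipeline already branches on spider.name, so the dispatch itself looks right. For reference, here is a compact sketch of the same dispatch written as a name-to-model mapping inside ComparatorPipeline (a minimal sketch, assuming the same Tickets/Tickets3 models; the behavior matches your if/elif version):

    # Inside ComparatorPipeline -- sketch only.
    MODEL_FOR_SPIDER = {
        'comparator': Tickets,
        'comparator3': Tickets3,
    }

    def process_item(self, item, spider):
        model = self.MODEL_FOR_SPIDER.get(spider.name)
        if model is None:
            return item  # item from an unrelated spider; pass it through
        session = self.Session()
        try:
            session.add(model(**item))
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
        return item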
Add custom_settings to your spiders:
class MySpider(CrawlSpider):
    custom_settings = {
        'ITEM_PIPELINES': {
            'pipelines.ComparatorPipeline': 400
        }
    }
    handle_httpstatus_list = [416]
And disable the item pipelines list in your settings:
# ITEM_PIPELINES = {
# }
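Note that custom_settings is only honored by Scrapy 1.0 and later, and each spider that should use the pipeline needs its own custom_settings entry. The dotted path must also match your project layout; given the imports above, it is likely 'concert_comparator.pipelines.ComparatorPipeline' rather than the shortened 'pipelines.ComparatorPipeline' shown here.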