Scrapy `ReactorNotRestartable`: one class to run two (or more) spiders
I'm using Scrapy with a two-stage crawl to aggregate daily data. The first stage generates a list of URLs from an index page, and the second stage writes the HTML for each URL in that list to a Kafka topic.
Although the two components of the crawl are related, I'd like them to be independent: the url_generator will run as a scheduled task once a day, and the page_requester will run continually, processing URLs as they become available. To be "polite", I will adjust the DOWNLOAD_DELAY so that the crawler finishes within the 24-hour window while putting minimal load on the site.
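For a rough sense of what that tuning looks like, here is a hypothetical settings.py snippet; the URL count and delay are made-up illustration values, not numbers from the actual project:

# Hypothetical: with roughly 20,000 URLs per day,
# 86400 s / 20000 ≈ 4.3 s between requests spreads the crawl over 24 hours.
DOWNLOAD_DELAY = 4.3
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # keep requests to the site strictly sequential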
I created a CrawlerRunner class with one function to generate the URLs and another to retrieve the HTML:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy_somesite.spiders.create_urls_spider import CreateSomeSiteUrlList
from scrapy_somesite.spiders.crawl_urls_spider import SomeSiteRetrievePages
from scrapy.utils.project import get_project_settings
import os
import sys

class CrawlerRunner:

    def __init__(self):
        sys.path.append(os.path.join(os.path.curdir, "crawl/somesite"))
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'scrapy_somesite.settings'
        self.settings = get_project_settings()
        log.start()

    def create_urls(self):
        spider = CreateSomeSiteUrlList()
        crawler_create_urls = Crawler(self.settings)
        crawler_create_urls.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler_create_urls.configure()
        crawler_create_urls.crawl(spider)
        crawler_create_urls.start()
        reactor.run()

    def crawl_urls(self):
        spider = SomeSiteRetrievePages()
        crawler_crawl_urls = Crawler(self.settings)
        crawler_crawl_urls.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler_crawl_urls.configure()
        crawler_crawl_urls.crawl(spider)
        crawler_crawl_urls.start()
        reactor.run()
When I instantiate the class I can successfully execute either function on its own, but unfortunately I cannot execute them together:
from crawl.somesite import crawler_runner
cr = crawler_runner.CrawlerRunner()
cr.create_urls()
cr.crawl_urls()
The second function call generates twisted.internet.error.ReactorNotRestartable when it tries to execute reactor.run() inside the crawl_urls function.
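The error comes from Twisted itself: a reactor that has been run and then stopped cannot be started again. A minimal illustration outside Scrapy (just a sketch of the Twisted behaviour, not part of the project code):

from twisted.internet import reactor

reactor.callLater(0, reactor.stop)  # schedule an immediate stop
reactor.run()   # first run: starts the event loop, then stops cleanly
reactor.run()   # second run: raises twisted.internet.error.ReactorNotRestartable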
I'm wondering whether there is a simple fix for this code (e.g. some way of running two separate Twisted reactors), or whether there is a better way to structure this project.
It is possible to run multiple spiders within one reactor by keeping the reactor open until all of the spiders have stopped running. This is achieved by keeping a list of the running spiders and not executing reactor.stop() until that list is empty:
import sys
import os
from scrapy.utils.project import get_project_settings
from scrapy_somesite.spiders.create_urls_spider import Spider1
from scrapy_somesite.spiders.crawl_urls_spider import Spider2
from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import Crawler

class CrawlRunner:

    def __init__(self):
        self.running_crawlers = []

    def spider_closing(self, spider):
        # Remove the closed spider from the list; stop the reactor
        # only once no spiders are left running.
        log.msg("Spider closed: %s" % spider, level=log.INFO)
        self.running_crawlers.remove(spider)
        if not self.running_crawlers:
            reactor.stop()

    def run(self):
        sys.path.append(os.path.join(os.path.curdir, "crawl/somesite"))
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'scrapy_somesite.settings'
        settings = get_project_settings()
        log.start(loglevel=log.DEBUG)

        to_crawl = [Spider1, Spider2]
        for spider in to_crawl:
            crawler = Crawler(settings)
            crawler_obj = spider()
            self.running_crawlers.append(crawler_obj)
            # connect spider_closing (not reactor.stop) to the spider_closed signal
            crawler.signals.connect(self.spider_closing, signal=signals.spider_closed)
            crawler.configure()
            crawler.crawl(crawler_obj)
            crawler.start()

        # a single reactor.run() serves both crawlers
        reactor.run()
To execute the class:
from crawl.somesite.crawl import CrawlRunner
cr = CrawlRunner()
cr.run()
This solution is based on a blog post by Kiran Koduru.
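As an aside, newer Scrapy releases (roughly 1.0 and later) ship scrapy.crawler.CrawlerProcess, which manages the reactor itself and accepts multiple spiders. A minimal sketch, assuming the same two spider classes as above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_somesite.spiders.create_urls_spider import Spider1
from scrapy_somesite.spiders.crawl_urls_spider import Spider2

process = CrawlerProcess(get_project_settings())
process.crawl(Spider1)  # schedule both spiders on the same internal reactor
process.crawl(Spider2)
process.start()         # blocks until both spiders have finished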