运行 每个进程有多个蜘蛛 - 没有保存输出数据
Running multiple spiders per one process - no output data saved
请帮忙!
尝试使用 Scrapy 创建货币解析器。
创建了两个蜘蛛,如果 运行 它们分开,它们可以正常工作。
尝试每个进程 运行 两个蜘蛛 - 在输出中不提供任何数据。
无论是尝试将其保存到 txt、json 还是数据库。
使用 MySQL 数据库。
但是 运行每个进程设置一个蜘蛛 - 成功保存数据。
将数据保存到数据库或其他任何东西的唯一方法是 运行 scrapy crawl Liga && scrapy crawl IFinance
.
主要蜘蛛代码:
from scrapy.selector import Selector
from scrapy.crawler import CrawlerProcess
from ..items import CurparserItem
class LigaSpider(Spider):
name = "Liga"
allowed_domains = ["finance.liga.net"]
start_urls = [
"https://finance.liga.net/currency/nbu",
]
def parse(self, response):
cur_list = Selector(response).xpath('//table[contains(@class, "default-table-finance course-two-col")]/tbody/tr')
for currency in cur_list:
item = CurparserItem()
cur_code_raw = currency.xpath('./td[1]/a/text()').extract_first()
item['cur_code'] = " ".join(cur_code_raw.split())
item['cur_name'] = currency.xpath('./td[2]/text()').extract_first()
multiple_value = currency.xpath('./td[4]/div/text()').extract_first()
hrn_points = currency.xpath('./td[3]/text()').extract_first()
item['cur_value'] = round(float(multiple_value)/int(hrn_points), 3)
item['cur_behavior'] = currency.xpath('./td[4]/span/text()').extract_first()
yield item
class IFinance(Spider):
name = "IFinance"
allowed_domains = ["finance.i.ua"]
start_urls = [
"https://finance.i.ua/nbu/",
]
def parse(self, response):
cur_list = Selector(response).xpath('//table[contains(@class, "table-data")]/tbody/tr')
for currency in cur_list:
item = CurparserItem()
item['cur_code'] = currency.xpath('./th/text()').extract_first()
item['cur_name'] = currency.xpath('./td[2]/text()').extract_first()
raw_behav_sign = currency.xpath('./td[3]/span/@class').extract_first()
behav_sign = ""
if raw_behav_sign == "value -increase":
behav_sign = "+ "
elif raw_behav_sign == "value -decrease":
behav_sign = "- "
else:
behav_sign = ""
item['cur_value'] = currency.xpath('./td[3]/span/span[1]/text()').extract_first()
raw_behav_value = currency.xpath('./td[3]/span/span[2]/text()').extract_first()
item['cur_behavior'] = behav_sign + raw_behav_value
yield item
process = CrawlerProcess()
process.crawl(LigaSpider)
process.crawl(IFinance)
process.start()
管道文件:
import mysql.connector
class CurparserPipeline(object):
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.spider.name)
def __init__(self, spider_name):
self.create_connection()
self.create_table(spider_name)
def create_connection(self):
self.conn = mysql.connector.connect(
host='localhost',
user='root',
passwd='3480stfgDev',
database='currencydb'
)
self.curr = self.conn.cursor()
def create_table(self, spider_name):
self.curr.execute("""DROP TABLE IF EXISTS %s""" % spider_name)
self.curr.execute("""create table %s(
cur_code text,
cur_name text,
cur_value float,
cur_behavior text
)""" % spider_name)
def process_item(self, item, spider):
self.store_db(item, spider)
return item
def store_db(self, item, spider):
spider_name = spider.name
self.curr.execute("""insert into """ + spider_name + """ values (%s,%s,%s,%s)""", (
item['cur_code'],
item['cur_name'],
item['cur_value'],
item['cur_behavior']
))
self.conn.commit()```
当您使用 CrawlerProcess
时,不使用项目设置(即来自项目文件夹中的 settings.py
文件)。
有关如何传递项目设置的信息,请参阅https://docs.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
请帮忙! 尝试使用 Scrapy 创建货币解析器。 创建了两个蜘蛛,如果 运行 它们分开,它们可以正常工作。 尝试每个进程 运行 两个蜘蛛 - 在输出中不提供任何数据。 无论是尝试将其保存到 txt、json 还是数据库。
使用 MySQL 数据库。
但是 运行每个进程设置一个蜘蛛 - 成功保存数据。
将数据保存到数据库或其他任何东西的唯一方法是 运行 scrapy crawl Liga && scrapy crawl IFinance
.
主要蜘蛛代码:
from scrapy.selector import Selector
from scrapy.crawler import CrawlerProcess
from ..items import CurparserItem
class LigaSpider(Spider):
name = "Liga"
allowed_domains = ["finance.liga.net"]
start_urls = [
"https://finance.liga.net/currency/nbu",
]
def parse(self, response):
cur_list = Selector(response).xpath('//table[contains(@class, "default-table-finance course-two-col")]/tbody/tr')
for currency in cur_list:
item = CurparserItem()
cur_code_raw = currency.xpath('./td[1]/a/text()').extract_first()
item['cur_code'] = " ".join(cur_code_raw.split())
item['cur_name'] = currency.xpath('./td[2]/text()').extract_first()
multiple_value = currency.xpath('./td[4]/div/text()').extract_first()
hrn_points = currency.xpath('./td[3]/text()').extract_first()
item['cur_value'] = round(float(multiple_value)/int(hrn_points), 3)
item['cur_behavior'] = currency.xpath('./td[4]/span/text()').extract_first()
yield item
class IFinance(Spider):
name = "IFinance"
allowed_domains = ["finance.i.ua"]
start_urls = [
"https://finance.i.ua/nbu/",
]
def parse(self, response):
cur_list = Selector(response).xpath('//table[contains(@class, "table-data")]/tbody/tr')
for currency in cur_list:
item = CurparserItem()
item['cur_code'] = currency.xpath('./th/text()').extract_first()
item['cur_name'] = currency.xpath('./td[2]/text()').extract_first()
raw_behav_sign = currency.xpath('./td[3]/span/@class').extract_first()
behav_sign = ""
if raw_behav_sign == "value -increase":
behav_sign = "+ "
elif raw_behav_sign == "value -decrease":
behav_sign = "- "
else:
behav_sign = ""
item['cur_value'] = currency.xpath('./td[3]/span/span[1]/text()').extract_first()
raw_behav_value = currency.xpath('./td[3]/span/span[2]/text()').extract_first()
item['cur_behavior'] = behav_sign + raw_behav_value
yield item
process = CrawlerProcess()
process.crawl(LigaSpider)
process.crawl(IFinance)
process.start()
管道文件:
import mysql.connector
class CurparserPipeline(object):
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.spider.name)
def __init__(self, spider_name):
self.create_connection()
self.create_table(spider_name)
def create_connection(self):
self.conn = mysql.connector.connect(
host='localhost',
user='root',
passwd='3480stfgDev',
database='currencydb'
)
self.curr = self.conn.cursor()
def create_table(self, spider_name):
self.curr.execute("""DROP TABLE IF EXISTS %s""" % spider_name)
self.curr.execute("""create table %s(
cur_code text,
cur_name text,
cur_value float,
cur_behavior text
)""" % spider_name)
def process_item(self, item, spider):
self.store_db(item, spider)
return item
def store_db(self, item, spider):
spider_name = spider.name
self.curr.execute("""insert into """ + spider_name + """ values (%s,%s,%s,%s)""", (
item['cur_code'],
item['cur_name'],
item['cur_value'],
item['cur_behavior']
))
self.conn.commit()```
当您使用 CrawlerProcess
时,不使用项目设置(即来自项目文件夹中的 settings.py
文件)。
有关如何传递项目设置的信息,请参阅https://docs.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())