无法产生由项目管道进行的并行请求
Can't yield parallel requests conducted by items pipeline
在我的 scrapy 代码中,我试图从列出所有议会成员 (MP) 的议会网站生成以下数据。打开每个 MP 的 links,我正在发出并行请求以获取我要计算的数字。我在这里没有使用 metas,因为我的代码不仅发出连续的请求,而且在请求 MP 的单个页面后对数字发出并行请求。因此我认为物品容器更适合我的目的。
这是我要抓取的数字
- 每位国会议员签名的议案数量
- 每位议员签名的问题提案数量
- 每位国会议员在议会中发言的次数
为了计算并得出每位议员签名的法案数量,我正在尝试编写一个关于议员的 3 层爬虫:
- 从列出所有议员的 link 开始
- 从(1)访问每个MP的单独页面,其中显示上面定义的三个信息
- 3a) 请求包含提案的页面并通过 len 函数计算提案的数量
3b) 请求带有问题建议的页面并通过 len 函数计算它们的数量
3c) 请求带有演讲的页面并通过 len 函数计算演讲的数量
我想要的是:把 3a,3b,3c 的查询结果与议员的名字和党派一起输出
问题:我上面的代码除了为每个请求生成空字典外什么也没有生成
注意:因为我的解析函数不是按 parse => parse2 => parse3 的顺序依次工作,而是在 parse2 之后有 3 个并行的解析函数,所以我没能使用 meta,因为我并不是在第三层解析函数中产出所有的值。因此我选择了使用 item 管道,但它显然不起作用。
主要代码:
'''
from scrapy import Spider
from scrapy.http import Request
from ..items import MeclisItem
import logging
class MvSpider(Spider):
    """Spider from the question: crawls the MP list and tries to count activity.

    This is the non-working version the question is about. No item object is
    ever passed from one callback to the next: every callback builds its own
    fresh ``MeclisItem``. The only item that is yielded is the empty one
    created in ``mv_analysis``, which is why the output consists of empty
    dicts.
    """
    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']
    def parse(self, response):
        """Read name/party for each listed MP and request the MP's own page."""
        # One item is created before the loop and overwritten on every
        # iteration; it is neither yielded nor attached to the Request below,
        # so 'name' and 'party' never reach mv_analysis.
        items = MeclisItem()
        mv_list = mv_list = response.xpath("//ul[@class='list-group list-group-flush']") #taking all MPs listed
        for mv in mv_list:
            items['name'] = mv.xpath("./li/div/div/a/text()").get() # MP's name taken
            # .get() returns None when nothing matches, in which case .strip()
            # raises AttributeError.
            items['party'] = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get().strip() #MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            yield Request(full_link, callback = self.mv_analysis)
        pass
    def mv_analysis(self, response):
        """Find the three activity links on an MP page and request each of them."""
        # A brand-new, empty item: unrelated to the one filled in parse().
        items = MeclisItem()
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        # The three requests are independent of each other and carry no item.
        yield Request(billprop_link, callback = self.bill_prop_counter) #number of bill proposals to be requested
        yield Request(questionprop_link, callback = self.quest_prop_counter) #number of question proposals to be requested
        yield Request(speech_link, callback = self.speech_counter) #number of speeches to be requested
        # No field was set on this item, so an empty item is emitted here.
        yield items
    # COUNTING FUNCTIONS
    def bill_prop_counter(self,response):
        """Count the matching table rows on the bill-proposal page."""
        # The count is stored on a new local item that is never yielded or
        # returned, so it is discarded when the method ends.
        items = MeclisItem()
        billproposals = response.xpath("//tr[@valign='TOP']")
        items['bill_prop_count'] = len(billproposals)
        pass
    def quest_prop_counter(self, response):
        """Count the matching table rows on the question-proposal page."""
        # Same problem as bill_prop_counter: the local item is discarded.
        items = MeclisItem()
        questionproposals = response.xpath("//tr[@valign='TOP']")
        items['res_prop_count'] = len(questionproposals)
        pass
    def speech_counter(self, response):
        """Count the matching table rows on the speeches page."""
        # Same problem as bill_prop_counter: the local item is discarded.
        items = MeclisItem()
        speeches = response.xpath("//tr[@valign='TOP']")
        items['speech_count'] = len(speeches)
        pass
'''
items.py代码:
import scrapy
class MeclisItem(scrapy.Item):
    """Record holding what the spider collects for one member of parliament."""

    # Who the MP is (read from the MP list page).
    name = scrapy.Field()
    party = scrapy.Field()

    # Row counts taken from the MP's three activity pages.
    bill_prop_count = scrapy.Field()  # bill proposals
    res_prop_count = scrapy.Field()  # question proposals
    speech_count = scrapy.Field()  # speeches
scrapy 显示的内容:
我查了很多 Stack Overflow 上的相关问题,还是没找到出路。提前致谢。
ps: 单独花了十分钟给上面的代码上色,也没能成功:(
Note: Because my parse functions don't work like parse => parse2 => parse3, but rather I have 3 parallel parse functions after parse2, I failed to use the meta because I'm not yielding all the values at parse three.
你可以这样做:
编辑:
import scrapy
from scrapy import Spider
from scrapy.http import Request
# from ..items import MeclisItem
import logging
class MeclisItem(scrapy.Item):
    """Record holding what the spider collects for one member of parliament."""
    # Who the MP is (filled in MvSpider.parse).
    name = scrapy.Field()
    party = scrapy.Field()
    # Row counts, each filled by the corresponding *_counter callback.
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()
class MvSpider(Spider):
    """Collect per-MP activity counts from the parliament site (tbmm.gov.tr).

    For every MP on the list page one ``MeclisItem`` is created and handed
    from callback to callback through ``cb_kwargs``. The three count pages
    are visited one after another (bills -> questions -> speeches) and the
    item is yielded once, after the last stage, so it is complete when it
    reaches the pipelines.

    A stage whose link cannot be found on the MP's page is skipped and its
    count is stored as ``None``; the item is still emitted.
    """

    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    # Captions of the links on an MP's page that lead to the count pages.
    _BILL_LINK_TEXT = 'İmzası Bulunan Kanun Teklifleri'
    _QUESTION_LINK_TEXT = 'Sahibi Olduğu Yazılı Soru Önergeleri'
    _SPEECH_LINK_TEXT = 'Genel Kurul Konuşmaları'
    # Rows counted on each of the three count pages.
    _ROW_XPATH = "//tr[@valign='TOP']"

    def parse(self, response):
        """Create one item per listed MP and request that MP's own page."""
        # NOTE(review): .get() returns only the first match inside each <ul>,
        # so a <ul> holding several MPs contributes just one -- confirm that
        # the page really has one such <ul> per MP.
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']")
        for mv in mv_list:
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            if not partial_link:
                # urljoin() with no href would resolve to the list page itself.
                self.logger.warning("MP entry without a link on %s", response.url)
                continue
            # A fresh item per MP: sharing one instance would let later
            # iterations overwrite data still travelling with earlier requests.
            item = MeclisItem()
            item['name'] = mv.xpath("./li/div/div/a/text()").get()
            # default='' keeps a missing party node from raising on .strip()
            # and aborting the remaining MPs.
            item['party'] = mv.xpath(
                "./li/div/div[@class='col-md-4 text-right']/text()"
            ).get(default='').strip()
            yield Request(response.urljoin(partial_link),
                          callback=self.mv_analysis,
                          cb_kwargs={'item': item})

    def mv_analysis(self, response, item):
        """Locate the three count-page links and start the request chain."""
        yield self._request_bills(
            item,
            self._link_by_caption(response, self._BILL_LINK_TEXT),
            self._link_by_caption(response, self._QUESTION_LINK_TEXT),
            self._link_by_caption(response, self._SPEECH_LINK_TEXT),
        )

    # COUNTING FUNCTIONS
    def bill_prop_counter(self, response, item, questionprop_link, speech_link):
        """Store the number of bill proposals, then move on to the questions."""
        item['bill_prop_count'] = self._count_rows(response)
        yield self._request_questions(item, questionprop_link, speech_link)

    def quest_prop_counter(self, response, item, speech_link):
        """Store the number of question proposals, then move on to the speeches."""
        item['res_prop_count'] = self._count_rows(response)
        yield self._request_speeches(item, speech_link)

    def speech_counter(self, response, item):
        """Store the number of speeches and emit the completed item."""
        item['speech_count'] = self._count_rows(response)
        yield item

    # HELPERS
    def _link_by_caption(self, response, caption):
        """Return the absolute URL of the link containing *caption*, or None."""
        href = response.xpath(
            ".//a[contains(text(),'{}')]/@href".format(caption)
        ).get()
        return response.urljoin(href) if href else None

    def _count_rows(self, response):
        """Return how many result rows the count page contains."""
        return len(response.xpath(self._ROW_XPATH))

    def _request_bills(self, item, billprop_link, questionprop_link, speech_link):
        """Return the bill-page Request, or skip the stage if there is no link."""
        if billprop_link is None:
            item['bill_prop_count'] = None
            return self._request_questions(item, questionprop_link, speech_link)
        return Request(billprop_link,
                       callback=self.bill_prop_counter,
                       cb_kwargs={'item': item,
                                  'questionprop_link': questionprop_link,
                                  'speech_link': speech_link})

    def _request_questions(self, item, questionprop_link, speech_link):
        """Return the question-page Request, or skip the stage if there is no link."""
        if questionprop_link is None:
            item['res_prop_count'] = None
            return self._request_speeches(item, speech_link)
        return Request(questionprop_link,
                       callback=self.quest_prop_counter,
                       cb_kwargs={'item': item, 'speech_link': speech_link})

    def _request_speeches(self, item, speech_link):
        """Return the speech-page Request, or the finished item if there is no link."""
        if speech_link is None:
            item['speech_count'] = None
            return item
        return Request(speech_link,
                       callback=self.speech_counter,
                       cb_kwargs={'item': item})
在我的 scrapy 代码中,我试图从列出所有议会成员 (MP) 的议会网站生成以下数据。打开每个 MP 的 links,我正在发出并行请求以获取我要计算的数字。我在这里没有使用 metas,因为我的代码不仅发出连续的请求,而且在请求 MP 的单个页面后对数字发出并行请求。因此我认为物品容器更适合我的目的。
这是我要抓取的数字
- 每位国会议员签名的议案数量
- 每位议员签名的问题提案数量
- 每位国会议员在议会中发言的次数
为了计算并得出每位议员签名的法案数量,我正在尝试编写一个关于议员的 3 层爬虫:
- 从列出所有议员的 link 开始
- 从(1)访问每个MP的单独页面,其中显示上面定义的三个信息
- 3a) 请求包含提案的页面并通过 len 函数计算提案的数量 3b) 请求带有问题建议的页面并通过 len 函数计算它们的数量 3c) 请求带有演讲的页面并通过 len 函数计算演讲的数量
我想要的是:把 3a,3b,3c 的查询结果与议员的名字和党派一起输出
问题:我上面的代码除了为每个请求生成空字典外什么也没有生成
注意:因为我的解析函数不是按 parse => parse2 => parse3 的顺序依次工作,而是在 parse2 之后有 3 个并行的解析函数,所以我没能使用 meta,因为我并不是在第三层解析函数中产出所有的值。因此我选择了使用 item 管道,但它显然不起作用。
主要代码:
'''
from scrapy import Spider
from scrapy.http import Request
from ..items import MeclisItem
import logging
class MvSpider(Spider):
    """Spider from the question: crawls the MP list and tries to count activity.

    This is the non-working version the question is about. No item object is
    ever passed from one callback to the next: every callback builds its own
    fresh ``MeclisItem``. The only item that is yielded is the empty one
    created in ``mv_analysis``, which is why the output consists of empty
    dicts.
    """
    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']
    def parse(self, response):
        """Read name/party for each listed MP and request the MP's own page."""
        # One item is created before the loop and overwritten on every
        # iteration; it is neither yielded nor attached to the Request below,
        # so 'name' and 'party' never reach mv_analysis.
        items = MeclisItem()
        mv_list = mv_list = response.xpath("//ul[@class='list-group list-group-flush']") #taking all MPs listed
        for mv in mv_list:
            items['name'] = mv.xpath("./li/div/div/a/text()").get() # MP's name taken
            # .get() returns None when nothing matches, in which case .strip()
            # raises AttributeError.
            items['party'] = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get().strip() #MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            yield Request(full_link, callback = self.mv_analysis)
        pass
    def mv_analysis(self, response):
        """Find the three activity links on an MP page and request each of them."""
        # A brand-new, empty item: unrelated to the one filled in parse().
        items = MeclisItem()
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        # The three requests are independent of each other and carry no item.
        yield Request(billprop_link, callback = self.bill_prop_counter) #number of bill proposals to be requested
        yield Request(questionprop_link, callback = self.quest_prop_counter) #number of question proposals to be requested
        yield Request(speech_link, callback = self.speech_counter) #number of speeches to be requested
        # No field was set on this item, so an empty item is emitted here.
        yield items
    # COUNTING FUNCTIONS
    def bill_prop_counter(self,response):
        """Count the matching table rows on the bill-proposal page."""
        # The count is stored on a new local item that is never yielded or
        # returned, so it is discarded when the method ends.
        items = MeclisItem()
        billproposals = response.xpath("//tr[@valign='TOP']")
        items['bill_prop_count'] = len(billproposals)
        pass
    def quest_prop_counter(self, response):
        """Count the matching table rows on the question-proposal page."""
        # Same problem as bill_prop_counter: the local item is discarded.
        items = MeclisItem()
        questionproposals = response.xpath("//tr[@valign='TOP']")
        items['res_prop_count'] = len(questionproposals)
        pass
    def speech_counter(self, response):
        """Count the matching table rows on the speeches page."""
        # Same problem as bill_prop_counter: the local item is discarded.
        items = MeclisItem()
        speeches = response.xpath("//tr[@valign='TOP']")
        items['speech_count'] = len(speeches)
        pass
'''
items.py代码:
import scrapy
class MeclisItem(scrapy.Item):
    """Record holding what the spider collects for one member of parliament."""

    # Who the MP is (read from the MP list page).
    name = scrapy.Field()
    party = scrapy.Field()

    # Row counts taken from the MP's three activity pages.
    bill_prop_count = scrapy.Field()  # bill proposals
    res_prop_count = scrapy.Field()  # question proposals
    speech_count = scrapy.Field()  # speeches
scrapy 显示的内容:
我查了很多 Stack Overflow 上的相关问题,还是没找到出路。提前致谢。
ps: 单独花了十分钟给上面的代码上色,也没能成功:(
Note: Because my parse functions don't work like parse => parse2 => parse3, but rather I have 3 parallel parse functions after parse2, I failed to use the meta because I'm not yielding all the values at parse three.
你可以这样做:
编辑:
import scrapy
from scrapy import Spider
from scrapy.http import Request
# from ..items import MeclisItem
import logging
class MeclisItem(scrapy.Item):
    """Record holding what the spider collects for one member of parliament."""
    # Who the MP is (filled in MvSpider.parse).
    name = scrapy.Field()
    party = scrapy.Field()
    # Row counts, each filled by the corresponding *_counter callback.
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()
class MvSpider(Spider):
    """Collect per-MP activity counts from the parliament site (tbmm.gov.tr).

    For every MP on the list page one ``MeclisItem`` is created and handed
    from callback to callback through ``cb_kwargs``. The three count pages
    are visited one after another (bills -> questions -> speeches) and the
    item is yielded once, after the last stage, so it is complete when it
    reaches the pipelines.

    A stage whose link cannot be found on the MP's page is skipped and its
    count is stored as ``None``; the item is still emitted.
    """

    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    # Captions of the links on an MP's page that lead to the count pages.
    _BILL_LINK_TEXT = 'İmzası Bulunan Kanun Teklifleri'
    _QUESTION_LINK_TEXT = 'Sahibi Olduğu Yazılı Soru Önergeleri'
    _SPEECH_LINK_TEXT = 'Genel Kurul Konuşmaları'
    # Rows counted on each of the three count pages.
    _ROW_XPATH = "//tr[@valign='TOP']"

    def parse(self, response):
        """Create one item per listed MP and request that MP's own page."""
        # NOTE(review): .get() returns only the first match inside each <ul>,
        # so a <ul> holding several MPs contributes just one -- confirm that
        # the page really has one such <ul> per MP.
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']")
        for mv in mv_list:
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            if not partial_link:
                # urljoin() with no href would resolve to the list page itself.
                self.logger.warning("MP entry without a link on %s", response.url)
                continue
            # A fresh item per MP: sharing one instance would let later
            # iterations overwrite data still travelling with earlier requests.
            item = MeclisItem()
            item['name'] = mv.xpath("./li/div/div/a/text()").get()
            # default='' keeps a missing party node from raising on .strip()
            # and aborting the remaining MPs.
            item['party'] = mv.xpath(
                "./li/div/div[@class='col-md-4 text-right']/text()"
            ).get(default='').strip()
            yield Request(response.urljoin(partial_link),
                          callback=self.mv_analysis,
                          cb_kwargs={'item': item})

    def mv_analysis(self, response, item):
        """Locate the three count-page links and start the request chain."""
        yield self._request_bills(
            item,
            self._link_by_caption(response, self._BILL_LINK_TEXT),
            self._link_by_caption(response, self._QUESTION_LINK_TEXT),
            self._link_by_caption(response, self._SPEECH_LINK_TEXT),
        )

    # COUNTING FUNCTIONS
    def bill_prop_counter(self, response, item, questionprop_link, speech_link):
        """Store the number of bill proposals, then move on to the questions."""
        item['bill_prop_count'] = self._count_rows(response)
        yield self._request_questions(item, questionprop_link, speech_link)

    def quest_prop_counter(self, response, item, speech_link):
        """Store the number of question proposals, then move on to the speeches."""
        item['res_prop_count'] = self._count_rows(response)
        yield self._request_speeches(item, speech_link)

    def speech_counter(self, response, item):
        """Store the number of speeches and emit the completed item."""
        item['speech_count'] = self._count_rows(response)
        yield item

    # HELPERS
    def _link_by_caption(self, response, caption):
        """Return the absolute URL of the link containing *caption*, or None."""
        href = response.xpath(
            ".//a[contains(text(),'{}')]/@href".format(caption)
        ).get()
        return response.urljoin(href) if href else None

    def _count_rows(self, response):
        """Return how many result rows the count page contains."""
        return len(response.xpath(self._ROW_XPATH))

    def _request_bills(self, item, billprop_link, questionprop_link, speech_link):
        """Return the bill-page Request, or skip the stage if there is no link."""
        if billprop_link is None:
            item['bill_prop_count'] = None
            return self._request_questions(item, questionprop_link, speech_link)
        return Request(billprop_link,
                       callback=self.bill_prop_counter,
                       cb_kwargs={'item': item,
                                  'questionprop_link': questionprop_link,
                                  'speech_link': speech_link})

    def _request_questions(self, item, questionprop_link, speech_link):
        """Return the question-page Request, or skip the stage if there is no link."""
        if questionprop_link is None:
            item['res_prop_count'] = None
            return self._request_speeches(item, speech_link)
        return Request(questionprop_link,
                       callback=self.quest_prop_counter,
                       cb_kwargs={'item': item, 'speech_link': speech_link})

    def _request_speeches(self, item, speech_link):
        """Return the speech-page Request, or the finished item if there is no link."""
        if speech_link is None:
            item['speech_count'] = None
            return item
        return Request(speech_link,
                       callback=self.speech_counter,
                       cb_kwargs={'item': item})