如何组合 Scrapy 输出
How to combine Scrapy output
我是 Scrapy 的新手,这是我迄今为止最复杂的蜘蛛。
import scrapy
from scrapy.selector import HtmlXPathSelector
class CocabotSpider(scrapy.Spider):
    """Crawl the tallahasseearts.org event listing.

    Concert pages and venue pages are handled by two independent callbacks,
    so every concert and every venue website is emitted as a separate item.
    """

    name = 'cocabot'
    start_urls = ['https://www.tallahasseearts.org/event/?keyword&start_date&end_date&date_format=m-d-Y&term=400&event_location&save_lst_list&view']
    custom_settings = {
        'FEED_URI': 'output/cocaoutput.json'
    }

    def parse(self, response):
        """Follow concert, venue and pagination links found on a listing page."""
        # follow links to concert pages
        for href in response.css("div.search-img a::attr(href)"):
            yield response.follow(href, self.parse_concert)
        # follow links to venue pages
        for href in response.css("span.venue-event a::attr(href)"):
            yield response.follow(href, self.parse_venue)
        # follow links to pagination pages
        for href in response.css("li a.next.page-numbers::attr(href)"):
            yield response.follow(href, self.parse)

    def parse_concert(self, response):
        """Yield one dict describing the concert shown on ``response``."""

        def extract_with_css(query):
            # First match of the CSS query, or None when nothing matches.
            return response.css(query).extract_first()

        yield {
            'headliner': extract_with_css("h1.p-ttl::text"),
            'venue': extract_with_css("div.locatn div.a-block-ct div b::text"),
            'venue_address': extract_with_css("div.locatn div.a-block-ct div p::text"),
            'venue_coca_url': extract_with_css("span.venue-event a::attr(href)"),
            # response.xpath() replaces the long-deprecated
            # HtmlXPathSelector(response).select() API.
            'event_url': response.xpath(
                "//div[@class='a-block-ct']/p/a[contains(text(), 'Official Website')]/@href"
            ).extract_first(),
            'event_coca_url': response.request.url,
            'date_time': extract_with_css("ul.ind-time li::text"),
            'price': extract_with_css("div.a-block-ct div.apl-internal-content p::text"),
        }

    def parse_venue(self, response):
        """Yield a dict holding only the venue's external website link."""
        yield {
            'venue_website': response.xpath(
                "//div[@class='art-social-item']/a[contains(text(), 'Website')]/@href"
            ).extract_first(),
        }
这个爬虫能获取我想要的所有数据,但问题是 venue_website 数据被放在了各自独立的字典中。示例:
{"date_time": "Jun 18, 2018 at 08:00 am - 05:00 pm (Mon)", "event_url": "http://www.music.fsu.edu/Quicklinks/Summer-Music-Camps/EXPLORE-OUR-14-CAMPS/Jazz-Ensemble-Camp-for-Middle-School", "venue_coca_url": null, "venue_address": "122 N. Copeland St., Tallahassee, FL 32304", "price": "Registration for camp is now open. You can register online or by mailing in a registration form. Day Camper Price: 1.00 (Includes tuition only. No housing or meals.) Night Camper Price: 1.00 \u2013 (Includes tuition and housing with three meals per day). A 0.00 non-refundable deposit is due at registration. Balance of camp fees are due by June 4.", "venue": "FSU College of Music", "headliner": "Jazz Ensemble Camp for Middle School", "event_coca_url": "https://www.tallahasseearts.org/event/jazz-ensemble-camp-for-middle-school-3/"},
{"venue_website": "http://www.makinglightproductions.org/"},
{"venue_website": "http://www.mfbooks.us/"},
{"venue_website": null},
我该如何把 venue_website 数据合并到 parse_concert 生成的主字典中?我尝试过在 parse_concert 函数中使用 follow 语句,并让 parse_venue 用 return 而不是 yield 返回数据,但始终没能把它们组合到一起。
在scrapy中生成需要多个页面的项目有两种方式:
- 请求链(Request chaining)。
由于您需要多个请求来生成一个项目,因此您需要将它们链接起来以便按顺序运行并随身携带您的数据:
def parse_concert(self, response):
    """Build the concert item and pass it along with the venue request."""
    concert = {'name': 'red hot chilly hotdogs'}
    venue_url = 'http://someplace.com'
    # The callback must be named explicitly: without it the response is
    # routed to the spider's default parse() and parse_venue() never runs.
    yield Request(venue_url, meta={'item': concert}, callback=self.parse_venue)


def parse_venue(self, response):
    """Complete the item carried in the request meta and emit it."""
    item = response.meta['item']
    item['venue_name'] = 'someplace'
    yield item
    # {'name': 'red hot chilly hotdogs', 'venue_name': 'someplace'}
- 后处理合并(Post-processing merge)。
另一种解决方案是异步生成您的两种类型的项目,然后通过共享 ID 将它们合并:
def parse_concert(self, response):
    """Emit the concert item, then schedule the venue page separately."""
    concert = {'name': 'red hot chilly hotdogs', 'id': 1}
    yield concert
    venue_url = 'http://someplace.com'
    # Name the callback explicitly; otherwise the venue response would be
    # handled by the spider's default parse() instead of parse_venue().
    yield Request(venue_url, callback=self.parse_venue)


def parse_venue(self, response):
    """Emit the venue item; it shares 'id' with its concert for later merging."""
    item = {'venue_name': 'someplace', 'id': 1}
    yield item
然后用一个单独的脚本把它们合并:
import json


def merge_by_id(items):
    """Merge dicts that share the same 'id' into one dict per id.

    Args:
        items: iterable of dicts, each containing an 'id' key.

    Returns:
        A list with one merged dict per distinct id, in order of first
        appearance. Later items overwrite keys of earlier ones.

    Raises:
        KeyError: if an item has no 'id' key.
    """
    combined = {}
    for item in items:
        combined.setdefault(item['id'], {}).update(item)
    # A dict view is not JSON serializable on Python 3, so hand back a list.
    return list(combined.values())


if __name__ == '__main__':
    with open('output.json') as f:
        data = json.load(f)
    with open('output_combined.json', 'w') as f:
        json.dump(merge_by_id(data), f)
这是我在上述答案的帮助下得到的结果:
import scrapy
from scrapy.selector import HtmlXPathSelector
class CocabotSpider(scrapy.Spider):
    """Crawl the tallahasseearts.org event listing, one item per concert.

    Each concert item is carried through a follow-up request to its venue
    page so that the venue's website ends up in the same dict.
    """

    name = 'cocabot'
    start_urls = ['https://www.tallahasseearts.org/event/?keyword&start_date&end_date&date_format=m-d-Y&term=400&event_location&save_lst_list&view']
    custom_settings = {
        'FEED_URI': 'output/cocaoutput.json'
    }

    def parse(self, response):
        """Follow concert and pagination links found on a listing page."""
        # follow links to concert pages
        for href in response.css("div.search-img a::attr(href)"):
            yield response.follow(href, self.parse_concert)
        # follow links to pagination pages
        for href in response.css("li a.next.page-numbers::attr(href)"):
            yield response.follow(href, self.parse)

    def parse_concert(self, response):
        """Scrape a concert page, then chain a request to its venue page.

        Yields the concert dict directly when the page has no venue link;
        otherwise yields a request whose meta carries the dict to
        parse_venue().
        """

        def extract_with_css(query):
            # First match of the CSS query, or None when nothing matches.
            return response.css(query).extract_first()

        concert = {
            'headliner': extract_with_css("h1.p-ttl::text"),
            'venue': extract_with_css("div.locatn div.a-block-ct div b::text"),
            'venue_address': extract_with_css("div.locatn div.a-block-ct div p::text"),
            'venue_coca_url': extract_with_css("span.venue-event a::attr(href)"),
            # response.xpath() replaces the long-deprecated
            # HtmlXPathSelector(response).select() API.
            'event_url': response.xpath(
                "//div[@class='a-block-ct']/p/a[contains(text(), 'Official Website')]/@href"
            ).extract_first(),
            'event_coca_url': response.request.url,
            'date_time': extract_with_css("ul.ind-time li::text"),
            'price': extract_with_css("div.a-block-ct div.apl-internal-content p::text"),
        }

        venue_coca_url = concert['venue_coca_url']
        if not venue_coca_url:
            # No venue page to visit: the concert is already complete.
            yield concert
            return

        # response.follow() also accepts relative hrefs. dont_filter=True is
        # required because several concerts can share one venue URL: the
        # duplicate filter would otherwise drop the repeated request together
        # with the concert item it carries.
        yield response.follow(
            venue_coca_url,
            callback=self.parse_venue,
            meta={'item': concert},
            dont_filter=True,
        )

    def parse_venue(self, response):
        """Add the venue's website to the carried concert item and emit it."""
        item = response.meta['item']
        item['venue_website'] = response.xpath(
            "//div[@class='art-social-item']/a[contains(text(), 'Website')]/@href"
        ).extract_first()
        yield item
我是 Scrapy 的新手,这是我迄今为止最复杂的蜘蛛。
import scrapy
from scrapy.selector import HtmlXPathSelector
class CocabotSpider(scrapy.Spider):
    """Crawl the tallahasseearts.org event listing.

    Concert pages and venue pages are handled by two independent callbacks,
    so every concert and every venue website is emitted as a separate item.
    """

    name = 'cocabot'
    start_urls = ['https://www.tallahasseearts.org/event/?keyword&start_date&end_date&date_format=m-d-Y&term=400&event_location&save_lst_list&view']
    custom_settings = {
        'FEED_URI': 'output/cocaoutput.json'
    }

    def parse(self, response):
        """Follow concert, venue and pagination links found on a listing page."""
        # follow links to concert pages
        for href in response.css("div.search-img a::attr(href)"):
            yield response.follow(href, self.parse_concert)
        # follow links to venue pages
        for href in response.css("span.venue-event a::attr(href)"):
            yield response.follow(href, self.parse_venue)
        # follow links to pagination pages
        for href in response.css("li a.next.page-numbers::attr(href)"):
            yield response.follow(href, self.parse)

    def parse_concert(self, response):
        """Yield one dict describing the concert shown on ``response``."""

        def extract_with_css(query):
            # First match of the CSS query, or None when nothing matches.
            return response.css(query).extract_first()

        yield {
            'headliner': extract_with_css("h1.p-ttl::text"),
            'venue': extract_with_css("div.locatn div.a-block-ct div b::text"),
            'venue_address': extract_with_css("div.locatn div.a-block-ct div p::text"),
            'venue_coca_url': extract_with_css("span.venue-event a::attr(href)"),
            # response.xpath() replaces the long-deprecated
            # HtmlXPathSelector(response).select() API.
            'event_url': response.xpath(
                "//div[@class='a-block-ct']/p/a[contains(text(), 'Official Website')]/@href"
            ).extract_first(),
            'event_coca_url': response.request.url,
            'date_time': extract_with_css("ul.ind-time li::text"),
            'price': extract_with_css("div.a-block-ct div.apl-internal-content p::text"),
        }

    def parse_venue(self, response):
        """Yield a dict holding only the venue's external website link."""
        yield {
            'venue_website': response.xpath(
                "//div[@class='art-social-item']/a[contains(text(), 'Website')]/@href"
            ).extract_first(),
        }
这个爬虫能获取我想要的所有数据,但问题是 venue_website 数据被放在了各自独立的字典中。示例:
{"date_time": "Jun 18, 2018 at 08:00 am - 05:00 pm (Mon)", "event_url": "http://www.music.fsu.edu/Quicklinks/Summer-Music-Camps/EXPLORE-OUR-14-CAMPS/Jazz-Ensemble-Camp-for-Middle-School", "venue_coca_url": null, "venue_address": "122 N. Copeland St., Tallahassee, FL 32304", "price": "Registration for camp is now open. You can register online or by mailing in a registration form. Day Camper Price: 1.00 (Includes tuition only. No housing or meals.) Night Camper Price: 1.00 \u2013 (Includes tuition and housing with three meals per day). A 0.00 non-refundable deposit is due at registration. Balance of camp fees are due by June 4.", "venue": "FSU College of Music", "headliner": "Jazz Ensemble Camp for Middle School", "event_coca_url": "https://www.tallahasseearts.org/event/jazz-ensemble-camp-for-middle-school-3/"},
{"venue_website": "http://www.makinglightproductions.org/"},
{"venue_website": "http://www.mfbooks.us/"},
{"venue_website": null},
我该如何把 venue_website 数据合并到 parse_concert 生成的主字典中?我尝试过在 parse_concert 函数中使用 follow 语句,并让 parse_venue 用 return 而不是 yield 返回数据,但始终没能把它们组合到一起。
在scrapy中生成需要多个页面的项目有两种方式:
- 请求链(Request chaining)。
由于您需要多个请求来生成一个项目,因此您需要将它们链接起来以便按顺序运行并随身携带您的数据:
def parse_concert(self, response):
    """Build the concert item and pass it along with the venue request."""
    concert = {'name': 'red hot chilly hotdogs'}
    venue_url = 'http://someplace.com'
    # The callback must be named explicitly: without it the response is
    # routed to the spider's default parse() and parse_venue() never runs.
    yield Request(venue_url, meta={'item': concert}, callback=self.parse_venue)


def parse_venue(self, response):
    """Complete the item carried in the request meta and emit it."""
    item = response.meta['item']
    item['venue_name'] = 'someplace'
    yield item
    # {'name': 'red hot chilly hotdogs', 'venue_name': 'someplace'}
- 后处理合并(Post-processing merge)。
另一种解决方案是异步生成您的两种类型的项目,然后通过共享 ID 将它们合并:
def parse_concert(self, response):
    """Emit the concert item, then schedule the venue page separately."""
    concert = {'name': 'red hot chilly hotdogs', 'id': 1}
    yield concert
    venue_url = 'http://someplace.com'
    # Name the callback explicitly; otherwise the venue response would be
    # handled by the spider's default parse() instead of parse_venue().
    yield Request(venue_url, callback=self.parse_venue)


def parse_venue(self, response):
    """Emit the venue item; it shares 'id' with its concert for later merging."""
    item = {'venue_name': 'someplace', 'id': 1}
    yield item
然后用一个单独的脚本把它们合并:
import json


def merge_by_id(items):
    """Merge dicts that share the same 'id' into one dict per id.

    Args:
        items: iterable of dicts, each containing an 'id' key.

    Returns:
        A list with one merged dict per distinct id, in order of first
        appearance. Later items overwrite keys of earlier ones.

    Raises:
        KeyError: if an item has no 'id' key.
    """
    combined = {}
    for item in items:
        combined.setdefault(item['id'], {}).update(item)
    # A dict view is not JSON serializable on Python 3, so hand back a list.
    return list(combined.values())


if __name__ == '__main__':
    with open('output.json') as f:
        data = json.load(f)
    with open('output_combined.json', 'w') as f:
        json.dump(merge_by_id(data), f)
这是我在上述答案的帮助下得到的结果:
import scrapy
from scrapy.selector import HtmlXPathSelector
class CocabotSpider(scrapy.Spider):
    """Crawl the tallahasseearts.org event listing, one item per concert.

    Each concert item is carried through a follow-up request to its venue
    page so that the venue's website ends up in the same dict.
    """

    name = 'cocabot'
    start_urls = ['https://www.tallahasseearts.org/event/?keyword&start_date&end_date&date_format=m-d-Y&term=400&event_location&save_lst_list&view']
    custom_settings = {
        'FEED_URI': 'output/cocaoutput.json'
    }

    def parse(self, response):
        """Follow concert and pagination links found on a listing page."""
        # follow links to concert pages
        for href in response.css("div.search-img a::attr(href)"):
            yield response.follow(href, self.parse_concert)
        # follow links to pagination pages
        for href in response.css("li a.next.page-numbers::attr(href)"):
            yield response.follow(href, self.parse)

    def parse_concert(self, response):
        """Scrape a concert page, then chain a request to its venue page.

        Yields the concert dict directly when the page has no venue link;
        otherwise yields a request whose meta carries the dict to
        parse_venue().
        """

        def extract_with_css(query):
            # First match of the CSS query, or None when nothing matches.
            return response.css(query).extract_first()

        concert = {
            'headliner': extract_with_css("h1.p-ttl::text"),
            'venue': extract_with_css("div.locatn div.a-block-ct div b::text"),
            'venue_address': extract_with_css("div.locatn div.a-block-ct div p::text"),
            'venue_coca_url': extract_with_css("span.venue-event a::attr(href)"),
            # response.xpath() replaces the long-deprecated
            # HtmlXPathSelector(response).select() API.
            'event_url': response.xpath(
                "//div[@class='a-block-ct']/p/a[contains(text(), 'Official Website')]/@href"
            ).extract_first(),
            'event_coca_url': response.request.url,
            'date_time': extract_with_css("ul.ind-time li::text"),
            'price': extract_with_css("div.a-block-ct div.apl-internal-content p::text"),
        }

        venue_coca_url = concert['venue_coca_url']
        if not venue_coca_url:
            # No venue page to visit: the concert is already complete.
            yield concert
            return

        # response.follow() also accepts relative hrefs. dont_filter=True is
        # required because several concerts can share one venue URL: the
        # duplicate filter would otherwise drop the repeated request together
        # with the concert item it carries.
        yield response.follow(
            venue_coca_url,
            callback=self.parse_venue,
            meta={'item': concert},
            dont_filter=True,
        )

    def parse_venue(self, response):
        """Add the venue's website to the carried concert item and emit it."""
        item = response.meta['item']
        item['venue_website'] = response.xpath(
            "//div[@class='art-social-item']/a[contains(text(), 'Website')]/@href"
        ).extract_first()
        yield item