How can I get this Spider to export a JSON file for each Items List?
In my file Reddit.py below, I have this Spider:
import scrapy

class RedditSpider(scrapy.Spider):
    name = 'Reddit'
    allowed_domains = ['reddit.com']
    start_urls = ['https://old.reddit.com']

    def parse(self, response):
        for link in response.css('li.first a.comments::attr(href)').extract():
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_topics)

    def parse_topics(self, response):
        topics = {}
        topics["title"] = response.css('a.title::text').extract_first()
        topics["author"] = response.css('p.tagline a.author::text').extract_first()
        if response.css('div.score.likes::attr(title)').extract_first() is not None:
            topics["score"] = response.css('div.score.likes::attr(title)').extract_first()
        else:
            topics["score"] = "0"

        if int(topics["score"]) > 10000:
            author_url = response.css('p.tagline a.author::attr(href)').extract_first()
            yield scrapy.Request(url=response.urljoin(author_url), callback=self.parse_user, meta={'topics': topics})
        else:
            yield topics

    def parse_user(self, response):
        topics = response.meta.get('topics')
        users = {}
        users["name"] = topics["author"]
        users["karma"] = response.css('span.karma::text').extract_first()
        yield users
        yield topics
What it does is grab all the URLs from the front page of old.reddit, then scrape each URL's title, author and score. What I added is the second part, which checks whether the score is higher than 10000; if it is, the Spider goes to the user's page and scrapes his karma from it. I know I could scrape the karma from the topic's page, but I want to do it this way, because there are other parts of the user's page that I scrape which don't exist on the topic's page.
What I want to do is export the topics list containing title, author, score into a JSON file called topics.json, and then, whenever a topic's score is higher than 10000, export the users list containing name, karma into a JSON file called users.json.
The only command line I know is:

scrapy runspider Reddit.py -o Reddit.json

which exports all the lists into a single JSON file called Reddit.json, but with a bad structure:
[
{"name": "Username", "karma": "00000"},
{"title": "ExampleTitle1", "author": "Username", "score": "11000"},
{"name": "Username2", "karma": "00000"},
{"title": "ExampleTitle2", "author": "Username2", "score": "12000"},
{"name": "Username3", "karma": "00000"},
{"title": "ExampleTitle3", "author": "Username3", "score": "13000"},
{"title": "ExampleTitle4", "author": "Username4", "score": "9000"},
....
]
I have no clue about Scrapy's Item Pipeline, nor about Item Exporters and Feed Exporters, how to implement them in my Spider, or how to use them at all. I've tried to understand them from the documentation, but I can't seem to figure out how to use them in my Spider.
The end result I want is two files:
topics.json
[
{"title": "ExampleTitle1", "author": "Username", "score": "11000"},
{"title": "ExampleTitle2", "author": "Username2", "score": "12000"},
{"title": "ExampleTitle3", "author": "Username3", "score": "13000"},
{"title": "ExampleTitle4", "author": "Username4", "score": "9000"},
....
]
users.json
[
{"name": "Username", "karma": "00000"},
{"name": "Username2", "karma": "00000"},
{"name": "Username3", "karma": "00000"},
....
]
while also removing duplicate entries from the lists.
Your Spider yields two items when it scrapes a user page. It might work better if you merge them into one:
def parse_user(self, response):
    topics = response.meta.get('topics')
    users = {}
    users["name"] = topics["author"]
    users["karma"] = response.css('span.karma::text').extract_first()
    topics["users"] = users
    yield topics
You can then post-process the JSON however you need. By the way, I don't understand why you use the plural ("topics") when you're dealing with a single element (a single "topic").
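For example, a minimal post-processing sketch (assuming the combined output was written to Reddit.json as in the question; the filenames and the choice of "name" as the dedup key are illustrative):

import json

# Split the combined feed into topics.json and users.json,
# dropping duplicate users along the way.
with open("Reddit.json") as f:
    items = json.load(f)

topics, users, seen = [], [], set()
for item in items:
    user = item.pop("users", None)  # nested dict yielded above, if any
    topics.append(item)
    if user is not None and user["name"] not in seen:
        seen.add(user["name"])
        users.append(user)

with open("topics.json", "w") as f:
    json.dump(topics, f, indent=4)
with open("users.json", "w") as f:
    json.dump(users, f, indent=4)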
Applying the approach from the SO thread below, I created a sample scraper:
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        yield {"type": "unknown item"}
        yield {"title": "ExampleTitle1", "author": "Username", "score": "11000"}
        yield {"name": "Username", "karma": "00000"}
        yield {"name": "Username2", "karma": "00000"}
        yield {"someothertype": "unknown item"}
        yield {"title": "ExampleTitle2", "author": "Username2", "score": "12000"}
        yield {"title": "ExampleTitle3", "author": "Username3", "score": "13000"}
        yield {"title": "ExampleTitle4", "author": "Username4", "score": "9000"}
        yield {"name": "Username3", "karma": "00000"}
Then in exporters.py:
from scrapy.exporters import JsonItemExporter
from scrapy.extensions.feedexport import FileFeedStorage

class JsonMultiFileItemExporter(JsonItemExporter):
    types = ["topics", "users"]

    def __init__(self, file, **kwargs):
        super().__init__(file, **kwargs)
        self.files = {}
        self.kwargs = kwargs
        # Open one extra JSON exporter per known item type.
        for itemtype in self.types:
            storage = FileFeedStorage(itemtype + ".json")
            file = storage.open(None)
            self.files[itemtype] = JsonItemExporter(file, **self.kwargs)

    def start_exporting(self):
        super().start_exporting()
        for exporter in self.files.values():
            exporter.start_exporting()

    def finish_exporting(self):
        super().finish_exporting()
        for exporter in self.files.values():
            exporter.finish_exporting()
            exporter.file.close()

    def export_item(self, item):
        # Route each item by its fields: topics carry a "title",
        # users carry a "karma"; anything else goes to the default feed.
        if "title" in item:
            itemtype = "topics"
        elif "karma" in item:
            itemtype = "users"
        else:
            itemtype = "self"
        if itemtype == "self" or itemtype not in self.files:
            super().export_item(item)
        else:
            self.files[itemtype].export_item(item)
Then add the following to settings.py:

FEED_EXPORTERS = {
    'json': 'testing.exporters.JsonMultiFileItemExporter',
}
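With that in place the spider runs as usual, e.g. (assuming a Scrapy project named testing, matching the exporter path above):

scrapy crawl example -o example.json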
Running the crawler, I got 3 files:
example.json
[
{"type": "unknown item"},
{"someothertype": "unknown item"}
]
topics.json
[
{"title": "ExampleTitle1", "author": "Username", "score": "11000"},
{"title": "ExampleTitle2", "author": "Username2", "score": "12000"},
{"title": "ExampleTitle3", "author": "Username3", "score": "13000"},
{"title": "ExampleTitle4", "author": "Username4", "score": "9000"}
]
users.json
[
{"name": "Username", "karma": "00000"},
{"name": "Username2", "karma": "00000"},
{"name": "Username3", "karma": "00000"}
]
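To also remove duplicates, as the question asks, one option is a small Item Pipeline. A hedged sketch (the class name and the choice of key fields are mine, not from the question):

from scrapy.exceptions import DropItem

class DuplicatesPipeline:
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        # Topics are keyed by title, users by name (illustrative choice).
        key = item.get("title") or item.get("name")
        if key in self.seen:
            raise DropItem("Duplicate item: %s" % key)
        self.seen.add(key)
        return item

Enable it with ITEM_PIPELINES = {'testing.pipelines.DuplicatesPipeline': 300} in settings.py; dropped items never reach the feed exporters.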