How to export scraped data as readable json using Scrapy
I wrote a spider that saves each domain to a separate json file. I have to use CrawlSpider so that I can use Rules to visit the sublinks.
But the output files contain json data that pandas cannot read. They should contain nice, readable newline-delimited json, yet Scrapy expects the exported json to be byte-like.
The desired output format is:
{"content": "text", "scrape_date": "36456456456"}
{"content": "text", "scrape_date": "56445435435"}
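In other words, each line of the output file has to be a complete JSON object on its own (JSON Lines). As a quick sanity check, a small standard-library sketch that verifies a file has this property (using domain1.json, one of the filenames from the code below):

import json

# Every non-empty line must parse as a standalone JSON object,
# otherwise pandas.read_json(..., lines=True) will also fail on it.
with open("domain1.json", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            json.loads(line)  # raises json.JSONDecodeError on an invalid line
print("file is valid JSON Lines")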
My spider.py:
import scrapy
import time
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse

DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}

class PagingIncremental(CrawlSpider):
    name = "my_spider"
    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]
    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }
    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):
        item = {}
        # get domain from each sub page
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")
        # if domain from DICT above matches with domain from subpage
        # all sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        item["content"] = response.xpath("//p/text()").getall()
        item['scrape_date'] = int(time.time())
        yield item

if __name__ == "__main__":
    process = CrawlerProcess(settings={
    })
    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()
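As a side note on the parse() callback above: the filename lookup relies on urlparse(...).netloc with a leading www. stripped, so every sub page is mapped back to one of the two keys in DICT. A quick standalone check of that normalization for the two start URLs:

from urllib.parse import urlparse

# Same normalization as in parse(): netloc with a leading "www." removed.
for url in (
    "https://quotes.toscrape.com/page/1/",
    "https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes",
):
    print(urlparse(url).netloc.replace("www.", ""))
# prints: quotes.toscrape.com
#         stadt-koeln.de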
My pipelines.py:
from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        # if the file exists it will append the data
        JsonItemExporter(open(filename, "ab")).export_item(item)
        return item
My settings.py:
ITEM_PIPELINES = {
'<project_name>.pipelines.SaveJsonPipeline': 300,
}
If I use a instead of ab in pipelines.py to export the data in non-binary mode, Scrapy says:
JsonItemExporter(open(filename, "a")).export_item(item)
File "c:\python\lib\site-packages\scrapy\exporters.py", line 135, in export_item
self.file.write(to_bytes(data, self.encoding))
TypeError: write() argument must be str, not bytes
Any ideas and solutions are welcome!
You should use JsonLinesItemExporter instead of JsonItemExporter to get each item on a separate line.

Don't bother with the bytes, because the documentation mentions that it has to open the file in bytes mode.
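That bytes requirement is exactly what the traceback in the question shows: the exporter serializes each item to bytes, and writing bytes to a file opened in text mode raises that TypeError. A minimal sketch reproducing it without Scrapy (demo.json is just a throwaway filename):

# Writing bytes to a text-mode handle fails the same way as in the traceback.
try:
    with open("demo.json", "a") as f:       # text mode
        f.write(b'{"content": "text"}\n')
except TypeError as e:
    print(e)                                # write() argument must be str, not bytes

# A binary-mode handle ("ab") accepts the bytes that Scrapy's exporters produce.
with open("demo.json", "ab") as f:
    f.write(b'{"content": "text"}\n')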
And in pandas.read_json() you can use the option lines=True to read JSONL (newline-delimited JSON):
df = pd.read_json('domain1.json', lines=True)
Full working code below. All of the code is in one file, so everyone can simply copy and test it.

I use '__main__.SaveJsonPipeline' to load the class from the current file. I also added code to remove extra whitespace in content and join it into a single string:
" ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
import time
import scrapy
#import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        # if the file exists it will append the data
        JsonLinesItemExporter(open(filename, "ab")).export_item(item)
        return item

DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}

class PagingIncremental(CrawlSpider):
    name = "my_spider"
    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]
    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }
    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):
        item = {}
        # get domain from each sub page
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")
        # if domain from DICT above matches with domain from subpage
        # all sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        #item["content"] = [x.strip() for x in response.xpath("//p/text()").getall()]
        item["content"] = " ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
        item['scrape_date'] = int(time.time())
        yield item

if __name__ == "__main__":
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'ITEM_PIPELINES': {'__main__.SaveJsonPipeline': 1},  # use the Pipeline defined in the current file (needs '__main__.')
    })
    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()

import pandas as pd

df = pd.read_json('domain1.json', lines=True)
print(df.head())
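If the crawl succeeds, both output files are in JSON Lines format, so the same lines=True reader works for either of them (a small follow-up sketch; it assumes both domains yielded at least one item):

import pandas as pd

# Both files are written by JsonLinesItemExporter, i.e. newline-delimited JSON.
for filename in ("domain1.json", "domain2.json"):
    df = pd.read_json(filename, lines=True)
    print(filename, df.shape)  # rows = scraped pages, columns = content, scrape_date
    print(df.head())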