How to export scraped data as readable json using Scrapy

Based on that, I wrote a spider that saves each domain into a separate json file. I have to use CrawlSpider so that I can use Rules to visit the sublinks.

But the files contain JSON data that pandas cannot read. It should be nice, readable newline-delimited JSON, but Scrapy expects the exported JSON to be byte-like.

The desired output format is:

{"content": "text", "scrape_date": "36456456456"}
{"content": "text", "scrape_date": "56445435435"}

My spider.py:

import scrapy
import time
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse


DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}


class PagingIncremental(CrawlSpider):
    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }
    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):

        item = {}

        # get domain from each sub page 
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")

        # if domain from DICT above matches with domain from subpage
        # all sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        item["content"] = response.xpath("//p/text()").getall() 
        item['scrape_date'] = int(time.time())

        yield item


if __name__ == "__main__":
    process = CrawlerProcess(settings={
    })

    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()

My pipelines.py:

from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):

        filename = item['filename']
        del item['filename']

        # if the file exists it will append the data 
        JsonItemExporter(open(filename, "ab")).export_item(item)

        return item

My settings.py:

ITEM_PIPELINES = {
   '<project_name>.pipelines.SaveJsonPipeline': 300,
}

If I use "a" instead of "ab" in pipelines.py to export the data in a non-binary format, Scrapy says:

 JsonItemExporter(open(filename, "a")).export_item(item)
  File "c:\python\lib\site-packages\scrapy\exporters.py", line 135, in export_item
    self.file.write(to_bytes(data, self.encoding))
TypeError: write() argument must be str, not bytes

Any ideas and solutions are appreciated!

You should use JsonLinesItemExporter instead of JsonItemExporter to get each item on a separate line.
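To see the difference, here is a minimal standalone sketch (using a hypothetical sample item and an in-memory buffer): JsonItemExporter wraps all items in a single JSON array, so appending to the same file item by item produces several arrays back to back, while JsonLinesItemExporter writes one self-contained JSON object per line.

from io import BytesIO
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

item = {"content": "text", "scrape_date": 36456456456}  # hypothetical sample item

# JsonItemExporter: everything goes into one JSON array "[...]"
buf = BytesIO()
exporter = JsonItemExporter(buf)
exporter.start_exporting()
exporter.export_item(item)
exporter.finish_exporting()
print(buf.getvalue())

# JsonLinesItemExporter: one JSON object per line (JSON Lines / JSONL)
buf = BytesIO()
JsonLinesItemExporter(buf).export_item(item)
print(buf.getvalue())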

Don't bother about the bytes, because the documentation mentions that the file has to be opened in bytes mode.
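That is also where the TypeError from the question comes from: the exporter writes bytes, so a text-mode file handle rejects them. A small sketch (with a hypothetical sample item and file name) to illustrate:

from scrapy.exporters import JsonLinesItemExporter

item = {"content": "text", "scrape_date": 36456456456}  # hypothetical sample item

# binary append mode ("ab"): works, the exporter writes bytes
with open('demo.json', 'ab') as f:
    JsonLinesItemExporter(f).export_item(item)

# text mode ("a"): the underlying write() expects str, not bytes
try:
    with open('demo.json', 'a') as f:
        JsonLinesItemExporter(f).export_item(item)
except TypeError as e:
    print(e)  # write() argument must be str, not bytes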

And in pandas.read_json() you can use the option lines=True to read JSONL (newline-delimited JSON):

df = pd.read_json('domain1.json', lines=True)
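If you want both domains in one DataFrame, a possible follow-up (assuming both output files exist after the crawl) is to read and concatenate them:

import pandas as pd

# read each JSON Lines file and stack them into one DataFrame
frames = [pd.read_json(name, lines=True) for name in ('domain1.json', 'domain2.json')]
df = pd.concat(frames, ignore_index=True)
print(df.head())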

Full working code below.

All the code is in a single file so everyone can simply copy and test it.

I use '__main__.SaveJsonPipeline' to load the pipeline class from the current file.

I also added code to strip the whitespace in content and join it into a single string:

" ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()

import time
import scrapy
#import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter


class SaveJsonPipeline:
    def process_item(self, item, spider):

        filename = item['filename']
        del item['filename']

        # open in binary append mode ("ab") so every item for the same
        # domain is appended to the same file
        with open(filename, "ab") as f:
            JsonLinesItemExporter(f).export_item(item)

        return item


DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}


class PagingIncremental(CrawlSpider):
    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }
    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):

        item = {}

        # get domain from each sub page 
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")

        # if domain from DICT above matches with domain from subpage
        # all sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        #item["content"] = [x.strip() for x in response.xpath("//p/text()").getall()]
        item["content"] = " ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
        item['scrape_date'] = int(time.time())

        yield item


if __name__ == "__main__":
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'ITEM_PIPELINES': {'__main__.SaveJsonPipeline': 1},  # use the Pipeline class defined in the current file (needs the '__main__.' prefix)
    })

    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()

    import pandas as pd
    df = pd.read_json('domain1.json', lines=True)
    print(df.head())