JQ:高效过滤包含嵌套数组的大型 json

JQ: Filtering a massive json with nested arrays efficiently

我有一个巨大的 json 文件,其中包含许多无关的字段,我需要对它进行精简,使其只保留某些字段。我在用 jq 处理该文件方面取得了一些进展,但在从 json 文件的嵌套数组中提取信息时遇到了一些困难,而且我找到的解决方案似乎都不适用于我的情况。

我的 json 数据如下所示:

{
  "results": [
    {
      "url": "https://someresult.com",
      "id": 5192740,
      "external_id": null,
      "via": {
        "channel": "web",
        "source": {
          "from": {},
          "to": {},
          "rel": null
        }
      },
      "created_at": "2022-04-29T15:19:37Z",
      "updated_at": "2022-04-29T15:19:38Z",
      "type": null,
      "subject": "My subject line",
      "raw_subject": "My subject line ",
      "description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris et nulla ut sapien ultrices tempus.",
      "priority": "normal",
      "status": "new",
      "recipient": "somebody@email.com",
      "requester_id": 1234567891,
      "submitter_id": 1234567891,
      "assignee_id": null,
      "organization_id": null,
      "group_id": 123456789,
      "collaborator_ids": [],
      "follower_ids": [],
      "email_cc_ids": [],
      "forum_topic_id": null,
      "problem_id": null,
      "has_incidents": false,
      "is_public": true,
      "due_at": null,
      "tags": [
        "_tag_1",
        "_tag_2",
        "_tag_3"
      ],
      "custom_fields": [
        {
          "id": 1500010396161,
          "value": null
        },
        {
          "id": 360009431333,
          "value": "Keep this data"
        },
        {
          "id": 360054304553,
          "value": false
        },
        {
          "id": 1900000317745,
          "value": null
        },
        {
          "id": 360002223154,
          "value": null
        },
        {
          "id": 360009431353,
          "value": "Keep this too"
        },
        {
          "id": 1500001920482,
          "value": "Keep this data, as well!"
        }
      ],
      "followup_ids": [],
      "ticket_form_id": 12345678912,
      "brand_id": 112358,
      "allow_channelback": false,
      "allow_attachments": true,
      "result_type": "ticket"
    },
    {
      "url": "https://anotherresult.com",
      "id": 5192741,
      "external_id": null,
      "via": {
        "channel": "web",
        "source": {
          "from": {},
          "to": {},
          "rel": null
        }
      },
      "created_at": "2022-04-18T15:19:37Z",
      "updated_at": "2022-04-18T15:19:38Z",
      "type": null,
      "subject": "My other subject line",
      "raw_subject": "My other subject line ",
      "description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris et nulla ut sapien ultrices tempus.",
      "priority": "normal",
      "status": "new",
      "recipient": "somebody@email.com",
      "requester_id": 1234567892,
      "submitter_id": 1234567892,
      "assignee_id": null,
      "organization_id": null,
      "group_id": 123456780,
      "collaborator_ids": [],
      "follower_ids": [],
      "email_cc_ids": [],
      "forum_topic_id": null,
      "problem_id": null,
      "has_incidents": false,
      "is_public": true,
      "due_at": null,
      "tags": [
        "_tag_1",
        "_tag_2",
        "_tag_3"
      ],
      "custom_fields": [
        {
          "id": 1500010396161,
          "value": null
        },
        {
          "id": 360009431333,
          "value": "Keep this data"
        },
        {
          "id": 360054304553,
          "value": false
        },
        {
          "id": 1900000317745,
          "value": null
        },
        {
          "id": 360002223154,
          "value": null
        },
        {
          "id": 360009431353,
          "value": "Keep this too"
        },
        {
          "id": 1500001920482,
          "value": "Keep this data, as well!"
        }
      ],
      "followup_ids": [],
      "ticket_form_id": 12345678913,
      "brand_id": 112359,
      "allow_channelback": false,
      "allow_attachments": true,
      "result_type": "ticket"
    }
  ],
  "facets": null,
  "meta": {
    "has_more": true,
    "after_cursor": "eyJmaWVsZCI6ImNyZWF0ZWRfYXQiLCJkZXNjIjp0cnVlLCJ0aWVCcmVha0ZpZWxkIjoiaWQiLCJ0aWVCcmVha0Rlc2MiOmZhbHNlLCJzb3J0VmFsdWVzIjpbMTY0NjQxNTc3MjAwMCwxNTA5NDY0NjMzNTYyXSwiZXhwb3J0ZWRUaHVzRmFyIjoxMDAwLCJzZXNzaW9uU3RhcnQiOjE2NTE1MTA1MDE3MDksImNyZWF0ZWRBdCI6MTY1MTUxMDUwMTgxNywic2FsdGVkUmVxdWVzdEhhc2giOjEwMTMwNTk0MjMsInNhbHRlZEN1cnNvckhhc2giOi0xMTE3Mzc0MjIxfQ==",
    "before_cursor": null
  },
  "links": {
    "prev": null,
    "next": "https://myendpoint.site.com/api/v2/search/export.json?filter%5Btype%5D=ticket&page%5Bafter%5D=eyJmaWVsZCI6ImNyZWF0ZWRfYXQiLCJkZXNjIjp0cnVlLCJ0aWVCcmVha0ZpZWxkIjoiaWQiLCJ0aWVCcmVha0Rlc2MiOmZhbHNlLCJzb3J0VmFsdWVzIjpbMTY0NjQxNTc3MjAwMCwxNTA5NDY0NjMzNTYyXSwiZXhwb3J0ZWRUaHVzRmFyIjoxMDAwLCJzZXNzaW9uU3RhcnQiOjE2NTE1MTA1MDE3MDksImNyZWF0ZWRBdCI6MTY1MTUxMDUwMTgxNywic2FsdGVkUmVxdWVzdEhhc2giOjEwMTMwNTk0MjMsInNhbHRlZEN1cnNvckhhc2giOi0xMTE3Mzc0MjIxfQ%3D%3D&page%5Bsize%5D=1000&query=group%3A360000609273+created%3E6Months"
  }
}

我想把它精简为:

{
    "results": [
        {
            "url": "https://someresult.com",
            "created_at": "2022-04-29T15:19:37Z",
            "subject": "My subject line",
            "description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris et nulla ut sapien ultrices tempus.",
            "recipient": "somebody@email.com",
            "tags": [
                "_tag_1",
                "_tag_2",
                "_tag_3"
            ],
            "os": "Keep this data",
            "is_signed_in": false,
            "phone_model": "Keep this too",
            "channel": "Keep this data, as well!"
        }
    ]
}

到目前为止,我已经能够用蛮力的方式做到这一点:把所有要删除的字段逐个列出来

# Brute-force variant: delete every unwanted key from each element of .results.
# .url is deliberately not listed (the desired output keeps it) and .brand_id
# is listed (the desired output drops it). custom_fields is left untouched.
jq 'del(.results[] |
  .id, .external_id, .via, .updated_at, .type, .raw_subject, .priority,
  .status, .requester_id, .submitter_id, .assignee_id, .organization_id,
  .group_id, .collaborator_ids, .follower_ids, .email_cc_ids, .problem_id,
  .has_incidents, .is_public, .due_at, .forum_topic_id, .satisfaction_rating,
  .sharing_agreement_ids, .fields, .followup_ids, .ticket_form_id, .brand_id,
  .allow_channelback, .allow_attachments, .result_type)'

但这种做法感觉很笨拙(尽管确实有效)。而且当我尝试从 custom_fields 数组中删除或筛选出我想要的字段时,我卡住了 (Cannot index array with string "custom fields").

我的问题是双重的:

  1. 有没有更简洁的方法可以让我选择保留哪些字段而不是指定删除哪些字段?
  2. 如何从嵌套数组中抓取我需要的字段,并将它们展平到与其余字段相同的级别,同时重命名它们?

下面演示了同时解决这两个问题的一种方法:

# Keep a fixed set of top-level keys from every result and flatten four
# custom fields into the same object under new names.
.results |= map(
  # Build an {"<id>": value} lookup so that each custom field is selected by
  # its id rather than by its position among the non-null values.
  ((.custom_fields // []) | map({(.id | tostring): .value}) | add) as $by_id
  | {url, created_at, subject, description, recipient, tags,
     os:           $by_id["360009431333"],
     is_signed_in: $by_id["360054304553"],
     phone_model:  $by_id["360009431353"],
     channel:      $by_id["1500001920482"]} )

不过请注意,由于示例输入包含两条记录,这样得到的 .results 长度为 2(而问题中的期望输出只列出了一条)。

您可以使用映射来跟踪所需的自定义字段:

#!/usr/bin/env bash

# Trim every element of .results in input.json down to a fixed set of keys and
# add four custom fields, looked up by id, under the names given in $mapping.
jq '{"os"          : 360009431333,
     "is_signed_in": 360054304553,
     "phone_model" : 360009431353,
     "channel"     : 1500001920482} as $mapping |
.results |= map(
   { url, created_at, subject, description, recipient, tags } +
   (.custom_fields as $custom_fields |
    $mapping |
    with_entries(
      # Collect the matches into an array and take element 0, so the update
      # always produces exactly one value: null when the id is absent, the
      # first match when the id is repeated.
      .value |= (. as $id
                 | [$custom_fields[]? | select(.id == $id) | .value]
                 | .[0])
    )
   )
)' input.json