如何使用 BigQuery 提取 JSON 对象中的所有键

How to extract all the keys in a JSON object with BigQuery

BigQuery 支持在交互式查询中实时解析 JSON:只需将 JSON 编码的对象存储为字符串,然后使用 JSON_EXTRACT_SCALAR 等函数实时查询即可。

但是,我找不到发现这些对象中所有键(属性)的方法。

我可以为此使用 UDF 吗?

如何在 BigQuery 中使用 JavaScript UDF 提取所有 JSON 对象键:

// Lists the top-level keys found in each JSON document together with the
// record type, using a legacy SQL inline JavaScript UDF.
// Output columns: type, key. Only the first 100 rows are returned.
SELECT type, key
FROM js(
  // Input rows: the raw JSON text and the record type.
  (SELECT json, type FROM [fh-bigquery:openlibrary.ol_dump_20151231]
  ),
  // Input columns.
  json, type,
  // Output schema.
  "[{name: 'key', type:'string'},
   {name: 'type', type:'string'}]",
  // The function: emits one (key, type) row per top-level key.
  "function(r, emit) {
    var doc = JSON.parse(r.json);
    /* A NULL column value, a JSON null or a JSON scalar has no keys.
       Skip such rows instead of letting Object.keys() throw, which
       would abort the whole query. Malformed JSON still raises. */
    if (doc === null || typeof doc !== 'object') {
      return;
    }
    Object.keys(doc).forEach(function(entry) {
      emit({key: entry, type: r.type});
    });
  }"
)
LIMIT 100

分组计数:

找到所有可以使用的键后,您就可以在正常的 SQL 查询中使用 JSON_EXTRACT_SCALAR:

现在您知道了键,您可以提取某个类型的所有已知信息:

# Projects the known attributes of '/type/edition' records out of the raw
# JSON column, one output column per JSON path (first 10 rows only).
# Some fields are read with JSON_EXTRACT_SCALAR; the others use
# JSON_EXTRACT, which returns the JSON-encoded fragment.
SELECT
  JSON_EXTRACT_SCALAR(json, '$.key') AS key,
  JSON_EXTRACT_SCALAR(json, '$.type.key') AS type,
  JSON_EXTRACT(json, '$.revision') AS revision,
  JSON_EXTRACT_SCALAR(json, '$.last_modified.value') AS last_modified,
  JSON_EXTRACT_SCALAR(json, '$.title') AS title,
  JSON_EXTRACT_SCALAR(json, '$.publish_date') AS publish_date,
  JSON_EXTRACT(json, '$.publishers') AS publishers,
  JSON_EXTRACT(json, '$.latest_revision') AS latest_revision,
  JSON_EXTRACT(json, '$.languages') AS languages,
  JSON_EXTRACT(json, '$.authors') AS authors,
  JSON_EXTRACT(json, '$.works') AS works,
  JSON_EXTRACT(json, '$.number_of_pages') AS number_of_pages,
  JSON_EXTRACT(json, '$.publish_places') AS publish_places,
  JSON_EXTRACT(json, '$.publish_country') AS publish_country,
  JSON_EXTRACT(json, '$.subjects') AS subjects,
  JSON_EXTRACT_SCALAR(json, '$.created.value') AS created,
  JSON_EXTRACT_SCALAR(json, '$.pagination') AS pagination,
  JSON_EXTRACT_SCALAR(json, '$.by_statement') AS by_statement,
  JSON_EXTRACT(json, '$.isbn_10') AS isbn_10,
  JSON_EXTRACT_SCALAR(json, '$.isbn_10[0]') AS isbn_10_0,
  JSON_EXTRACT(json, '$.notes') AS notes,
  JSON_EXTRACT(json, '$.lc_classifications') AS lc_classifications,
  JSON_EXTRACT_SCALAR(json, '$.subtitle') AS subtitle,
  JSON_EXTRACT(json, '$.lccn') AS lccn,
  JSON_EXTRACT(json, '$.identifiers') AS identifiers,
  JSON_EXTRACT(json, '$.contributions') AS contributions,
  JSON_EXTRACT(json, '$.isbn_13') AS isbn_13,
  JSON_EXTRACT_SCALAR(json, '$.isbn_13[0]') AS isbn_13_0,
  JSON_EXTRACT(json, '$.physical_format') AS physical_format,
  JSON_EXTRACT(json, '$.oclc_numbers') AS oclc_numbers,
  JSON_EXTRACT(json, '$.series') AS series,
  JSON_EXTRACT(json, '$.source_records') AS source_records,
  JSON_EXTRACT(json, '$.covers') AS covers,
  JSON_EXTRACT(json, '$.dewey_decimal_class') AS dewey_decimal_class,
  JSON_EXTRACT_SCALAR(json, '$.edition_name') AS edition_name,
  # ...
FROM
  [fh-bigquery:openlibrary.ol_dump_20151231]
WHERE
  type = '/type/edition'
LIMIT 10

(示例数据取自 Open Library 数据转储 https://openlibrary.org/developers/dumps ,源自 reddit 上的一次讨论。)

以下版本修复了原始答案中的一些问题,例如:
1. 只输出了第一层的键
2. 必须手动编写并运行最终查询,才能根据发现的键提取信息
// Counts how often each (key path, value) pair occurs in the
// '/type/edition' records. Nested objects are flattened into dotted paths
// such as 'created.value'; array positions appear as numeric path parts
// such as 'languages.0.key'.
// Output columns: type, key, value, weight (top 1000 by weight).
SELECT type, key, value, COUNT(1) AS weight
FROM JS(
  (SELECT json, type
     FROM [fh-bigquery:openlibrary.ol_dump_20151231@0]
     WHERE type = '/type/edition'
  ),
  // Input columns.
  json, type,
  // Output schema.
  "[{name: 'type', type:'string'},
   {name: 'key', type:'string'},
   {name: 'value', type:'string'}]",
  // The function: emits one (type, key, value) row per leaf.
  "function(r, emit) {
      /* True for objects, and for arrays holding at least one object or
         array. Arrays of plain scalars are treated as leaves and emitted
         as their comma-joined string. */
      function hasNestedKeys(value) {
        if (value === null || typeof value !== 'object') {
          return false;
        }
        if (!Array.isArray(value)) {
          return true;
        }
        return value.some(function(item) {
          return item !== null && typeof item === 'object';
        });
      }

      /* Walks node depth-first, emitting a row for every leaf. */
      function processKey(node, parent) {
        var prefix = parent === '' ? '' : parent + '.';
        Object.keys(node).forEach(function(key) {
          var value = node[key];
          if (hasNestedKeys(value)) {
            processKey(value, prefix + key);
          } else {
            /* JSON null has no toString(); emit it as a NULL value
               instead of throwing and aborting the query. */
            emit({
              type: r.type,
              key: prefix + key,
              value: value === null ? null : value.toString()
            });
          }
        });
      }

      var doc = JSON.parse(r.json);
      /* A NULL column, JSON null or JSON scalar has no keys to report. */
      if (doc !== null && typeof doc === 'object') {
        processKey(doc, '');
      }
    }"
  )
GROUP EACH BY type, key, value
ORDER BY weight DESC
LIMIT 1000

结果如下

Row          type   key                 value                         weight     
1   /type/edition   type.key            /type/edition               25140209     
2   /type/edition   last_modified.type  /type/datetime              25140209     
3   /type/edition   created.type        /type/datetime              17092292     
4   /type/edition   languages.0.key     /languages/eng              14514830     
5   /type/edition   notes.type          /type/text                  11681480     
6   /type/edition   revision            2                            8714084     
7   /type/edition   latest_revision     2                            8704217     
8   /type/edition   revision            3                            5041680     
9   /type/edition   latest_revision     3                            5040634     
10  /type/edition   created.value       2008-04-01T03:28:50.625462   3579095     
11  /type/edition   revision            1                            3396868     
12  /type/edition   physical_format     Paperback                    3181270     
13  /type/edition   revision            4                            3053266     
14  /type/edition   latest_revision     4                            3053197     
15  /type/edition   revision            5                            2076094     
16  /type/edition   latest_revision     5                            2076072     
17  /type/edition   publish_country     nyu                          1727347     
18  /type/edition   created.value       2008-04-30T09:38:13.731961   1681227     
19  /type/edition   publish_country     enk                          1627969     
20  /type/edition   publish_places      London                       1613755     
21  /type/edition   physical_format     Hardcover                    1495864     
22  /type/edition   publish_places      New York                     1467779     
23  /type/edition   revision            6                            1437467     
24  /type/edition   latest_revision     6                            1437463     
25  /type/edition   publish_country     xxk                          1407624 

这是我想出的方案(专门针对 Standard SQL)。不确定把结果累积到一个列表中是否是最好的做法……另外,我做了简化,只处理我关心的情况:只提取键。

-- Returns the dotted path of every leaf key in a JSON document.
--
-- infoo: JSON-encoded text.
-- Returns: one path per leaf, e.g. 'a', 'b.c', 'e.0.f'. Array positions
--   appear as numeric path parts. Keys whose value is JSON null are
--   reported like any other leaf. Arrays of plain scalars count as a
--   single leaf.
--   NULL is returned when the input is NULL, is JSON null, or cannot be
--   parsed; an empty array is returned for a JSON scalar.
CREATE TEMPORARY FUNCTION Foo(infoo STRING)
RETURNS Array<String>
LANGUAGE js AS """
      // True for objects, and for arrays holding at least one object or
      // array; such values are descended into rather than reported.
      function hasNestedKeys(value) {
        if (value === null || typeof value !== 'object') {
          return false;
        }
        if (!Array.isArray(value)) {
          return true;
        }
        return value.some(function(item) {
          return item !== null && typeof item === 'object';
        });
      }

      // Appends the dotted path of every leaf below node to paths.
      function processKey(node, parent, paths) {
        var prefix = parent === '' ? '' : parent + '.';
        Object.keys(node).forEach(function(key) {
          var value = node[key];
          if (hasNestedKeys(value)) {
            processKey(value, prefix + key, paths);
          } else {
            // No toString() on the value: a JSON null must not throw,
            // otherwise every key of the document would be lost.
            paths.push(prefix + key);
          }
        });
        return paths;
      }

    try {
      var root = JSON.parse(infoo);
      if (root === null) {
        return null;
      }
      if (typeof root !== 'object') {
        return [];
      }
      return processKey(root, '', []);
    } catch (e) {
      // Best effort: an unparsable row yields NULL instead of failing
      // the whole query.
      return null;
    }

"""
OPTIONS ();
-- Distinct values returned by Foo(jsonfield) across all rows of
-- clickstream.clikcs, one output row per distinct array element.
WITH parsed AS (
  SELECT Foo(jsonfield) AS bbb
  FROM clickstream.clikcs
)
SELECT DISTINCT arr_item
FROM parsed
CROSS JOIN UNNEST(parsed.bbb) AS arr_item

下面是使用 Standard SQL 的写法:

-- Returns the top-level keys of a JSON-encoded object.
--
-- input: JSON-encoded text.
-- Returns: the object's own keys in Object.keys() order. An empty array
--   is returned when the input is NULL or parses to JSON null or a JSON
--   scalar. Malformed JSON still raises an error.
CREATE TEMP FUNCTION jsonObjectKeys(input STRING)
RETURNS Array<String>
LANGUAGE js AS """
  var parsed = JSON.parse(input);
  // Object.keys(null) throws and would abort the whole query; a string
  // root would otherwise report its character indices as keys.
  if (parsed === null || typeof parsed !== 'object') {
    return [];
  }
  return Object.keys(parsed);
""";
-- Distinct top-level keys found in the non-NULL values of myColumn,
-- sorted by key name.
WITH per_row AS (
  SELECT jsonObjectKeys(myColumn) AS key_list
  FROM myProject.myTable
  WHERE myColumn IS NOT NULL
)
SELECT DISTINCT k
FROM per_row
CROSS JOIN UNNEST(per_row.key_list) AS k
ORDER BY k

以上答案在当前(2021 年)版本中效果不佳:当 JSON 字段为 null 或 JSON 中含有 null 条目时会失败,聚合效果也不理想(我们想得到的是结构,而不是内容),等等。

因此,下面是在 Felipe Hoffa 的答案基础上改进的版本。

它是完全递归的;会检查 null 和 Array 类型;把数组索引统一显示为 [];被标记为 DETERMINISTIC,因此结果可以被缓存;并对结果进行分组、排序和计数。

示例输出:

key type    n
""  null    213
avatar  string  1046
blinking    boolean 1046
created_at  string  1046
deprecated_fields   Array   1046
display_name    string  1046
fields  Array   1046
fields.[]   Object  31
fields.[].name  string  31
fields.[].value string  31
fields.[].verified_at   null    27
fields.[].verified_at   string  4
friends_count   number  1046

注:

  1. 空字符串键意味着该字段本身实际上是空的
  2. deprecated_fields 键没有子项,是因为 JSON 的所有样本中它都是空数组:..., deprecated_fields: [], ...
  3. null 作为字符串 "null" 返回,与其他类型一样(不是 SQL null)

还可以进一步改进,以识别不同类型的数字(int、bigint、float、decimal)、日期、以字符串形式存储的数字等。不过对我的用途来说这已经足够了,而且那样做需要更多的处理。

只需更改最后几行中的 your-* 位:

-- Describes the structure of a JSON document as (key path, type) pairs.
--
-- input: JSON-encoded text.
-- Returns: one entry per key, parents before their children.
--   key  - dotted path; every array position is written as '[]'.
--   type - 'null', 'string', 'number', 'boolean', 'Object' or 'Array'
--          ('null' is the string, not SQL NULL).
--   When the document itself is null (or the input is SQL NULL) or a JSON
--   scalar, a single entry with an empty key and the root's type is
--   returned. Malformed JSON raises an error.
CREATE TEMP FUNCTION jsonParsed(input STRING)
RETURNS Array<Struct<key STRING, type STRING>>
DETERMINISTIC LANGUAGE js AS 
"""
// Type label for a parsed JSON value. Array.isArray/typeof are used rather
// than value.constructor.name, which breaks when the JSON data itself has
// a property called 'constructor'.
function typeOf(value) {
    if (value === null) {
        return 'null';
    }
    if (Array.isArray(value)) {
        return 'Array';
    }
    if (typeof value === 'object') {
        return 'Object';
    }
    return typeof value;
}

// Appends one entry per child of node to out, descending into nested
// objects and arrays right after their own entry.
function processKey(node, parent, out) {
    var prefix = parent === '' ? '' : parent + '.';
    var inArray = Array.isArray(node);
    Object.keys(node).forEach(function(key) {
        var child = node[key];
        // Array indices are collapsed so all elements share one path.
        var path = prefix + (inArray ? '[]' : key);
        out.push({
            key: path,
            type: typeOf(child)
        });
        if (child !== null && typeof child === 'object') {
            processKey(child, path, out);
        }
    });
    return out;
}

var root = JSON.parse(input);
if (root === null || typeof root !== 'object') {
    // Nothing to walk: report the root itself under the empty key.
    return [{
        key: '',
        type: typeOf(root)
    }];
}
return processKey(root, '', []);
""";

-- Counts how often each (key, type) pair occurs across all rows, sorted
-- by key. Replace the your-* placeholders before running.
WITH parsed AS (
  SELECT jsonParsed(your-json-field) AS entries
  FROM `your-project-id.your-database-id.your-table-id`
)
SELECT
  key,
  type,
  COUNT(*) AS n
FROM parsed AS p
CROSS JOIN UNNEST(p.entries) AS e
GROUP BY key, type
ORDER BY key ASC;