如何使用 BigQuery 提取 JSON 对象中的所有键
How to extract all the keys in a JSON object with BigQuery
BigQuery 可以在实时交互式查询中解析 JSON:只需将 JSON 编码的对象存储为字符串,然后使用 JSON_EXTRACT_SCALAR 等函数实时查询。
但是,我找不到发现这些对象中所有键(属性)的方法。
我可以为此使用 UDF 吗?
如何在 BigQuery 中使用 JavaScript UDF 提取所有 JSON 对象键:
// Lists the top-level keys of each JSON document together with the row's
// `type` column (BigQuery legacy SQL, inline JavaScript UDF via js()).
// Rows whose `json` is not valid JSON, or whose JSON is not an object/array,
// emit nothing instead of aborting the whole query.
SELECT type, key
FROM js(
  (SELECT json, type FROM [fh-bigquery:openlibrary.ol_dump_20151231]
  ),
  // Input columns.
  json, type,
  // Output schema.
  "[{name: 'key', type:'string'},
    {name: 'type', type:'string'}]",
  // The function.
  "function(r, emit) {
    var doc;
    try {
      doc = JSON.parse(r.json);
    } catch (e) {
      // Unparsable row: nothing to report for it.
      return;
    }
    // Object.keys is only meaningful for objects/arrays; skip null and scalars.
    if (doc === null || typeof doc !== 'object') {
      return;
    }
    Object.keys(doc).forEach(function(entry) {
      emit({key: entry, type: r.type});
    });
  }"
)
LIMIT 100
分组计数:
找到所有可以使用的键后,您就可以在正常的 SQL 查询中使用 JSON_EXTRACT_SCALAR:
现在您知道了键,您可以提取某个类型的所有已知信息:
# Projects the known attributes of every '/type/edition' row out of the raw
# `json` column, one output column per attribute.
# Some attributes are read with JSON_EXTRACT_SCALAR and the rest with
# JSON_EXTRACT; see the BigQuery JSON functions reference for the difference.
SELECT
  JSON_EXTRACT_SCALAR(json, '$.key') AS key,
  JSON_EXTRACT_SCALAR(json, '$.type.key') AS type,
  JSON_EXTRACT(json, '$.revision') AS revision,
  JSON_EXTRACT_SCALAR(json, '$.last_modified.value') AS last_modified,
  JSON_EXTRACT_SCALAR(json, '$.title') AS title,
  JSON_EXTRACT_SCALAR(json, '$.publish_date') AS publish_date,
  JSON_EXTRACT(json, '$.publishers') AS publishers,
  JSON_EXTRACT(json, '$.latest_revision') AS latest_revision,
  JSON_EXTRACT(json, '$.languages') AS languages,
  JSON_EXTRACT(json, '$.authors') AS authors,
  JSON_EXTRACT(json, '$.works') AS works,
  JSON_EXTRACT(json, '$.number_of_pages') AS number_of_pages,
  JSON_EXTRACT(json, '$.publish_places') AS publish_places,
  JSON_EXTRACT(json, '$.publish_country') AS publish_country,
  JSON_EXTRACT(json, '$.subjects') AS subjects,
  JSON_EXTRACT_SCALAR(json, '$.created.value') AS created,
  JSON_EXTRACT_SCALAR(json, '$.pagination') AS pagination,
  JSON_EXTRACT_SCALAR(json, '$.by_statement') AS by_statement,
  JSON_EXTRACT(json, '$.isbn_10') AS isbn_10,
  JSON_EXTRACT_SCALAR(json, '$.isbn_10[0]') AS isbn_10_0,
  JSON_EXTRACT(json, '$.notes') AS notes,
  JSON_EXTRACT(json, '$.lc_classifications') AS lc_classifications,
  JSON_EXTRACT_SCALAR(json, '$.subtitle') AS subtitle,
  JSON_EXTRACT(json, '$.lccn') AS lccn,
  JSON_EXTRACT(json, '$.identifiers') AS identifiers,
  JSON_EXTRACT(json, '$.contributions') AS contributions,
  JSON_EXTRACT(json, '$.isbn_13') AS isbn_13,
  JSON_EXTRACT_SCALAR(json, '$.isbn_13[0]') AS isbn_13_0,
  JSON_EXTRACT(json, '$.physical_format') AS physical_format,
  JSON_EXTRACT(json, '$.oclc_numbers') AS oclc_numbers,
  JSON_EXTRACT(json, '$.series') AS series,
  JSON_EXTRACT(json, '$.source_records') AS source_records,
  JSON_EXTRACT(json, '$.covers') AS covers,
  JSON_EXTRACT(json, '$.dewey_decimal_class') AS dewey_decimal_class,
  JSON_EXTRACT_SCALAR(json, '$.edition_name') AS edition_name,
  # ... (further attributes go here)
FROM [fh-bigquery:openlibrary.ol_dump_20151231]
WHERE type='/type/edition'
LIMIT 10
(示例数据取自 Open Library 数据转储 https://openlibrary.org/developers/dumps ,本回答基于 reddit 上的一次讨论)
以下版本修复了原始答案中的一些 "issues",例如:
1. 只发出了一级键
2. 必须手动编写并运行最终查询,才能根据发现的键提取信息
// Counts how often each (type, key, value) triple occurs among the
// '/type/edition' rows (BigQuery legacy SQL, inline JavaScript UDF via JS()).
// Nested objects are flattened into dotted paths such as created.value;
// arrays that hold objects are walked by index (languages.0.key), while
// arrays holding only scalars are emitted as one comma-joined value.
SELECT type, key, value, COUNT(1) AS weight
FROM JS(
  (SELECT json, type
   FROM [fh-bigquery:openlibrary.ol_dump_20151231@0]
   WHERE type = '/type/edition'
  ),
  // Input columns.
  json, type,
  // Output schema.
  "[{name: 'type', type:'string'},
    {name: 'key', type:'string'},
    {name: 'value', type:'string'}]",
  // The function.
  "function(r, emit) {
    // True for objects and arrays; false for null and scalars.
    function isContainer(v) {
      return v !== null && typeof v === 'object';
    }
    // Objects are always walked; arrays only when they hold an object/array.
    function shouldDescend(v) {
      if (!isContainer(v)) { return false; }
      return !Array.isArray(v) || v.some(isContainer);
    }
    function processKey(node, prefix) {
      Object.keys(node).forEach(function(key) {
        var child = node[key];
        var path = prefix === '' ? key : prefix + '.' + key;
        if (shouldDescend(child)) {
          processKey(child, path);
        } else {
          // String() also copes with JSON null, which has no toString().
          emit({type: r.type, key: path, value: String(child)});
        }
      });
    }
    var doc;
    try {
      doc = JSON.parse(r.json);
    } catch (e) {
      // Unparsable row: nothing to report for it.
      return;
    }
    if (isContainer(doc)) {
      processKey(doc, '');
    }
  }"
)
GROUP EACH BY type, key, value
ORDER BY weight DESC
LIMIT 1000
结果如下
Row type key value weight
1 /type/edition type.key /type/edition 25140209
2 /type/edition last_modified.type /type/datetime 25140209
3 /type/edition created.type /type/datetime 17092292
4 /type/edition languages.0.key /languages/eng 14514830
5 /type/edition notes.type /type/text 11681480
6 /type/edition revision 2 8714084
7 /type/edition latest_revision 2 8704217
8 /type/edition revision 3 5041680
9 /type/edition latest_revision 3 5040634
10 /type/edition created.value 2008-04-01T03:28:50.625462 3579095
11 /type/edition revision 1 3396868
12 /type/edition physical_format Paperback 3181270
13 /type/edition revision 4 3053266
14 /type/edition latest_revision 4 3053197
15 /type/edition revision 5 2076094
16 /type/edition latest_revision 5 2076072
17 /type/edition publish_country nyu 1727347
18 /type/edition created.value 2008-04-30T09:38:13.731961 1681227
19 /type/edition publish_country enk 1627969
20 /type/edition publish_places London 1613755
21 /type/edition physical_format Hardcover 1495864
22 /type/edition publish_places New York 1467779
23 /type/edition revision 6 1437467
24 /type/edition latest_revision 6 1437463
25 /type/edition publish_country xxk 1407624
这是我想出的方案(专门针对 Standard SQL)。我不确定把结果累积到列表中是否是最好的方法……另外,我针对自己的情况做了简化:我只关心键。
-- Foo(infoo): returns the dotted path of every leaf in the JSON document
-- `infoo` (for example a, b.c, e.0.f).
-- Returns NULL when `infoo` is NULL, is the JSON literal null, or is not valid
-- JSON; returns an empty array for any other top-level scalar.
-- Arrays holding only scalars count as a single leaf; arrays that hold
-- objects are walked by index.
CREATE TEMPORARY FUNCTION Foo(infoo STRING)
RETURNS Array<String>
LANGUAGE js AS """
  var paths = [];

  // True for objects and arrays; false for null and scalars.
  function isContainer(v) {
    return v !== null && typeof v === 'object';
  }

  // Objects are always walked; arrays only when they hold an object/array.
  function shouldDescend(v) {
    if (!isContainer(v)) { return false; }
    return !Array.isArray(v) || v.some(isContainer);
  }

  function processKey(node, prefix) {
    Object.keys(node).forEach(function(key) {
      var child = node[key];
      var path = prefix === '' ? key : prefix + '.' + key;
      if (shouldDescend(child)) {
        processKey(child, path);
      } else {
        // Leaf (including JSON null): record the path only.
        paths.push(path);
      }
    });
  }

  try {
    var doc = JSON.parse(infoo);
    if (doc === null) { return null; }
    if (isContainer(doc)) { processKey(doc, ''); }
    return paths;
  } catch (e) {
    // Invalid JSON yields SQL NULL rather than failing the query.
    return null;
  }
"""
OPTIONS ();
-- Distinct key paths over all rows: each row's array returned by Foo is
-- flattened with UNNEST and duplicates are removed.
WITH row_keys AS (
  SELECT Foo(jsonfield) AS key_paths
  FROM clickstream.clikcs
)
SELECT DISTINCT arr_item
FROM row_keys
CROSS JOIN UNNEST(row_keys.key_paths) AS arr_item
下面是使用 Standard SQL 的一种写法:
-- jsonObjectKeys(input): returns the top-level keys of the JSON document
-- `input` (array indices when the document is a JSON array).
-- Returns NULL when `input` is not valid JSON, and an empty array when it
-- parses to null or to a scalar.
CREATE TEMP FUNCTION jsonObjectKeys(input STRING)
RETURNS Array<String>
LANGUAGE js AS """
  var parsed;
  try {
    parsed = JSON.parse(input);
  } catch (e) {
    // Invalid JSON yields SQL NULL rather than failing the query.
    return null;
  }
  // null and scalars have no keys; Object.keys would throw on null and
  // would list character indices for a string.
  if (parsed === null || typeof parsed !== 'object') {
    return [];
  }
  return Object.keys(parsed);
""";
-- Sorted list of the distinct top-level keys seen in myColumn.
-- NULL cells are filtered out before the UDF is called.
WITH row_keys AS (
  SELECT jsonObjectKeys(myColumn) AS key_list
  FROM myProject.myTable
  WHERE myColumn IS NOT NULL
)
SELECT k
FROM row_keys
CROSS JOIN UNNEST(row_keys.key_list) AS k
GROUP BY k
ORDER BY k
以上答案在当前 (2021) 版本中效果不佳,如果 JSON 字段为空或 JSON 有空条目,则失败,聚合不好(我们试图获得结构,而不是内容),等等。
所以,这是一个基于 Felipe Hoffa 的回答的改进版本。
它是完全递归的;会检查 null 和 Array 类型;不输出数组索引(统一写作 []);被标记为确定性的(DETERMINISTIC),因此结果可以被缓存;并对结果进行分组、排序和计数。
示例输出:
key type n
"" null 213
avatar string 1046
blinking boolean 1046
created_at string 1046
deprecated_fields Array 1046
display_name string 1046
fields Array 1046
fields.[] Object 31
fields.[].name string 31
fields.[].value string 31
fields.[].verified_at null 27
fields.[].verified_at string 4
friends_count number 1046
注:
- 空字符串键表示该字段本身就是 null
- deprecated_fields 键在所有 JSON 样本中都是空数组:..., deprecated_fields: [], ...
- null 与其他类型一样,以字符串 "null" 返回(不是 SQL 的 NULL)
还可以进一步改进,以区分不同的数字类型(int、bigint、float、decimal)、日期、以字符串存储的数字等。不过对我的用途来说这已经足够了,而更细致的检测需要更多的处理。
只需修改最后几行中以 your- 开头的占位符:
-- jsonParsed(input): describes the structure of the JSON document `input` as
-- an array of (key, type) entries, one per node.
--   key  : dotted path of the node; array positions are written as []
--          instead of their index.
--   type : 'null', 'Array', 'Object', or the JavaScript typeof of a scalar
--          ('string', 'number', 'boolean').
-- A NULL input or a JSON null yields the single entry ('', 'null'); any other
-- top-level scalar yields an empty array; invalid JSON yields SQL NULL.
CREATE TEMP FUNCTION jsonParsed(input STRING)
RETURNS Array<Struct<key STRING, type STRING>>
DETERMINISTIC LANGUAGE js AS
"""
  // Type label derived from the value itself, never from v.constructor,
  // because the JSON being inspected may contain its own 'constructor' key.
  function typeName(v) {
    if (v === null || v === undefined) { return 'null'; }
    if (Array.isArray(v)) { return 'Array'; }
    if (typeof v === 'object') { return 'Object'; }
    return typeof v;
  }

  function processKey(node, parent) {
    var entries = [];
    var prefix = parent === '' ? '' : parent + '.';
    if (node == null) {
      entries.push({key: prefix, type: 'null'});
      return entries;
    }
    // Scalars have no children; do not walk a string character by character.
    if (typeof node !== 'object') {
      return entries;
    }
    var inArray = Array.isArray(node);
    Object.keys(node).forEach(function(key) {
      var v = node[key];
      var path = prefix + (inArray ? '[]' : key);
      entries.push({key: path, type: typeName(v)});
      if (v !== null && typeof v === 'object') {
        entries = entries.concat(processKey(v, path));
      }
    });
    return entries;
  }

  var doc;
  try {
    doc = JSON.parse(input);
  } catch (e) {
    // Invalid JSON yields SQL NULL rather than failing the query.
    return null;
  }
  return processKey(doc, '');
""";
-- Counts how many times each (key, type) pair occurs across all rows.
-- Replace the your-* placeholders with the real column and table.
WITH parsed AS (
  SELECT jsonParsed(your-json-field) AS entries
  FROM `your-project-id.your-database-id.your-table-id`
)
SELECT
  key,
  type,
  COUNT(*) AS n
FROM parsed AS p
CROSS JOIN UNNEST(p.entries) AS entry
GROUP BY key, type
ORDER BY key ASC;
BigQuery 可以在实时交互式查询中解析 JSON:只需将 JSON 编码的对象存储为字符串,然后使用 JSON_EXTRACT_SCALAR 等函数实时查询。
但是,我找不到发现这些对象中所有键(属性)的方法。
我可以为此使用 UDF 吗?
如何在 BigQuery 中使用 JavaScript UDF 提取所有 JSON 对象键:
// Lists the top-level keys of each JSON document together with the row's
// `type` column (BigQuery legacy SQL, inline JavaScript UDF via js()).
// Rows whose `json` is not valid JSON, or whose JSON is not an object/array,
// emit nothing instead of aborting the whole query.
SELECT type, key
FROM js(
  (SELECT json, type FROM [fh-bigquery:openlibrary.ol_dump_20151231]
  ),
  // Input columns.
  json, type,
  // Output schema.
  "[{name: 'key', type:'string'},
    {name: 'type', type:'string'}]",
  // The function.
  "function(r, emit) {
    var doc;
    try {
      doc = JSON.parse(r.json);
    } catch (e) {
      // Unparsable row: nothing to report for it.
      return;
    }
    // Object.keys is only meaningful for objects/arrays; skip null and scalars.
    if (doc === null || typeof doc !== 'object') {
      return;
    }
    Object.keys(doc).forEach(function(entry) {
      emit({key: entry, type: r.type});
    });
  }"
)
LIMIT 100
分组计数:
找到所有可以使用的键后,您就可以在正常的 SQL 查询中使用 JSON_EXTRACT_SCALAR:
现在您知道了键,您可以提取某个类型的所有已知信息:
# Projects the known attributes of every '/type/edition' row out of the raw
# `json` column, one output column per attribute.
# Some attributes are read with JSON_EXTRACT_SCALAR and the rest with
# JSON_EXTRACT; see the BigQuery JSON functions reference for the difference.
SELECT
  JSON_EXTRACT_SCALAR(json, '$.key') AS key,
  JSON_EXTRACT_SCALAR(json, '$.type.key') AS type,
  JSON_EXTRACT(json, '$.revision') AS revision,
  JSON_EXTRACT_SCALAR(json, '$.last_modified.value') AS last_modified,
  JSON_EXTRACT_SCALAR(json, '$.title') AS title,
  JSON_EXTRACT_SCALAR(json, '$.publish_date') AS publish_date,
  JSON_EXTRACT(json, '$.publishers') AS publishers,
  JSON_EXTRACT(json, '$.latest_revision') AS latest_revision,
  JSON_EXTRACT(json, '$.languages') AS languages,
  JSON_EXTRACT(json, '$.authors') AS authors,
  JSON_EXTRACT(json, '$.works') AS works,
  JSON_EXTRACT(json, '$.number_of_pages') AS number_of_pages,
  JSON_EXTRACT(json, '$.publish_places') AS publish_places,
  JSON_EXTRACT(json, '$.publish_country') AS publish_country,
  JSON_EXTRACT(json, '$.subjects') AS subjects,
  JSON_EXTRACT_SCALAR(json, '$.created.value') AS created,
  JSON_EXTRACT_SCALAR(json, '$.pagination') AS pagination,
  JSON_EXTRACT_SCALAR(json, '$.by_statement') AS by_statement,
  JSON_EXTRACT(json, '$.isbn_10') AS isbn_10,
  JSON_EXTRACT_SCALAR(json, '$.isbn_10[0]') AS isbn_10_0,
  JSON_EXTRACT(json, '$.notes') AS notes,
  JSON_EXTRACT(json, '$.lc_classifications') AS lc_classifications,
  JSON_EXTRACT_SCALAR(json, '$.subtitle') AS subtitle,
  JSON_EXTRACT(json, '$.lccn') AS lccn,
  JSON_EXTRACT(json, '$.identifiers') AS identifiers,
  JSON_EXTRACT(json, '$.contributions') AS contributions,
  JSON_EXTRACT(json, '$.isbn_13') AS isbn_13,
  JSON_EXTRACT_SCALAR(json, '$.isbn_13[0]') AS isbn_13_0,
  JSON_EXTRACT(json, '$.physical_format') AS physical_format,
  JSON_EXTRACT(json, '$.oclc_numbers') AS oclc_numbers,
  JSON_EXTRACT(json, '$.series') AS series,
  JSON_EXTRACT(json, '$.source_records') AS source_records,
  JSON_EXTRACT(json, '$.covers') AS covers,
  JSON_EXTRACT(json, '$.dewey_decimal_class') AS dewey_decimal_class,
  JSON_EXTRACT_SCALAR(json, '$.edition_name') AS edition_name,
  # ... (further attributes go here)
FROM [fh-bigquery:openlibrary.ol_dump_20151231]
WHERE type='/type/edition'
LIMIT 10
(示例数据取自 Open Library 数据转储 https://openlibrary.org/developers/dumps ,本回答基于 reddit 上的一次讨论)
以下版本修复了原始答案中的一些 "issues",例如:
1. 只发出了一级键
2. 必须手动编写并运行最终查询,才能根据发现的键提取信息
// Counts how often each (type, key, value) triple occurs among the
// '/type/edition' rows (BigQuery legacy SQL, inline JavaScript UDF via JS()).
// Nested objects are flattened into dotted paths such as created.value;
// arrays that hold objects are walked by index (languages.0.key), while
// arrays holding only scalars are emitted as one comma-joined value.
SELECT type, key, value, COUNT(1) AS weight
FROM JS(
  (SELECT json, type
   FROM [fh-bigquery:openlibrary.ol_dump_20151231@0]
   WHERE type = '/type/edition'
  ),
  // Input columns.
  json, type,
  // Output schema.
  "[{name: 'type', type:'string'},
    {name: 'key', type:'string'},
    {name: 'value', type:'string'}]",
  // The function.
  "function(r, emit) {
    // True for objects and arrays; false for null and scalars.
    function isContainer(v) {
      return v !== null && typeof v === 'object';
    }
    // Objects are always walked; arrays only when they hold an object/array.
    function shouldDescend(v) {
      if (!isContainer(v)) { return false; }
      return !Array.isArray(v) || v.some(isContainer);
    }
    function processKey(node, prefix) {
      Object.keys(node).forEach(function(key) {
        var child = node[key];
        var path = prefix === '' ? key : prefix + '.' + key;
        if (shouldDescend(child)) {
          processKey(child, path);
        } else {
          // String() also copes with JSON null, which has no toString().
          emit({type: r.type, key: path, value: String(child)});
        }
      });
    }
    var doc;
    try {
      doc = JSON.parse(r.json);
    } catch (e) {
      // Unparsable row: nothing to report for it.
      return;
    }
    if (isContainer(doc)) {
      processKey(doc, '');
    }
  }"
)
GROUP EACH BY type, key, value
ORDER BY weight DESC
LIMIT 1000
结果如下
Row type key value weight
1 /type/edition type.key /type/edition 25140209
2 /type/edition last_modified.type /type/datetime 25140209
3 /type/edition created.type /type/datetime 17092292
4 /type/edition languages.0.key /languages/eng 14514830
5 /type/edition notes.type /type/text 11681480
6 /type/edition revision 2 8714084
7 /type/edition latest_revision 2 8704217
8 /type/edition revision 3 5041680
9 /type/edition latest_revision 3 5040634
10 /type/edition created.value 2008-04-01T03:28:50.625462 3579095
11 /type/edition revision 1 3396868
12 /type/edition physical_format Paperback 3181270
13 /type/edition revision 4 3053266
14 /type/edition latest_revision 4 3053197
15 /type/edition revision 5 2076094
16 /type/edition latest_revision 5 2076072
17 /type/edition publish_country nyu 1727347
18 /type/edition created.value 2008-04-30T09:38:13.731961 1681227
19 /type/edition publish_country enk 1627969
20 /type/edition publish_places London 1613755
21 /type/edition physical_format Hardcover 1495864
22 /type/edition publish_places New York 1467779
23 /type/edition revision 6 1437467
24 /type/edition latest_revision 6 1437463
25 /type/edition publish_country xxk 1407624
这是我想出的方案(专门针对 Standard SQL)。我不确定把结果累积到列表中是否是最好的方法……另外,我针对自己的情况做了简化:我只关心键。
-- Foo(infoo): returns the dotted path of every leaf in the JSON document
-- `infoo` (for example a, b.c, e.0.f).
-- Returns NULL when `infoo` is NULL, is the JSON literal null, or is not valid
-- JSON; returns an empty array for any other top-level scalar.
-- Arrays holding only scalars count as a single leaf; arrays that hold
-- objects are walked by index.
CREATE TEMPORARY FUNCTION Foo(infoo STRING)
RETURNS Array<String>
LANGUAGE js AS """
  var paths = [];

  // True for objects and arrays; false for null and scalars.
  function isContainer(v) {
    return v !== null && typeof v === 'object';
  }

  // Objects are always walked; arrays only when they hold an object/array.
  function shouldDescend(v) {
    if (!isContainer(v)) { return false; }
    return !Array.isArray(v) || v.some(isContainer);
  }

  function processKey(node, prefix) {
    Object.keys(node).forEach(function(key) {
      var child = node[key];
      var path = prefix === '' ? key : prefix + '.' + key;
      if (shouldDescend(child)) {
        processKey(child, path);
      } else {
        // Leaf (including JSON null): record the path only.
        paths.push(path);
      }
    });
  }

  try {
    var doc = JSON.parse(infoo);
    if (doc === null) { return null; }
    if (isContainer(doc)) { processKey(doc, ''); }
    return paths;
  } catch (e) {
    // Invalid JSON yields SQL NULL rather than failing the query.
    return null;
  }
"""
OPTIONS ();
-- Distinct key paths over all rows: each row's array returned by Foo is
-- flattened with UNNEST and duplicates are removed.
WITH row_keys AS (
  SELECT Foo(jsonfield) AS key_paths
  FROM clickstream.clikcs
)
SELECT DISTINCT arr_item
FROM row_keys
CROSS JOIN UNNEST(row_keys.key_paths) AS arr_item
下面是使用 Standard SQL 的一种写法:
-- jsonObjectKeys(input): returns the top-level keys of the JSON document
-- `input` (array indices when the document is a JSON array).
-- Returns NULL when `input` is not valid JSON, and an empty array when it
-- parses to null or to a scalar.
CREATE TEMP FUNCTION jsonObjectKeys(input STRING)
RETURNS Array<String>
LANGUAGE js AS """
  var parsed;
  try {
    parsed = JSON.parse(input);
  } catch (e) {
    // Invalid JSON yields SQL NULL rather than failing the query.
    return null;
  }
  // null and scalars have no keys; Object.keys would throw on null and
  // would list character indices for a string.
  if (parsed === null || typeof parsed !== 'object') {
    return [];
  }
  return Object.keys(parsed);
""";
-- Sorted list of the distinct top-level keys seen in myColumn.
-- NULL cells are filtered out before the UDF is called.
WITH row_keys AS (
  SELECT jsonObjectKeys(myColumn) AS key_list
  FROM myProject.myTable
  WHERE myColumn IS NOT NULL
)
SELECT k
FROM row_keys
CROSS JOIN UNNEST(row_keys.key_list) AS k
GROUP BY k
ORDER BY k
以上答案在当前 (2021) 版本中效果不佳,如果 JSON 字段为空或 JSON 有空条目,则失败,聚合不好(我们试图获得结构,而不是内容),等等。
所以,这是一个基于 Felipe Hoffa 的回答的改进版本。
它是完全递归的;会检查 null 和 Array 类型;不输出数组索引(统一写作 []);被标记为确定性的(DETERMINISTIC),因此结果可以被缓存;并对结果进行分组、排序和计数。
示例输出:
key type n
"" null 213
avatar string 1046
blinking boolean 1046
created_at string 1046
deprecated_fields Array 1046
display_name string 1046
fields Array 1046
fields.[] Object 31
fields.[].name string 31
fields.[].value string 31
fields.[].verified_at null 27
fields.[].verified_at string 4
friends_count number 1046
注:
- 空字符串键表示该字段本身就是 null
- deprecated_fields 键在所有 JSON 样本中都是空数组:..., deprecated_fields: [], ...
- null 与其他类型一样,以字符串 "null" 返回(不是 SQL 的 NULL)
还可以进一步改进,以区分不同的数字类型(int、bigint、float、decimal)、日期、以字符串存储的数字等。不过对我的用途来说这已经足够了,而更细致的检测需要更多的处理。
只需修改最后几行中以 your- 开头的占位符:
-- jsonParsed(input): describes the structure of the JSON document `input` as
-- an array of (key, type) entries, one per node.
--   key  : dotted path of the node; array positions are written as []
--          instead of their index.
--   type : 'null', 'Array', 'Object', or the JavaScript typeof of a scalar
--          ('string', 'number', 'boolean').
-- A NULL input or a JSON null yields the single entry ('', 'null'); any other
-- top-level scalar yields an empty array; invalid JSON yields SQL NULL.
CREATE TEMP FUNCTION jsonParsed(input STRING)
RETURNS Array<Struct<key STRING, type STRING>>
DETERMINISTIC LANGUAGE js AS
"""
  // Type label derived from the value itself, never from v.constructor,
  // because the JSON being inspected may contain its own 'constructor' key.
  function typeName(v) {
    if (v === null || v === undefined) { return 'null'; }
    if (Array.isArray(v)) { return 'Array'; }
    if (typeof v === 'object') { return 'Object'; }
    return typeof v;
  }

  function processKey(node, parent) {
    var entries = [];
    var prefix = parent === '' ? '' : parent + '.';
    if (node == null) {
      entries.push({key: prefix, type: 'null'});
      return entries;
    }
    // Scalars have no children; do not walk a string character by character.
    if (typeof node !== 'object') {
      return entries;
    }
    var inArray = Array.isArray(node);
    Object.keys(node).forEach(function(key) {
      var v = node[key];
      var path = prefix + (inArray ? '[]' : key);
      entries.push({key: path, type: typeName(v)});
      if (v !== null && typeof v === 'object') {
        entries = entries.concat(processKey(v, path));
      }
    });
    return entries;
  }

  var doc;
  try {
    doc = JSON.parse(input);
  } catch (e) {
    // Invalid JSON yields SQL NULL rather than failing the query.
    return null;
  }
  return processKey(doc, '');
""";
-- Counts how many times each (key, type) pair occurs across all rows.
-- Replace the your-* placeholders with the real column and table.
WITH parsed AS (
  SELECT jsonParsed(your-json-field) AS entries
  FROM `your-project-id.your-database-id.your-table-id`
)
SELECT
  key,
  type,
  COUNT(*) AS n
FROM parsed AS p
CROSS JOIN UNNEST(p.entries) AS entry
GROUP BY key, type
ORDER BY key ASC;