如何对以字符串形式存储的 BigQuery 字段执行 UNNEST(展开)?
How to unnest a BigQuery field that is stored as a string?
我正在尝试对一个字段执行 UNNEST(展开)操作,但我的查询有问题。
我的示例数据表如下:
'1234', '{ "id" : "123" , "items" : [ { "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }}] }'
表中有两个字段:row_id 和 parts。parts 的内容是一个字典(JSON)对象,其中包含列表项(categories,即类别),但该字段的数据类型是字符串。我希望输出中每个类别各占一行。
这是我尝试过的查询,但它没有返回任何结果。
#standardSQL
-- Attempted query from the question; as reported there, it returns no rows.
with t as (
select "1234" as row_id, '{ "id" : "123" , "items" : [ { "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }}] }' as parts
)
select row_id, _categories
from t,
-- The pattern requires a double quote immediately after "categories": but in
-- the sample data the value following that key is an array, which opens
-- with [ rather than ". The pattern therefore cannot match, so there is
-- nothing for UNNEST to expand.
UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT(parts, '$.items'), r'"categories":"(.+?)"')) _categories
预期结果:
row_id, _categories
1234, cat1
1234, cat2
1234, cat3
以下查询适用于 BigQuery 标准 SQL:
#standardSQL
-- Emits one row per category taken from the first "categories" array found
-- in the items of `parts` (a STRING column holding a JSON document).
WITH t AS (
  SELECT
    "1234" AS row_id,
    '{ "id" : "123" , "items" : [ { "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }}] }' AS parts
)
SELECT
  row_id,
  -- Each piece produced by SPLIT still carries its JSON double quotes.
  REPLACE(_categories, '"', '') AS _categories
FROM t
CROSS JOIN UNNEST(
  SPLIT(
    -- NOTE(review): the pattern has no spaces around ":" although the raw
    -- literal does; it presumably relies on the text returned by
    -- JSON_EXTRACT being re-serialized without that whitespace - confirm.
    REGEXP_EXTRACT(
      JSON_EXTRACT(parts, '$.items'),
      r'"categories":\[(.+?)]'
    )
  )
) AS _categories
它会产生预期结果:
Row row_id _categories
1 1234 cat1
2 1234 cat2
3 1234 cat3
更新
上面的解决方案主要着眼于修复提取时使用的正则表达式,但没有处理包含多个产品的更一般情况。下面的解决方案可以处理这种更一般的情况:
#standardSQL
-- Emits one row per (row_id, category) for every product listed in `parts`,
-- a STRING column holding a JSON document.
-- The document is walked with JSON functions rather than a regular
-- expression, so the query does not depend on the exact whitespace around
-- ":" and "[" in the stored text, and the returned values carry no stray
-- leading spaces or quotes.
WITH t AS (
  SELECT "1234" AS row_id, '''{ "id" : "123" , "items" : [
{ "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }},
{ "quantity" : 2 , "product" : { "id" : "p2" , "categories" : [ "cat4","cat5","cat6"] }}
] }''' AS parts
)
SELECT
  row_id,
  -- Array elements are still JSON text (e.g. "cat1" including the quotes);
  -- JSON_EXTRACT_SCALAR with the root path returns the unquoted value.
  JSON_EXTRACT_SCALAR(category, '$') AS category
FROM t
-- One element per entry of the top-level "items" array.
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(parts, '$.items')) AS item WITH OFFSET AS item_pos
-- One element per category of that item's product.
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(item, '$.product.categories')) AS category WITH OFFSET AS category_pos
-- Row order is otherwise unspecified; keep categories in document order.
ORDER BY row_id, item_pos, category_pos
结果
Row row_id category
1 1234 cat1
2 1234 cat2
3 1234 cat3
4 1234 cat4
5 1234 cat5
6 1234 cat6
我正在尝试对一个字段执行 UNNEST(展开)操作,但我的查询有问题。
我的示例数据表如下:
'1234', '{ "id" : "123" , "items" : [ { "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }}] }'
表中有两个字段:row_id 和 parts。parts 的内容是一个字典(JSON)对象,其中包含列表项(categories,即类别),但该字段的数据类型是字符串。我希望输出中每个类别各占一行。
这是我尝试过的查询,但它没有返回任何结果。
#standardSQL
-- Attempted query from the question; as reported there, it returns no rows.
with t as (
select "1234" as row_id, '{ "id" : "123" , "items" : [ { "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }}] }' as parts
)
select row_id, _categories
from t,
-- The pattern requires a double quote immediately after "categories": but in
-- the sample data the value following that key is an array, which opens
-- with [ rather than ". The pattern therefore cannot match, so there is
-- nothing for UNNEST to expand.
UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT(parts, '$.items'), r'"categories":"(.+?)"')) _categories
预期结果:
row_id, _categories
1234, cat1
1234, cat2
1234, cat3
以下查询适用于 BigQuery 标准 SQL:
#standardSQL
-- Emits one row per category taken from the first "categories" array found
-- in the items of `parts` (a STRING column holding a JSON document).
WITH t AS (
  SELECT
    "1234" AS row_id,
    '{ "id" : "123" , "items" : [ { "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }}] }' AS parts
)
SELECT
  row_id,
  -- Each piece produced by SPLIT still carries its JSON double quotes.
  REPLACE(_categories, '"', '') AS _categories
FROM t
CROSS JOIN UNNEST(
  SPLIT(
    -- NOTE(review): the pattern has no spaces around ":" although the raw
    -- literal does; it presumably relies on the text returned by
    -- JSON_EXTRACT being re-serialized without that whitespace - confirm.
    REGEXP_EXTRACT(
      JSON_EXTRACT(parts, '$.items'),
      r'"categories":\[(.+?)]'
    )
  )
) AS _categories
它会产生预期结果:
Row row_id _categories
1 1234 cat1
2 1234 cat2
3 1234 cat3
更新
上面的解决方案主要着眼于修复提取时使用的正则表达式,但没有处理包含多个产品的更一般情况。下面的解决方案可以处理这种更一般的情况:
#standardSQL
-- Emits one row per (row_id, category) for every product listed in `parts`,
-- a STRING column holding a JSON document.
-- The document is walked with JSON functions rather than a regular
-- expression, so the query does not depend on the exact whitespace around
-- ":" and "[" in the stored text, and the returned values carry no stray
-- leading spaces or quotes.
WITH t AS (
  SELECT "1234" AS row_id, '''{ "id" : "123" , "items" : [
{ "quantity" : 1 , "product" : { "id" : "p1" , "categories" : [ "cat1","cat2","cat3"] }},
{ "quantity" : 2 , "product" : { "id" : "p2" , "categories" : [ "cat4","cat5","cat6"] }}
] }''' AS parts
)
SELECT
  row_id,
  -- Array elements are still JSON text (e.g. "cat1" including the quotes);
  -- JSON_EXTRACT_SCALAR with the root path returns the unquoted value.
  JSON_EXTRACT_SCALAR(category, '$') AS category
FROM t
-- One element per entry of the top-level "items" array.
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(parts, '$.items')) AS item WITH OFFSET AS item_pos
-- One element per category of that item's product.
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(item, '$.product.categories')) AS category WITH OFFSET AS category_pos
-- Row order is otherwise unspecified; keep categories in document order.
ORDER BY row_id, item_pos, category_pos
结果
Row row_id category
1 1234 cat1
2 1234 cat2
3 1234 cat3
4 1234 cat4
5 1234 cat5
6 1234 cat6