为什么 Snowflake 在转换为展平列表时会更改 JSON 值的顺序?
Why does Snowflake change the order of JSON values when converting them into a flattened list?
我有一些 JSON 对象存储在表中,我正在尝试编写查询以获取 JSON 中的第一个元素。
复现脚本:
-- Repro fixture: one row per id, with the raw JSON document stored as text in val.
create table staging.par.test_json (id int, val varchar(2000));

-- Row 1: compact single-line JSON whose "list" array holds nine elements.
-- The explicit column list keeps the insert valid if columns are added or reordered.
insert into staging.par.test_json (id, val) values (1, '{"list":[{"element":"Plumber"},{"element":"Craft"},{"element":"Plumbing"},{"element":"Electrics"},{"element":"Electrical"},{"element":"Tradesperson"},{"element":"Home services"},{"element":"Housekeepings"},{"element":"Electrical Goods"}]}');

-- Row 2: multi-line JSON whose "list" array holds six elements.
insert into staging.par.test_json (id, val) values (2,'
{
"list": [
{
"element": "Wholesale jeweler"
},
{
"element": "Fashion"
},
{
"element": "Industry"
},
{
"element": "Jewelry store"
},
{
"element": "Business service"
},
{
"element": "Corporate office"
}
]
}');
-- Repro query: for each id, collect the distinct upper-cased categories from the
-- JSON "list" array and return element 0 of that array as sds_primary_category.
-- Kept exactly as posted, because it demonstrates the unstable ordering.
with cte_get_cats AS
(
select id,
val as category_list
from staging.par.test_json
),
-- Parse the JSON text so that its "list" array can be flattened below.
cats_parse AS
(
select id,
parse_json(category_list) as c
from cte_get_cats
),
-- One row per array element; INDEX is the element position produced by flatten.
-- Note: the alias c here reuses the name of the parsed JSON column from cats_parse.
distinct_cats as
(
select id,
INDEX,
UPPER(cast(value:element AS varchar)) As c
from
cats_parse,
LATERAL flatten(INPUT => c:"list")
-- This sort is not a guarantee about the order in which array_agg
-- receives its rows in the next CTE.
order by 1,2
) ,
cat_array AS
(
SELECT
id,
-- No WITHIN GROUP (ORDER BY ...) clause, so the element order of the
-- resulting array is unspecified; this is the source of the instability.
array_agg(DISTINCT c) AS sds_categories
FROM
distinct_cats
GROUP BY 1
),
-- First element of the (unordered) array, cast to a string.
sds_cats AS
(
select id,
cast(sds_categories[0] AS varchar) as sds_primary_category
from cat_array
)
select * from sds_cats;
值:类别
{"list":[{"element":"Plumber"},{"element":"Craft"},{"element":"Plumbing"},{"element":"Electrics"},{"element":"Electrical"},{"element":"Tradesperson"},{"element":"Home services"},{"element":"Housekeepings"},{"element":"Electrical Goods"}]}
将其展平为列表给了我
["Plumber","Craft","Plumbing","Electrics","Electrical","Tradesperson","Home services","Housekeepings","Electrical Goods"]
问题:
这个顺序并不总是相同的。Snowflake 似乎会改变顺序,有时会按字母顺序重新排列。
我怎样才能让顺序保持固定?我不想让顺序发生变化。
问题在于你使用 ARRAY_AGG 的方式:
array_agg(DISTINCT c) AS sds_categories
这样写并没有告诉 Snowflake 数组中的内容应该如何排序。您不应该假设数组会按照输入记录的顺序创建——可能恰好如此,但并无保证。所以你可能会想这样写:
array_agg(DISTINCT c) within group (order by index) AS sds_categories
但是那是行不通的,因为使用 DISTINCT c 时,每个 c 对应的 index 值是不确定的。也许你并不需要 DISTINCT,那么这样写就可以了:
array_agg(c) within group (order by index) AS sds_categories
如果您确实需要 DISTINCT,就需要以某种方式将 index 与去重后的 c 值关联起来。一种方法是对输入中的 index 使用 MIN 函数。下面是一个完整的查询:
-- Returns, for each id, the first distinct category (upper-cased), where
-- "first" means earliest position in the JSON "list" array.
with cte_get_cats as (
    -- Raw JSON text per id.
    select
        id,
        val as category_list
    from staging.par.test_json
),
cats_parse as (
    -- Parsed JSON document. Named category_json (not c) so it cannot be
    -- confused with the category string column c produced downstream.
    select
        id,
        parse_json(category_list) as category_json
    from cte_get_cats
),
distinct_cats as (
    -- One row per (id, category). MIN(index) keeps the position of the first
    -- occurrence, which gives every distinct value a well-defined sort key.
    select
        p.id,
        min(f.index) as index,
        upper(cast(f.value:element as varchar)) as c
    from cats_parse as p,
        lateral flatten(input => p.category_json:"list") as f
    -- Group by the expression itself rather than by position or alias, so the
    -- grouping key does not depend on select-list order or name resolution.
    group by
        p.id,
        upper(cast(f.value:element as varchar))
),
cat_array as (
    -- WITHIN GROUP makes the array order explicit instead of relying on the
    -- order of the input rows, which ARRAY_AGG does not guarantee.
    select
        id,
        array_agg(c) within group (order by index) as sds_categories
    from distinct_cats
    group by id
),
sds_cats as (
    -- Element 0 is the category that appeared first in the source JSON.
    select
        id,
        cast(sds_categories[0] as varchar) as sds_primary_category
    from cat_array
)
-- Select from sds_cats (not cat_array) so the query actually returns the
-- first element that the pipeline above was built to compute.
select
    id,
    sds_primary_category
from sds_cats;
我有一些 JSON 对象存储在表中,我正在尝试编写查询以获取 JSON 中的第一个元素。
复现脚本:
-- Repro fixture: one row per id, with the raw JSON document stored as text in val.
create table staging.par.test_json (id int, val varchar(2000));

-- Row 1: compact single-line JSON whose "list" array holds nine elements.
-- The explicit column list keeps the insert valid if columns are added or reordered.
insert into staging.par.test_json (id, val) values (1, '{"list":[{"element":"Plumber"},{"element":"Craft"},{"element":"Plumbing"},{"element":"Electrics"},{"element":"Electrical"},{"element":"Tradesperson"},{"element":"Home services"},{"element":"Housekeepings"},{"element":"Electrical Goods"}]}');

-- Row 2: multi-line JSON whose "list" array holds six elements.
insert into staging.par.test_json (id, val) values (2,'
{
"list": [
{
"element": "Wholesale jeweler"
},
{
"element": "Fashion"
},
{
"element": "Industry"
},
{
"element": "Jewelry store"
},
{
"element": "Business service"
},
{
"element": "Corporate office"
}
]
}');
-- Repro query: for each id, collect the distinct upper-cased categories from the
-- JSON "list" array and return element 0 of that array as sds_primary_category.
-- Kept exactly as posted, because it demonstrates the unstable ordering.
with cte_get_cats AS
(
select id,
val as category_list
from staging.par.test_json
),
-- Parse the JSON text so that its "list" array can be flattened below.
cats_parse AS
(
select id,
parse_json(category_list) as c
from cte_get_cats
),
-- One row per array element; INDEX is the element position produced by flatten.
-- Note: the alias c here reuses the name of the parsed JSON column from cats_parse.
distinct_cats as
(
select id,
INDEX,
UPPER(cast(value:element AS varchar)) As c
from
cats_parse,
LATERAL flatten(INPUT => c:"list")
-- This sort is not a guarantee about the order in which array_agg
-- receives its rows in the next CTE.
order by 1,2
) ,
cat_array AS
(
SELECT
id,
-- No WITHIN GROUP (ORDER BY ...) clause, so the element order of the
-- resulting array is unspecified; this is the source of the instability.
array_agg(DISTINCT c) AS sds_categories
FROM
distinct_cats
GROUP BY 1
),
-- First element of the (unordered) array, cast to a string.
sds_cats AS
(
select id,
cast(sds_categories[0] AS varchar) as sds_primary_category
from cat_array
)
select * from sds_cats;
值:类别
{"list":[{"element":"Plumber"},{"element":"Craft"},{"element":"Plumbing"},{"element":"Electrics"},{"element":"Electrical"},{"element":"Tradesperson"},{"element":"Home services"},{"element":"Housekeepings"},{"element":"Electrical Goods"}]}
将其展平为列表给了我
["Plumber","Craft","Plumbing","Electrics","Electrical","Tradesperson","Home services","Housekeepings","Electrical Goods"]
问题: 这个顺序并不总是相同的。Snowflake 似乎会改变顺序,有时会按字母顺序重新排列。我怎样才能让顺序保持固定?我不想让顺序发生变化。
问题在于你使用 ARRAY_AGG 的方式:
array_agg(DISTINCT c) AS sds_categories
这样写并没有告诉 Snowflake 数组中的内容应该如何排序。您不应该假设数组会按照输入记录的顺序创建——可能恰好如此,但并无保证。所以你可能会想这样写:
array_agg(DISTINCT c) within group (order by index) AS sds_categories
但是那是行不通的,因为使用 DISTINCT c 时,每个 c 对应的 index 值是不确定的。也许你并不需要 DISTINCT,那么这样写就可以了:
array_agg(c) within group (order by index) AS sds_categories
如果您确实需要 DISTINCT,就需要以某种方式将 index 与去重后的 c 值关联起来。一种方法是对输入中的 index 使用 MIN 函数。下面是一个完整的查询:
-- Returns, for each id, the first distinct category (upper-cased), where
-- "first" means earliest position in the JSON "list" array.
with cte_get_cats as (
    -- Raw JSON text per id.
    select
        id,
        val as category_list
    from staging.par.test_json
),
cats_parse as (
    -- Parsed JSON document. Named category_json (not c) so it cannot be
    -- confused with the category string column c produced downstream.
    select
        id,
        parse_json(category_list) as category_json
    from cte_get_cats
),
distinct_cats as (
    -- One row per (id, category). MIN(index) keeps the position of the first
    -- occurrence, which gives every distinct value a well-defined sort key.
    select
        p.id,
        min(f.index) as index,
        upper(cast(f.value:element as varchar)) as c
    from cats_parse as p,
        lateral flatten(input => p.category_json:"list") as f
    -- Group by the expression itself rather than by position or alias, so the
    -- grouping key does not depend on select-list order or name resolution.
    group by
        p.id,
        upper(cast(f.value:element as varchar))
),
cat_array as (
    -- WITHIN GROUP makes the array order explicit instead of relying on the
    -- order of the input rows, which ARRAY_AGG does not guarantee.
    select
        id,
        array_agg(c) within group (order by index) as sds_categories
    from distinct_cats
    group by id
),
sds_cats as (
    -- Element 0 is the category that appeared first in the source JSON.
    select
        id,
        cast(sds_categories[0] as varchar) as sds_primary_category
    from cat_array
)
-- Select from sds_cats (not cat_array) so the query actually returns the
-- first element that the pipeline above was built to compute.
select
    id,
    sds_primary_category
from sds_cats;