如何在 Snowflake 中解析 JSON,并根据 JSON 中的特定字段获取计数
How to parse json in snowflake to get the count based on certain field in json
[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]
我在 Snowflake 中有一列存放着上述 JSON 值。我想获取某个特定类型的计数。例如,我想要 type = ENTITLEMENT 的计数,结果应该是 2。
我可以拉平这些数据。但这会为列产生大量重复数据(我有一个包含大量列的 table)。
正在寻找解析此 json 的方法。
我尝试过的东西。
- 编写一个 Java UDF 来解析其中的内容
示例:
-- Asker's attempt: a Java UDF that counts the entries of `access` whose text
-- contains the substring "ENTITLEMENT".
-- Known limitation (the reason for the question): the substring match hits the
-- word anywhere inside an entry, not only in its "type" field.
CREATE OR REPLACE FUNCTION IDN_DATA.entitlement_counter(access array)
RETURNS NUMBER
LANGUAGE java handler = 'JsonCounterWithFilter.entitlement_counter'
as
$$
public class JsonCounterWithFilter {
    // Returns the number of entries in `access` that contain "ENTITLEMENT".
    public int entitlement_counter(String[] access) {
        int counter = 0;
        for (String acc : access) {
            // Plain substring test: it does not look at the JSON structure.
            if (acc.contains("ENTITLEMENT")) {
                counter++;
            }
        }
        return counter;
    }
}
$$;
这行不通,因为它会在元素的任何位置查找 ENTITLEMENT 这个词,而不仅仅是在 type 字段中。而且我无法使用 Jackson 库来解析这个 JSON。
一个简短的 JS UDF 可以做到这一点:
-- Counts the elements of array A whose field named T loosely equals V.
--   A: array of objects to scan
--   T: name of the field to inspect on each element
--   V: value the field is compared against (JavaScript `==`)
-- Returns the count as a STRING, or NULL when A itself is NULL.
create or replace function count_object_in_array(A array, T string, V string)
returns string
language javascript
as
$$
    // A SQL NULL argument arrives as undefined; calling reduce on it would throw.
    if (A == null) {
        return undefined; // maps back to SQL NULL
    }
    // Skip null elements so that x[T] cannot throw a TypeError.
    return A.reduce((count, x) => count + (x != null && x[T] == V ? 1 : 0), 0);
$$;
例如:
-- Demo fixture: a one-row temporary table whose column A holds the sample array.
CREATE OR REPLACE TEMPORARY TABLE stst AS
SELECT PARSE_JSON('[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]') AS a;

-- Count the elements whose "type" is 'ENTITLEMENT' using the JS UDF.
SELECT
    *,
    count_object_in_array(a, 'type', 'ENTITLEMENT')
FROM stst;
这是一个纯粹的 SQL 方法:
-- Demo fixture: temporary table T1 with one column, COL1, holding the sample array.
CREATE OR REPLACE TEMPORARY TABLE t1 AS
SELECT PARSE_JSON($$
[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]
$$) AS col1;

-- One output row per distinct "type" value with its number of occurrences.
SELECT
    value:type::string AS type,
    COUNT(*) AS ct
FROM t1,
    TABLE(FLATTEN(input => t1.col1))
GROUP BY type;
横向展平不应产生重复(至少不会基于您的输入)。您可以在本机 json 命令中进行聚合。看看这是否适用于您的数据
-- Inline the sample array in a CTE, explode it, and count elements per "type".
with json_data as (
    select parse_json('[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]') as json
)
select
    value:type::string as type,
    count(*)
from json_data,
    lateral flatten(input => json)
group by type;
结果:
TYPE | COUNT(*) |
---|---|
ENTITLEMENT | 2 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
因此,如果我们拥有比单个演示行“更多”的数据:
-- Sample input: three rows (ids 'a', 'b', 'c'); column2 of each row is a JSON
-- array literal that parse_json turns into the `json` column.
-- This is only the CTE part of a statement; a SELECT must follow it.
with data as (
select column1 as id, parse_json(column2) as json from values
('a', '[ {"type":"ENTITLEMENT"},{"type":"ENTITLEMENT"},{"type":"ROLE"},{"type":"ACCESS_PROFILE"}]'),
('b', '[ {"type":"ENTITLEMENT"},{"type":"ROLE"},{"type":"ACCESS_PROFILE"}]'),
('c', '[ {"type":"ENTITLEMENT"},{"type":"ENTITLEMENT"},{"type":"ENTITLEMENT"},{"type":"ROLE"},{"type":"ACCESS_PROFILE"}]')
)
如果仍按上面的方式直接展平并只按 type 分组,所有行的值就会被“混”在一起统计:
-- Groups by type only, so the counts of every row of `data` are merged together.
select
    elem.value:type::string as type,
    count(*)
from data as d,
    lateral flatten(input => d.json) as elem
group by type;
TYPE | COUNT(*) |
---|---|
ENTITLEMENT | 6 |
ROLE | 3 |
ACCESS_PROFILE | 3 |
经典的做法是利用每行中的某个“额外”信息(例如 ID)一起分组:
-- Per-row counts: group by the row's id as well as the element type.
select
    d.id,
    elem.value:type::string as type,
    count(*) as count
from data as d,
    lateral flatten(input => d.json) as elem
group by d.id, type;
ID | TYPE | COUNT |
---|---|---|
a | ENTITLEMENT | 2 |
a | ROLE | 1 |
a | ACCESS_PROFILE | 1 |
b | ENTITLEMENT | 1 |
b | ACCESS_PROFILE | 1 |
c | ENTITLEMENT | 3 |
c | ROLE | 1 |
c | ACCESS_PROFILE | 1 |
b | ROLE | 1 |
如果您没有像 ID 这样的每行标识值,或者该值并非每行唯一,另一种方法是使用 FLATTEN 输出的 SEQ 列:
-- Per-row counts without an id column: group by FLATTEN's SEQ output column.
select
    elem.seq,
    elem.value:type::string as type,
    count(*) as count
from data as d,
    lateral flatten(input => d.json) as elem
group by elem.seq, type;
SEQ | TYPE | COUNT |
---|---|---|
1 | ENTITLEMENT | 2 |
1 | ROLE | 1 |
1 | ACCESS_PROFILE | 1 |
2 | ENTITLEMENT | 1 |
2 | ROLE | 1 |
2 | ACCESS_PROFILE | 1 |
3 | ROLE | 1 |
3 | ACCESS_PROFILE | 1 |
3 | ENTITLEMENT | 3 |
或者您也可以不在 SELECT 中输出 SEQ,而只把它放在 GROUP BY 中,但这样可能会让人困惑:“为什么结果里有重复的值?”
-- SEQ is used for grouping but not selected, so the same type can appear on
-- several output rows (one per source row).
select
    elem.value:type::string as type,
    count(*) as count
from data as d,
    lateral flatten(input => d.json) as elem
group by elem.seq, type;
TYPE | COUNT |
---|---|
ENTITLEMENT | 2 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
ENTITLEMENT | 1 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
ENTITLEMENT | 3 |
[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]
我在 Snowflake 中有一列存放着上述 JSON 值。我想获取某个特定类型的计数。例如,我想要 type = ENTITLEMENT 的计数,结果应该是 2。
我可以拉平这些数据。但这会为列产生大量重复数据(我有一个包含大量列的 table)。
正在寻找解析此 json 的方法。
我尝试过的东西。
- 编写一个 Java UDF 来解析其中的内容。示例:
-- Asker's attempt: a Java UDF that counts the entries of `access` whose text
-- contains the substring "ENTITLEMENT".
-- Known limitation (the reason for the question): the substring match hits the
-- word anywhere inside an entry, not only in its "type" field.
CREATE OR REPLACE FUNCTION IDN_DATA.entitlement_counter(access array)
RETURNS NUMBER
LANGUAGE java handler = 'JsonCounterWithFilter.entitlement_counter'
as
$$
public class JsonCounterWithFilter {
    // Returns the number of entries in `access` that contain "ENTITLEMENT".
    public int entitlement_counter(String[] access) {
        int counter = 0;
        for (String acc : access) {
            // Plain substring test: it does not look at the JSON structure.
            if (acc.contains("ENTITLEMENT")) {
                counter++;
            }
        }
        return counter;
    }
}
$$;
这行不通,因为它会在元素的任何位置查找 ENTITLEMENT 这个词,而不仅仅是在 type 字段中。而且我无法使用 Jackson 库来解析这个 JSON。
一个简短的 JS UDF 可以做到这一点:
-- Counts the elements of array A whose field named T loosely equals V.
--   A: array of objects to scan
--   T: name of the field to inspect on each element
--   V: value the field is compared against (JavaScript `==`)
-- Returns the count as a STRING, or NULL when A itself is NULL.
create or replace function count_object_in_array(A array, T string, V string)
returns string
language javascript
as
$$
    // A SQL NULL argument arrives as undefined; calling reduce on it would throw.
    if (A == null) {
        return undefined; // maps back to SQL NULL
    }
    // Skip null elements so that x[T] cannot throw a TypeError.
    return A.reduce((count, x) => count + (x != null && x[T] == V ? 1 : 0), 0);
$$;
例如:
-- Demo fixture: a one-row temporary table whose column A holds the sample array.
CREATE OR REPLACE TEMPORARY TABLE stst AS
SELECT PARSE_JSON('[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]') AS a;

-- Count the elements whose "type" is 'ENTITLEMENT' using the JS UDF.
SELECT
    *,
    count_object_in_array(a, 'type', 'ENTITLEMENT')
FROM stst;
这是一个纯粹的 SQL 方法:
-- Demo fixture: temporary table T1 with one column, COL1, holding the sample array.
CREATE OR REPLACE TEMPORARY TABLE t1 AS
SELECT PARSE_JSON($$
[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]
$$) AS col1;

-- One output row per distinct "type" value with its number of occurrences.
SELECT
    value:type::string AS type,
    COUNT(*) AS ct
FROM t1,
    TABLE(FLATTEN(input => t1.col1))
GROUP BY type;
横向展平不应产生重复(至少不会基于您的输入)。您可以在本机 json 命令中进行聚合。看看这是否适用于您的数据
-- Inline the sample array in a CTE, explode it, and count elements per "type".
with json_data as (
    select parse_json('[
{
"type": "ENTITLEMENT",
},
{
"type": "ENTITLEMENT",
},
{
"type": "ROLE"
},
{
"type": "ACCESS_PROFILE"
}
]') as json
)
select
    value:type::string as type,
    count(*)
from json_data,
    lateral flatten(input => json)
group by type;
结果:
TYPE | COUNT(*) |
---|---|
ENTITLEMENT | 2 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
因此,如果我们拥有比单个演示行“更多”的数据:
-- Sample input: three rows (ids 'a', 'b', 'c'); column2 of each row is a JSON
-- array literal that parse_json turns into the `json` column.
-- This is only the CTE part of a statement; a SELECT must follow it.
with data as (
select column1 as id, parse_json(column2) as json from values
('a', '[ {"type":"ENTITLEMENT"},{"type":"ENTITLEMENT"},{"type":"ROLE"},{"type":"ACCESS_PROFILE"}]'),
('b', '[ {"type":"ENTITLEMENT"},{"type":"ROLE"},{"type":"ACCESS_PROFILE"}]'),
('c', '[ {"type":"ENTITLEMENT"},{"type":"ENTITLEMENT"},{"type":"ENTITLEMENT"},{"type":"ROLE"},{"type":"ACCESS_PROFILE"}]')
)
如果仍按上面的方式直接展平并只按 type 分组,所有行的值就会被“混”在一起统计:
-- Groups by type only, so the counts of every row of `data` are merged together.
select
    elem.value:type::string as type,
    count(*)
from data as d,
    lateral flatten(input => d.json) as elem
group by type;
TYPE | COUNT(*) |
---|---|
ENTITLEMENT | 6 |
ROLE | 3 |
ACCESS_PROFILE | 3 |
经典的做法是利用每行中的某个“额外”信息(例如 ID)一起分组:
-- Per-row counts: group by the row's id as well as the element type.
select
    d.id,
    elem.value:type::string as type,
    count(*) as count
from data as d,
    lateral flatten(input => d.json) as elem
group by d.id, type;
ID | TYPE | COUNT |
---|---|---|
a | ENTITLEMENT | 2 |
a | ROLE | 1 |
a | ACCESS_PROFILE | 1 |
b | ENTITLEMENT | 1 |
b | ACCESS_PROFILE | 1 |
c | ENTITLEMENT | 3 |
c | ROLE | 1 |
c | ACCESS_PROFILE | 1 |
b | ROLE | 1 |
如果您没有像 ID 这样的每行标识值,或者该值并非每行唯一,另一种方法是使用 FLATTEN 输出的 SEQ 列:
-- Per-row counts without an id column: group by FLATTEN's SEQ output column.
select
    elem.seq,
    elem.value:type::string as type,
    count(*) as count
from data as d,
    lateral flatten(input => d.json) as elem
group by elem.seq, type;
SEQ | TYPE | COUNT |
---|---|---|
1 | ENTITLEMENT | 2 |
1 | ROLE | 1 |
1 | ACCESS_PROFILE | 1 |
2 | ENTITLEMENT | 1 |
2 | ROLE | 1 |
2 | ACCESS_PROFILE | 1 |
3 | ROLE | 1 |
3 | ACCESS_PROFILE | 1 |
3 | ENTITLEMENT | 3 |
或者您也可以不在 SELECT 中输出 SEQ,而只把它放在 GROUP BY 中,但这样可能会让人困惑:“为什么结果里有重复的值?”
-- SEQ is used for grouping but not selected, so the same type can appear on
-- several output rows (one per source row).
select
    elem.value:type::string as type,
    count(*) as count
from data as d,
    lateral flatten(input => d.json) as elem
group by elem.seq, type;
TYPE | COUNT |
---|---|
ENTITLEMENT | 2 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
ENTITLEMENT | 1 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
ROLE | 1 |
ACCESS_PROFILE | 1 |
ENTITLEMENT | 3 |