如何在 Snowflake SQL 中针对 VARIANT 列内的数组对象获取 First_Value()、Last_Value() 以及上一次操作的日期
How to Get First_Value(), Last_Value() and previous Date action for an Array object inside a VARIANT column SnowflakeSQL
我在表 'QWERTY' 中有一个名为 'REQUEST' 的 VARIANT 列,它的 JSON 中包含一个数组对象,例如:
{
"ID": "123123",
"workflowHistory": [
{
"id": "666",
"workflowType": "CCC",
"entityId": "123123",
"creator": {
"id": "503081",
"displayName": "AGENT2",
"email": "AGENT2@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:58:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "555",
"workflowType": "AAA",
"entityId": "123123",
"creator": {
"id": "503080",
"displayName": "AGENT1",
"email": "AGENT1@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:55:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "444",
"workflowType": "xyz",
"entityId": "123123",
"creator": {
"id": "503080",
"displayName": "AGENT1",
"email": "AGENT1@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:19:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "333",
"workflowType": "BBB",
"entityId": "123123",
"creator": {
"id": "503079",
"displayName": "AGENT0",
"email": "AGENT0@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:10:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "222",
"workflowType": "ZZZ",
"entityId": "123123",
"creator": {
"id": "503079",
"displayName": "AGENT0",
"email": "AGENT0@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:08:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
}
]
}
此外,'QWERTY' 表还有 HARVEST_DATE 列和主键 ARTICLE_ID(与 REQUEST:workflowHistory.ID 相同)。我希望得到包含以下列的输出:
- ID
- Last createdDate for an AGENTn
- First createdDate for an AGENTn
- the previous createdDate that is made BY AGENTn-1
- the next createdDate that is made BY AGENTn+1
我想要这样的输出:
OUTPUT
为此,我正在构建如下查询:
-- Asker's original attempt. It is kept as posted (apart from restoring the
-- WITH keyword) because this is the statement that raises
-- "[SYS_VW.CREATE_DATE_1] is not a valid group by expression".
--
-- WorkFlow_Parsed: one row per element of REQUEST:workflowHistory.
WITH WorkFlow_Parsed AS(
SELECT ARTICLE_ID,
HARVEST_DATE,
value:createdDate::timestamp_tz AS create_date,
value:creator:email AS email,
value:workflowType AS workflowType,
value:value AS value
FROM 'QWERTY', lateral flatten( input => REQUEST:workflowHistory )
),
-- lag_Agent_timing: for each row, the preceding create_date of the SAME email
-- within the same article (the partition never crosses over to another agent).
lag_Agent_timing AS
(SELECT
WorkFlow_Parsed.ARTICLE_ID AS ARTICLE_ID,WorkFlow_Parsed.email,LAG(WorkFlow_Parsed.create_date) IGNORE NULLS over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS lag_date_value
FROM WorkFlow_Parsed),
-- lead_agent_timing: same partitioning, but the following create_date.
lead_agent_timing AS
(SELECT
WorkFlow_Parsed.ARTICLE_ID AS ARTICLE_ID,WorkFlow_Parsed.email,LEAD(WorkFlow_Parsed.create_date) IGNORE NULLS over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS lead_date_value
FROM WorkFlow_Parsed)
-- Final projection: window functions over create_date are combined with
-- MAX/MIN aggregates under GROUP BY _ARTICLE_ID,_email. create_date is not
-- part of that GROUP BY, which is what the reported error refers to.
SELECT
DISTINCT
WorkFlow_Parsed.ARTICLE_ID AS _ARTICLE_ID,
WorkFlow_Parsed.email AS _email,
last_value(WorkFlow_Parsed.create_date) over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS last_date_value,
first_value(WorkFlow_Parsed.create_date) over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS first_date_value,
MAX(lag_Agent_timing.lag_date_value),
MIN(lead_agent_timing.lead_date_value)
FROM WorkFlow_Parsed
JOIN lag_Agent_timing ON WorkFlow_Parsed.ARTICLE_ID=lag_Agent_timing.ARTICLE_ID AND lag_Agent_timing.email=WorkFlow_Parsed.email
JOIN lead_agent_timing ON WorkFlow_Parsed.ARTICLE_ID=lead_agent_timing.ARTICLE_ID AND lead_agent_timing.email=WorkFlow_Parsed.email
GROUP BY _ARTICLE_ID,_email
但我收到错误消息:“[SYS_VW.CREATE_DATE_1] is not a valid group by expression”(不是有效的 GROUP BY 表达式)。
我该如何解决?
“[SYS_VW.CREATE_DATE_1] is not a valid group by expression” 这个错误是由于您在最终的 SELECT 查询中使用了 GROUP BY。它指出您在查询中把 Workflow_Parsed.create_date 当作非分组列来引用/使用,但该列并不是 GROUP BY _ARTICLE_ID, _email 表达式的一部分;也就是说,如果您把查询稍微简化一下,就会收到与之等价的错误:[Workflow_Parsed.create_date] is not a valid group by expression。
Snowflake 不允许对窗口函数表达式进行聚合;如果确实需要这样做,可以尝试将查询嵌套成类似 SELECT cols, aggregate(cols) FROM (SELECT cols, window(cols)) GROUP BY cols 的结构,把两者分开(即先对所有行应用窗口函数,然后再对它产生的整个结果进行聚合)。
我不确定您示例查询中的窗口函数想要实现什么,因为我在其中没有看到任何体现代理 n ± 1 关系的地方;不过,根据您描述的需求以及给出的示例输出,下面的查询应该可行(它只使用标量子查询,没有使用窗口函数):
-- Self-contained demo of the scalar-subquery approach: the sample document is
-- inlined, exploded into rows, and every output column is then looked up per
-- (article, creator) pair.
WITH workflows AS (
    -- The sample JSON document, parsed into a single VARIANT value.
    SELECT PARSE_JSON('{"ID":"123123","workflowHistory":[{"id":"666","workflowType":"CCC","entityId":"123123","creator":{"id":"503081","displayName":"AGENT2","email":"AGENT2@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:58:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"555","workflowType":"AAA","entityId":"123123","creator":{"id":"503080","displayName":"AGENT1","email":"AGENT1@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:55:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"444","workflowType":"xyz","entityId":"123123","creator":{"id":"503080","displayName":"AGENT1","email":"AGENT1@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:19:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"333","workflowType":"BBB","entityId":"123123","creator":{"id":"503079","displayName":"AGENT0","email":"AGENT0@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:10:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"222","workflowType":"ZZZ","entityId":"123123","creator":{"id":"503079","displayName":"AGENT0","email":"AGENT0@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:08:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"}]}') AS request
),
workflow_rows AS (
    -- One typed row per element of the workflowHistory array.
    SELECT
        src.request:ID::varchar               AS article_id,
        entry.value:createdDate::timestamp_tz AS created_date,
        entry.value:creator.id::integer       AS creator_id,
        entry.value:creator.email::varchar    AS creator_email,
        entry.value:workflowType::varchar     AS workflow_type,
        entry.value:value::varchar            AS workflow_value
    FROM workflows src,
        LATERAL FLATTEN(INPUT => src.request:workflowHistory) entry
),
article_workflow_creators AS (
    -- Each creator appears once per article; this drives the final output rows.
    SELECT DISTINCT
        article_id,
        creator_id,
        creator_email
    FROM workflow_rows
)
SELECT
    pair.article_id,
    pair.creator_id,
    pair.creator_email,
    -- Latest entry written by this creator on this article.
    (
        SELECT MAX(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id
    ) AS last_date_value,
    -- Earliest entry written by this creator on this article.
    (
        SELECT MIN(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id
    ) AS first_date_value,
    -- Latest entry of the creator whose id is exactly one lower (agent n-1);
    -- NULL when no such creator touched the article.
    (
        SELECT MAX(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id - 1
    ) AS previous_date,
    -- Latest entry of the creator whose id is exactly one higher (agent n+1).
    (
        SELECT MAX(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id + 1
    ) AS next_date
FROM article_workflow_creators pair;
对于问题中包含的单个 JSON 行输入,这将产生:
+------------+------------+----------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
| ARTICLE_ID | CREATOR_ID | CREATOR_EMAIL | LAST_DATE_VALUE | FIRST_DATE_VALUE | PREVIOUS_DATE | NEXT_DATE |
|------------+------------+----------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------|
| 123123 | 503081 | AGENT2@SOMETHING.com | 2020-04-30 21:58:09.000 +0000 | 2020-04-30 21:58:09.000 +0000 | 2020-04-30 21:55:09.000 +0000 | NULL |
| 123123 | 503080 | AGENT1@SOMETHING.com | 2020-04-30 21:55:09.000 +0000 | 2020-04-30 21:19:09.000 +0000 | 2020-04-30 21:10:09.000 +0000 | 2020-04-30 21:58:09.000 +0000 |
| 123123 | 503079 | AGENT0@SOMETHING.com | 2020-04-30 21:10:09.000 +0000 | 2020-04-30 21:08:09.000 +0000 | NULL | 2020-04-30 21:55:09.000 +0000 |
+------------+------------+----------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
下面分享我按照上面推荐的语法写出的代码:
-- Nested form of the query: the window functions are evaluated in their own
-- step, and only afterwards are their results aggregated per article/agent.
WITH WorkFlow_Parsed AS (
    -- One row per element of REQUEST:workflowHistory.
    SELECT
        ARTICLE_ID,
        HARVEST_DATE,
        value:createdDate::timestamp_tz AS create_date,
        value:creator:email AS email,
        value:workflowType AS workflowType,
        value:value AS value
    FROM 'QWERTY', LATERAL FLATTEN(INPUT => REQUEST:workflowHistory)
),
windowed AS (
    -- NOTE(review): first_value/last_value carry no explicit frame clause, so the
    -- result depends on the engine's default window frame -- confirm it spans the
    -- whole partition.
    SELECT DISTINCT
        wp.ARTICLE_ID AS _ARTICLE_ID,
        wp.email AS _email,
        LAST_VALUE(wp.create_date) OVER (
            PARTITION BY wp.email, wp.ARTICLE_ID
            ORDER BY wp.create_date
        ) AS last_date_value,
        FIRST_VALUE(wp.create_date) OVER (
            PARTITION BY wp.email, wp.ARTICLE_ID
            ORDER BY wp.create_date
        ) AS first_date_value,
        -- Neighbouring entries are taken across ALL agents of the article; a
        -- missing neighbour is replaced by a far-past / far-future sentinel.
        COALESCE(
            LAG(wp.create_date) IGNORE NULLS OVER (
                PARTITION BY wp.ARTICLE_ID
                ORDER BY wp.create_date
            ),
            '1900-01-01 00:00:00'
        ) AS lag_value,
        COALESCE(
            LEAD(wp.create_date) IGNORE NULLS OVER (
                PARTITION BY wp.ARTICLE_ID
                ORDER BY wp.create_date
            ),
            '2100-01-01 00:00:00'
        ) AS lead_value
    FROM WorkFlow_Parsed wp
)
SELECT
    _ARTICLE_ID,
    _email,
    last_date_value,
    first_date_value,
    MIN(lag_value),
    MAX(lead_value)
FROM windowed
GROUP BY
    _ARTICLE_ID,
    _email,
    last_date_value,
    first_date_value
我在表 'QWERTY' 中有一个名为 'REQUEST' 的 VARIANT 列,它的 JSON 中包含一个数组对象,例如:
{
"ID": "123123",
"workflowHistory": [
{
"id": "666",
"workflowType": "CCC",
"entityId": "123123",
"creator": {
"id": "503081",
"displayName": "AGENT2",
"email": "AGENT2@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:58:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "555",
"workflowType": "AAA",
"entityId": "123123",
"creator": {
"id": "503080",
"displayName": "AGENT1",
"email": "AGENT1@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:55:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "444",
"workflowType": "xyz",
"entityId": "123123",
"creator": {
"id": "503080",
"displayName": "AGENT1",
"email": "AGENT1@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:19:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "333",
"workflowType": "BBB",
"entityId": "123123",
"creator": {
"id": "503079",
"displayName": "AGENT0",
"email": "AGENT0@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:10:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
},
{
"id": "222",
"workflowType": "ZZZ",
"entityId": "123123",
"creator": {
"id": "503079",
"displayName": "AGENT0",
"email": "AGENT0@SOMETHING.com",
"userAvatarUrl": "XXXXXXX"
},
"createdDate": "2020-04-30T21:08:09Z",
"deletor": null,
"deletedDate": null,
"clientId": "000000000",
"value": "00000000"
}
]
}
此外,'QWERTY' 表还有 HARVEST_DATE 列和主键 ARTICLE_ID(与 REQUEST:workflowHistory.ID 相同)。我希望得到包含以下列的输出:
- ID
- Last createdDate for an AGENTn
- First createdDate for an AGENTn
- the previous createdDate that is made BY AGENTn-1
- the next createdDate that is made BY AGENTn+1
我想要这样的输出:
OUTPUT
为此,我正在构建如下查询:
-- Asker's original attempt. It is kept as posted (apart from restoring the
-- WITH keyword) because this is the statement that raises
-- "[SYS_VW.CREATE_DATE_1] is not a valid group by expression".
--
-- WorkFlow_Parsed: one row per element of REQUEST:workflowHistory.
WITH WorkFlow_Parsed AS(
SELECT ARTICLE_ID,
HARVEST_DATE,
value:createdDate::timestamp_tz AS create_date,
value:creator:email AS email,
value:workflowType AS workflowType,
value:value AS value
FROM 'QWERTY', lateral flatten( input => REQUEST:workflowHistory )
),
-- lag_Agent_timing: for each row, the preceding create_date of the SAME email
-- within the same article (the partition never crosses over to another agent).
lag_Agent_timing AS
(SELECT
WorkFlow_Parsed.ARTICLE_ID AS ARTICLE_ID,WorkFlow_Parsed.email,LAG(WorkFlow_Parsed.create_date) IGNORE NULLS over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS lag_date_value
FROM WorkFlow_Parsed),
-- lead_agent_timing: same partitioning, but the following create_date.
lead_agent_timing AS
(SELECT
WorkFlow_Parsed.ARTICLE_ID AS ARTICLE_ID,WorkFlow_Parsed.email,LEAD(WorkFlow_Parsed.create_date) IGNORE NULLS over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS lead_date_value
FROM WorkFlow_Parsed)
-- Final projection: window functions over create_date are combined with
-- MAX/MIN aggregates under GROUP BY _ARTICLE_ID,_email. create_date is not
-- part of that GROUP BY, which is what the reported error refers to.
SELECT
DISTINCT
WorkFlow_Parsed.ARTICLE_ID AS _ARTICLE_ID,
WorkFlow_Parsed.email AS _email,
last_value(WorkFlow_Parsed.create_date) over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS last_date_value,
first_value(WorkFlow_Parsed.create_date) over (partition by WorkFlow_Parsed.email,WorkFlow_Parsed.ARTICLE_ID order by WorkFlow_Parsed.create_date) AS first_date_value,
MAX(lag_Agent_timing.lag_date_value),
MIN(lead_agent_timing.lead_date_value)
FROM WorkFlow_Parsed
JOIN lag_Agent_timing ON WorkFlow_Parsed.ARTICLE_ID=lag_Agent_timing.ARTICLE_ID AND lag_Agent_timing.email=WorkFlow_Parsed.email
JOIN lead_agent_timing ON WorkFlow_Parsed.ARTICLE_ID=lead_agent_timing.ARTICLE_ID AND lead_agent_timing.email=WorkFlow_Parsed.email
GROUP BY _ARTICLE_ID,_email
但我收到错误消息:“[SYS_VW.CREATE_DATE_1] is not a valid group by expression”(不是有效的 GROUP BY 表达式)。
我该如何解决?
“[SYS_VW.CREATE_DATE_1] is not a valid group by expression” 这个错误是由于您在最终的 SELECT 查询中使用了 GROUP BY。它指出您在查询中把 Workflow_Parsed.create_date 当作非分组列来引用/使用,但该列并不是 GROUP BY _ARTICLE_ID, _email 表达式的一部分;也就是说,如果您把查询稍微简化一下,就会收到与之等价的错误:[Workflow_Parsed.create_date] is not a valid group by expression。
Snowflake 不允许对窗口函数表达式进行聚合;如果确实需要这样做,可以尝试将查询嵌套成类似 SELECT cols, aggregate(cols) FROM (SELECT cols, window(cols)) GROUP BY cols 的结构,把两者分开(即先对所有行应用窗口函数,然后再对它产生的整个结果进行聚合)。
我不确定您示例查询中的窗口函数想要实现什么,因为我在其中没有看到任何体现代理 n ± 1 关系的地方;不过,根据您描述的需求以及给出的示例输出,下面的查询应该可行(它只使用标量子查询,没有使用窗口函数):
-- Self-contained demo of the scalar-subquery approach: the sample document is
-- inlined, exploded into rows, and every output column is then looked up per
-- (article, creator) pair.
WITH workflows AS (
    -- The sample JSON document, parsed into a single VARIANT value.
    SELECT PARSE_JSON('{"ID":"123123","workflowHistory":[{"id":"666","workflowType":"CCC","entityId":"123123","creator":{"id":"503081","displayName":"AGENT2","email":"AGENT2@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:58:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"555","workflowType":"AAA","entityId":"123123","creator":{"id":"503080","displayName":"AGENT1","email":"AGENT1@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:55:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"444","workflowType":"xyz","entityId":"123123","creator":{"id":"503080","displayName":"AGENT1","email":"AGENT1@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:19:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"333","workflowType":"BBB","entityId":"123123","creator":{"id":"503079","displayName":"AGENT0","email":"AGENT0@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:10:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"},{"id":"222","workflowType":"ZZZ","entityId":"123123","creator":{"id":"503079","displayName":"AGENT0","email":"AGENT0@SOMETHING.com","userAvatarUrl":"XXXXXXX"},"createdDate":"2020-04-30T21:08:09Z","deletor":null,"deletedDate":null,"clientId":"000000000","value":"00000000"}]}') AS request
),
workflow_rows AS (
    -- One typed row per element of the workflowHistory array.
    SELECT
        src.request:ID::varchar               AS article_id,
        entry.value:createdDate::timestamp_tz AS created_date,
        entry.value:creator.id::integer       AS creator_id,
        entry.value:creator.email::varchar    AS creator_email,
        entry.value:workflowType::varchar     AS workflow_type,
        entry.value:value::varchar            AS workflow_value
    FROM workflows src,
        LATERAL FLATTEN(INPUT => src.request:workflowHistory) entry
),
article_workflow_creators AS (
    -- Each creator appears once per article; this drives the final output rows.
    SELECT DISTINCT
        article_id,
        creator_id,
        creator_email
    FROM workflow_rows
)
SELECT
    pair.article_id,
    pair.creator_id,
    pair.creator_email,
    -- Latest entry written by this creator on this article.
    (
        SELECT MAX(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id
    ) AS last_date_value,
    -- Earliest entry written by this creator on this article.
    (
        SELECT MIN(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id
    ) AS first_date_value,
    -- Latest entry of the creator whose id is exactly one lower (agent n-1);
    -- NULL when no such creator touched the article.
    (
        SELECT MAX(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id - 1
    ) AS previous_date,
    -- Latest entry of the creator whose id is exactly one higher (agent n+1).
    (
        SELECT MAX(hist.created_date)
        FROM workflow_rows hist
        WHERE hist.article_id = pair.article_id
            AND hist.creator_id = pair.creator_id + 1
    ) AS next_date
FROM article_workflow_creators pair;
对于问题中包含的单个 JSON 行输入,这将产生:
+------------+------------+----------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
| ARTICLE_ID | CREATOR_ID | CREATOR_EMAIL | LAST_DATE_VALUE | FIRST_DATE_VALUE | PREVIOUS_DATE | NEXT_DATE |
|------------+------------+----------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------|
| 123123 | 503081 | AGENT2@SOMETHING.com | 2020-04-30 21:58:09.000 +0000 | 2020-04-30 21:58:09.000 +0000 | 2020-04-30 21:55:09.000 +0000 | NULL |
| 123123 | 503080 | AGENT1@SOMETHING.com | 2020-04-30 21:55:09.000 +0000 | 2020-04-30 21:19:09.000 +0000 | 2020-04-30 21:10:09.000 +0000 | 2020-04-30 21:58:09.000 +0000 |
| 123123 | 503079 | AGENT0@SOMETHING.com | 2020-04-30 21:10:09.000 +0000 | 2020-04-30 21:08:09.000 +0000 | NULL | 2020-04-30 21:55:09.000 +0000 |
+------------+------------+----------------------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+
下面分享我按照上面推荐的语法写出的代码:
-- Nested form of the query: the window functions are evaluated in their own
-- step, and only afterwards are their results aggregated per article/agent.
WITH WorkFlow_Parsed AS (
    -- One row per element of REQUEST:workflowHistory.
    SELECT
        ARTICLE_ID,
        HARVEST_DATE,
        value:createdDate::timestamp_tz AS create_date,
        value:creator:email AS email,
        value:workflowType AS workflowType,
        value:value AS value
    FROM 'QWERTY', LATERAL FLATTEN(INPUT => REQUEST:workflowHistory)
),
windowed AS (
    -- NOTE(review): first_value/last_value carry no explicit frame clause, so the
    -- result depends on the engine's default window frame -- confirm it spans the
    -- whole partition.
    SELECT DISTINCT
        wp.ARTICLE_ID AS _ARTICLE_ID,
        wp.email AS _email,
        LAST_VALUE(wp.create_date) OVER (
            PARTITION BY wp.email, wp.ARTICLE_ID
            ORDER BY wp.create_date
        ) AS last_date_value,
        FIRST_VALUE(wp.create_date) OVER (
            PARTITION BY wp.email, wp.ARTICLE_ID
            ORDER BY wp.create_date
        ) AS first_date_value,
        -- Neighbouring entries are taken across ALL agents of the article; a
        -- missing neighbour is replaced by a far-past / far-future sentinel.
        COALESCE(
            LAG(wp.create_date) IGNORE NULLS OVER (
                PARTITION BY wp.ARTICLE_ID
                ORDER BY wp.create_date
            ),
            '1900-01-01 00:00:00'
        ) AS lag_value,
        COALESCE(
            LEAD(wp.create_date) IGNORE NULLS OVER (
                PARTITION BY wp.ARTICLE_ID
                ORDER BY wp.create_date
            ),
            '2100-01-01 00:00:00'
        ) AS lead_value
    FROM WorkFlow_Parsed wp
)
SELECT
    _ARTICLE_ID,
    _email,
    last_date_value,
    first_date_value,
    MIN(lag_value),
    MAX(lead_value)
FROM windowed
GROUP BY
    _ARTICLE_ID,
    _email,
    last_date_value,
    first_date_value