SQL 嵌套数据的累计总和(running total),直到达到指定值
SQL nested data running total until value reached
我正在尝试使用此处的教程在 Google BigQuery 上使用复杂的嵌套数据集:
https://developers.google.com/web/tools/chrome-user-experience-report/getting-started
导出为 json 的数据如下所示:
https://gist.github.com/kmturley/c46eb3898d6ee62871f4402a4d2c6f7d
我正在使用的示例是:
https://bigquery.cloud.google.com/savedquery/226352634162:c5b7ee9ea0394728a821cf60f58737c2
#standardSQL
-- Per connection type, total density of the first-contentful-paint histogram
-- bins that end at or before 1000, restricted to a single origin.
SELECT
  effective_connection_type.name AS ect,
  SUM((
    -- The origin filter lives inside the per-row subquery, so rows of other
    -- origins produce NULL here and contribute nothing to the outer SUM.
    SELECT SUM(bin.density)
    FROM UNNEST(first_contentful_paint.histogram.bin) AS bin
    WHERE origin = 'http://example.com'
      AND bin.end <= 1000
  )) AS density
FROM `chrome-ux-report.chrome_ux_report.201710`
GROUP BY ect
ORDER BY density DESC
这输出数据:
1 4G 0.6977
2 3G 0.056
3 slow-2G null
4 2G null
5 offline null
然而,当 SUM(bin.density) 达到 0.7 时,我想输出 bin.end 值(时间)。我的预期输出看起来像这样:
1 4G 1000
2 3G 50000
3 slow-2G null
4 2G null
5 offline null
意思是在 4G 连接上,约 70% (0.7) 的页面加载时间少于 1 秒 (1000 毫秒)。我试图将脚本修改为:
-- Attempted fragment: filter on the aggregate directly in WHERE.
SELECT
SUM(bin.density)
WHERE
SUM(bin.density) <= 0.7
但这是不允许的,所以尝试了:
-- Attempted fragment: alias the aggregate and filter on that alias in WHERE.
SELECT
SUM(bin.density) AS RunningTotal
WHERE
RunningTotal <= 0.7
也尝试过
-- Attempted fragment: alias a SUM with an empty OVER() window and filter on
-- that alias in WHERE.
SELECT
SUM(bin.density) OVER() AS RunningTotal
WHERE
RunningTotal <= 0.7
但这也不行!如何在嵌套数据集上实现累计总和(running total)?并让它输出 bin.end 时间?
如果我无法在嵌套数据集上用 SQL 计算累计总和,那么我唯一的其他选择就是展平数据集,并使用 Python 遍历每一行来计算结果,而那样性能要差得多!
更新:解决方案基于 Felipe Hoffa 的回答
#standardSQL
-- For phone traffic of one origin, report per connection type the last
-- histogram bin whose running density total is still below 0.7.
WITH phone_histograms AS (
  -- First-contentful-paint histograms for the chosen origin and form factor.
  SELECT
    origin,
    form_factor.name AS form,
    effective_connection_type.name AS ect,
    first_contentful_paint.histogram
  FROM `chrome-ux-report.chrome_ux_report.201710`
  WHERE origin = 'http://example.com'
    AND form_factor.name = 'phone'
),
running_density AS (
  -- One row per histogram bin; density becomes the running total within each
  -- connection type, accumulated in ascending order of the bin end.
  SELECT
    origin,
    form,
    ect,
    bin.end,
    SUM(bin.density) OVER (PARTITION BY ect ORDER BY `end`) AS density
  FROM phone_histograms
  CROSS JOIN UNNEST(histogram.bin) AS bin
),
below_threshold AS (
  -- Keep only bins still under the 0.7 threshold and number them from the
  -- highest bin end downwards within each connection type.
  SELECT
    origin,
    form,
    ect,
    `end`,
    density,
    ROW_NUMBER() OVER (PARTITION BY ect ORDER BY `end` DESC) AS rn
  FROM running_density
  WHERE density < 0.7
)
SELECT origin, form, ect, `end`, density
FROM below_threshold
WHERE rn = 1
https://bigquery.cloud.google.com/savedquery/88263730615:ba1906e86b074511a804660ec973de37
我想你想要一个子查询和一个累加和:
-- Template: wrap the existing query in a CTE, add a running total of density
-- (largest densities first), and keep the row(s) where that total crosses 0.7.
WITH cte AS (
<your query here>
),
totals AS (
  SELECT
    cte.*,
    SUM(density) OVER (ORDER BY density DESC) AS runningtotal
  FROM cte
)
SELECT x.*
FROM totals AS x
WHERE runningtotal - density < 0.7
  AND runningtotal >= 0.7;
使用累积 SUM(),并按连接类型对结果进行排名:
#standardSQL
-- For one origin, report per connection type the last histogram bin whose
-- running density total is still below 0.7.
WITH origin_histograms AS (
  -- First-contentful-paint histograms recorded for the chosen origin.
  SELECT
    effective_connection_type.name AS ect,
    first_contentful_paint.histogram
  FROM `chrome-ux-report.chrome_ux_report.201710`
  WHERE origin = 'http://example.com'
),
running_density AS (
  -- One row per histogram bin; density becomes the running total within each
  -- connection type, accumulated in ascending order of the bin end.
  SELECT
    ect,
    bin.end,
    SUM(bin.density) OVER (PARTITION BY ect ORDER BY `end`) AS density
  FROM origin_histograms
  CROSS JOIN UNNEST(histogram.bin) AS bin
),
below_threshold AS (
  -- Keep only bins still under the 0.7 threshold and number them from the
  -- highest bin end downwards within each connection type.
  SELECT
    ect,
    `end`,
    density,
    ROW_NUMBER() OVER (PARTITION BY ect ORDER BY `end` DESC) AS rn
  FROM running_density
  WHERE density < 0.7
)
SELECT ect, `end`, density
FROM below_threshold
WHERE rn = 1
我正在尝试使用此处的教程在 Google BigQuery 上使用复杂的嵌套数据集: https://developers.google.com/web/tools/chrome-user-experience-report/getting-started
导出为 json 的数据如下所示: https://gist.github.com/kmturley/c46eb3898d6ee62871f4402a4d2c6f7d
我正在使用的示例是: https://bigquery.cloud.google.com/savedquery/226352634162:c5b7ee9ea0394728a821cf60f58737c2
#standardSQL
-- Per connection type, total density of the first-contentful-paint histogram
-- bins that end at or before 1000, restricted to a single origin.
SELECT
  effective_connection_type.name AS ect,
  SUM((
    -- The origin filter lives inside the per-row subquery, so rows of other
    -- origins produce NULL here and contribute nothing to the outer SUM.
    SELECT SUM(bin.density)
    FROM UNNEST(first_contentful_paint.histogram.bin) AS bin
    WHERE origin = 'http://example.com'
      AND bin.end <= 1000
  )) AS density
FROM `chrome-ux-report.chrome_ux_report.201710`
GROUP BY ect
ORDER BY density DESC
这输出数据:
1 4G 0.6977
2 3G 0.056
3 slow-2G null
4 2G null
5 offline null
然而,当 SUM(bin.density) 达到 0.7 时,我想输出 bin.end 值(时间)。我的预期输出看起来像这样:
1 4G 1000
2 3G 50000
3 slow-2G null
4 2G null
5 offline null
意思是在 4G 连接上,约 70% (0.7) 的页面加载时间少于 1 秒 (1000 毫秒)。我试图将脚本修改为:
-- Attempted fragment: filter on the aggregate directly in WHERE.
SELECT
SUM(bin.density)
WHERE
SUM(bin.density) <= 0.7
但这是不允许的,所以尝试了:
-- Attempted fragment: alias the aggregate and filter on that alias in WHERE.
SELECT
SUM(bin.density) AS RunningTotal
WHERE
RunningTotal <= 0.7
也尝试过
-- Attempted fragment: alias a SUM with an empty OVER() window and filter on
-- that alias in WHERE.
SELECT
SUM(bin.density) OVER() AS RunningTotal
WHERE
RunningTotal <= 0.7
但这也不行!如何在嵌套数据集上实现累计总和(running total)?并让它输出 bin.end 时间?
如果我无法在嵌套数据集上用 SQL 计算累计总和,那么我唯一的其他选择就是展平数据集,并使用 Python 遍历每一行来计算结果,而那样性能要差得多!
更新:解决方案基于 Felipe Hoffa 的回答
#standardSQL
-- For phone traffic of one origin, report per connection type the last
-- histogram bin whose running density total is still below 0.7.
WITH phone_histograms AS (
  -- First-contentful-paint histograms for the chosen origin and form factor.
  SELECT
    origin,
    form_factor.name AS form,
    effective_connection_type.name AS ect,
    first_contentful_paint.histogram
  FROM `chrome-ux-report.chrome_ux_report.201710`
  WHERE origin = 'http://example.com'
    AND form_factor.name = 'phone'
),
running_density AS (
  -- One row per histogram bin; density becomes the running total within each
  -- connection type, accumulated in ascending order of the bin end.
  SELECT
    origin,
    form,
    ect,
    bin.end,
    SUM(bin.density) OVER (PARTITION BY ect ORDER BY `end`) AS density
  FROM phone_histograms
  CROSS JOIN UNNEST(histogram.bin) AS bin
),
below_threshold AS (
  -- Keep only bins still under the 0.7 threshold and number them from the
  -- highest bin end downwards within each connection type.
  SELECT
    origin,
    form,
    ect,
    `end`,
    density,
    ROW_NUMBER() OVER (PARTITION BY ect ORDER BY `end` DESC) AS rn
  FROM running_density
  WHERE density < 0.7
)
SELECT origin, form, ect, `end`, density
FROM below_threshold
WHERE rn = 1
https://bigquery.cloud.google.com/savedquery/88263730615:ba1906e86b074511a804660ec973de37
我想你想要一个子查询和一个累加和:
-- Template: wrap the existing query in a CTE, add a running total of density
-- (largest densities first), and keep the row(s) where that total crosses 0.7.
WITH cte AS (
<your query here>
),
totals AS (
  SELECT
    cte.*,
    SUM(density) OVER (ORDER BY density DESC) AS runningtotal
  FROM cte
)
SELECT x.*
FROM totals AS x
WHERE runningtotal - density < 0.7
  AND runningtotal >= 0.7;
使用累积 SUM(),并按连接类型对结果进行排名:
#standardSQL
-- For one origin, report per connection type the last histogram bin whose
-- running density total is still below 0.7.
WITH origin_histograms AS (
  -- First-contentful-paint histograms recorded for the chosen origin.
  SELECT
    effective_connection_type.name AS ect,
    first_contentful_paint.histogram
  FROM `chrome-ux-report.chrome_ux_report.201710`
  WHERE origin = 'http://example.com'
),
running_density AS (
  -- One row per histogram bin; density becomes the running total within each
  -- connection type, accumulated in ascending order of the bin end.
  SELECT
    ect,
    bin.end,
    SUM(bin.density) OVER (PARTITION BY ect ORDER BY `end`) AS density
  FROM origin_histograms
  CROSS JOIN UNNEST(histogram.bin) AS bin
),
below_threshold AS (
  -- Keep only bins still under the 0.7 threshold and number them from the
  -- highest bin end downwards within each connection type.
  SELECT
    ect,
    `end`,
    density,
    ROW_NUMBER() OVER (PARTITION BY ect ORDER BY `end` DESC) AS rn
  FROM running_density
  WHERE density < 0.7
)
SELECT ect, `end`, density
FROM below_threshold
WHERE rn = 1