如何 select 来自 Clickhouse 的百分比数据?
How to select data with percentages from Clickhouse?
给出以下 table:
CREATE TABLE main
(
`job_id` UUID,
`request_time` DateTime,
`host_id` UInt8,
`status_code` LowCardinality(String),
)
ENGINE = MergeTree
ORDER BY request_time
SETTINGS index_granularity = 8192
我正在尝试按主机获取所有状态及其相对百分比。为此,我需要计算按主机和状态分组的结果,并将每个计算的字段除以其总主机字段数。
例如,此查询在 MySQL:
中有效
SELECT
main.host_id,
main.status_code,
COUNT() AS status_count,
COUNT() / sub.host_total * 100 AS percent
FROM
main
INNER JOIN (
SELECT host_id, COUNT() AS host_total
FROM main
GROUP BY host_id
) AS sub ON (sub.host_id = main.host_id)
GROUP BY
main.host_id,
main.status_code
但是 ClickHouse 抛出:
DB::Exception: Unknown identifier: host_total; there are columns: host_id, status_code, count(): While processing host_id, status_code, count() AS status_count, (count() / host_total) * 100 AS percent. (UNKNOWN_IDENTIFIER)
可能是因为 correlated (dependent) subqueries are not supported.
有人建议我使用 CTE,所以我尝试了这个:
WITH sub AS (
SELECT host_id, COUNT() AS host_total
FROM main
GROUP BY host_id
)
SELECT
main.host_id,
main.status_code,
COUNT() AS status_count,
COUNT() / (SELECT host_total FROM sub WHERE sub.host_id = main.host_id) * 100 AS percent
FROM
main
GROUP BY
main.host_id,
main.status_code
但还是没有运气:
DB::Exception: Missing columns: 'main.host_id' while processing query: 'SELECT host_total FROM sub WHERE host_id = main.host_id', required columns: 'host_total' 'host_id' 'main.host_id' 'host_total' 'host_id' 'main.host_id': While processing (SELECT host_total FROM sub WHERE sub.host_id = main.host_id) AS _subquery20: While processing count() / ((SELECT host_total FROM sub WHERE sub.host_id = main.host_id) AS _subquery20): While processing (count() / ((SELECT host_total FROM sub WHERE sub.host_id = main.host_id) AS _subquery20)) * 100 AS percent. (UNKNOWN_IDENTIFIER)
CH 引发错误。 https://github.com/ClickHouse/ClickHouse/issues/4567
host_total 应在 groupby 部分或聚合函数下
insert into main(request_time, host_id,status_code) values ( now(), 1, 200);
insert into main(request_time, host_id,status_code) values ( now(), 1, 500);
insert into main(request_time, host_id,status_code) values ( now(), 1, 200);
insert into main(request_time, host_id,status_code) values ( now(), 2, 500);
insert into main(request_time, host_id,status_code) values ( now(), 2, 200);
insert into main(request_time, host_id,status_code) values ( now(), 3, 500);
SELECT
main.host_id,
main.status_code,
COUNT() AS status_count,
round((COUNT() / any(sub.host_total)) * 100, 2) AS percent
FROM main
INNER JOIN
(
SELECT
host_id,
COUNT() AS host_total
FROM main
GROUP BY host_id
) AS sub ON sub.host_id = main.host_id
GROUP BY
main.host_id,
main.status_code
ORDER BY
main.host_id ASC,
main.status_code ASC
┌─host_id─┬─status_code─┬─status_count─┬─percent─┐
│ 1 │ 200 │ 2 │ 66.67 │
│ 1 │ 500 │ 1 │ 33.33 │
│ 2 │ 200 │ 1 │ 50 │
│ 2 │ 500 │ 1 │ 50 │
│ 3 │ 500 │ 1 │ 100 │
└─────────┴─────────────┴──────────────┴─────────┘
但是有更好的方法可以解决:
window 函数
SELECT
host_id,
status_code,
status_count,
round((status_count / host_total) * 100, 2) AS percent
FROM
(
SELECT
host_id,
status_code,
status_count,
sum(status_count) OVER (PARTITION BY host_id) AS host_total
FROM
(
SELECT
host_id,
status_code,
COUNT() AS status_count
FROM main
GROUP BY
host_id,
status_code
)
)
ORDER BY
host_id ASC,
status_code ASC
┌─host_id─┬─status_code─┬─status_count─┬─percent─┐
│ 1 │ 200 │ 2 │ 66.67 │
│ 1 │ 500 │ 1 │ 33.33 │
│ 2 │ 200 │ 1 │ 50 │
│ 2 │ 500 │ 1 │ 50 │
│ 3 │ 500 │ 1 │ 100 │
└─────────┴─────────────┴──────────────┴─────────┘
数组
SELECT
host_id,
status_code,
status_count,
round((status_count / host_total) * 100, 2) AS percent
FROM
(
SELECT
host_id,
sumMap([CAST(status_code, 'String')], [1]) AS ga,
count() AS host_total
FROM main
GROUP BY host_id
)
ARRAY JOIN
ga.1 AS status_code,
ga.2 AS status_count
┌─host_id─┬─status_code─┬─status_count─┬─percent─┐
│ 1 │ 200 │ 2 │ 66.67 │
│ 1 │ 500 │ 1 │ 33.33 │
│ 2 │ 200 │ 1 │ 50 │
│ 2 │ 500 │ 1 │ 50 │
│ 3 │ 500 │ 1 │ 100 │
└─────────┴─────────────┴──────────────┴─────────┘
给出以下 table:
CREATE TABLE main
(
`job_id` UUID,
`request_time` DateTime,
`host_id` UInt8,
`status_code` LowCardinality(String),
)
ENGINE = MergeTree
ORDER BY request_time
SETTINGS index_granularity = 8192
我正在尝试按主机获取所有状态及其相对百分比。为此,我需要计算按主机和状态分组的结果,并将每个计算的字段除以其总主机字段数。
例如,此查询在 MySQL:
中有效SELECT
main.host_id,
main.status_code,
COUNT() AS status_count,
COUNT() / sub.host_total * 100 AS percent
FROM
main
INNER JOIN (
SELECT host_id, COUNT() AS host_total
FROM main
GROUP BY host_id
) AS sub ON (sub.host_id = main.host_id)
GROUP BY
main.host_id,
main.status_code
但是 ClickHouse 抛出:
DB::Exception: Unknown identifier: host_total; there are columns: host_id, status_code, count(): While processing host_id, status_code, count() AS status_count, (count() / host_total) * 100 AS percent. (UNKNOWN_IDENTIFIER)
可能是因为 correlated (dependent) subqueries are not supported.
有人建议我使用 CTE,所以我尝试了这个:
WITH sub AS (
SELECT host_id, COUNT() AS host_total
FROM main
GROUP BY host_id
)
SELECT
main.host_id,
main.status_code,
COUNT() AS status_count,
COUNT() / (SELECT host_total FROM sub WHERE sub.host_id = main.host_id) * 100 AS percent
FROM
main
GROUP BY
main.host_id,
main.status_code
但还是没有运气:
DB::Exception: Missing columns: 'main.host_id' while processing query: 'SELECT host_total FROM sub WHERE host_id = main.host_id', required columns: 'host_total' 'host_id' 'main.host_id' 'host_total' 'host_id' 'main.host_id': While processing (SELECT host_total FROM sub WHERE sub.host_id = main.host_id) AS _subquery20: While processing count() / ((SELECT host_total FROM sub WHERE sub.host_id = main.host_id) AS _subquery20): While processing (count() / ((SELECT host_total FROM sub WHERE sub.host_id = main.host_id) AS _subquery20)) * 100 AS percent. (UNKNOWN_IDENTIFIER)
CH 引发错误。 https://github.com/ClickHouse/ClickHouse/issues/4567
host_total 应在 groupby 部分或聚合函数下
insert into main(request_time, host_id,status_code) values ( now(), 1, 200);
insert into main(request_time, host_id,status_code) values ( now(), 1, 500);
insert into main(request_time, host_id,status_code) values ( now(), 1, 200);
insert into main(request_time, host_id,status_code) values ( now(), 2, 500);
insert into main(request_time, host_id,status_code) values ( now(), 2, 200);
insert into main(request_time, host_id,status_code) values ( now(), 3, 500);
SELECT
main.host_id,
main.status_code,
COUNT() AS status_count,
round((COUNT() / any(sub.host_total)) * 100, 2) AS percent
FROM main
INNER JOIN
(
SELECT
host_id,
COUNT() AS host_total
FROM main
GROUP BY host_id
) AS sub ON sub.host_id = main.host_id
GROUP BY
main.host_id,
main.status_code
ORDER BY
main.host_id ASC,
main.status_code ASC
┌─host_id─┬─status_code─┬─status_count─┬─percent─┐
│ 1 │ 200 │ 2 │ 66.67 │
│ 1 │ 500 │ 1 │ 33.33 │
│ 2 │ 200 │ 1 │ 50 │
│ 2 │ 500 │ 1 │ 50 │
│ 3 │ 500 │ 1 │ 100 │
└─────────┴─────────────┴──────────────┴─────────┘
但是有更好的方法可以解决:
window 函数
SELECT
host_id,
status_code,
status_count,
round((status_count / host_total) * 100, 2) AS percent
FROM
(
SELECT
host_id,
status_code,
status_count,
sum(status_count) OVER (PARTITION BY host_id) AS host_total
FROM
(
SELECT
host_id,
status_code,
COUNT() AS status_count
FROM main
GROUP BY
host_id,
status_code
)
)
ORDER BY
host_id ASC,
status_code ASC
┌─host_id─┬─status_code─┬─status_count─┬─percent─┐
│ 1 │ 200 │ 2 │ 66.67 │
│ 1 │ 500 │ 1 │ 33.33 │
│ 2 │ 200 │ 1 │ 50 │
│ 2 │ 500 │ 1 │ 50 │
│ 3 │ 500 │ 1 │ 100 │
└─────────┴─────────────┴──────────────┴─────────┘
数组
SELECT
host_id,
status_code,
status_count,
round((status_count / host_total) * 100, 2) AS percent
FROM
(
SELECT
host_id,
sumMap([CAST(status_code, 'String')], [1]) AS ga,
count() AS host_total
FROM main
GROUP BY host_id
)
ARRAY JOIN
ga.1 AS status_code,
ga.2 AS status_count
┌─host_id─┬─status_code─┬─status_count─┬─percent─┐
│ 1 │ 200 │ 2 │ 66.67 │
│ 1 │ 500 │ 1 │ 33.33 │
│ 2 │ 200 │ 1 │ 50 │
│ 2 │ 500 │ 1 │ 50 │
│ 3 │ 500 │ 1 │ 100 │
└─────────┴─────────────┴──────────────┴─────────┘