在 BigQuery 中,如何分别对每列的值进行排序
In BigQuery, how to sort values separately for each column
是否可以在 bigquery 的 table 中对每一列单独排序?我们有一个 table 的统计数据,我们想对分区内的每一列分别进行排序。我们有 3 个统计数据 stat1
、stat2
、stat3
并且 gender
是我们的分区键:
with
stats as (
select 'male' as gender, .60 as stat1, 23 as stat2, .10 as stat3 union all
select 'male' as gender, .62 as stat1, 28 as stat2, .12 as stat3 union all
select 'male' as gender, .57 as stat1, 21 as stat2, .16 as stat3 union all
select 'male' as gender, .51 as stat1, 18 as stat2, .14 as stat3 union all
select 'male' as gender, .53 as stat1, 17 as stat2, .18 as stat3 union all
select 'male' as gender, .46 as stat1, 31 as stat2, .08 as stat3 union all
select 'male' as gender, .49 as stat1, 32 as stat2, .07 as stat3 union all
select 'male' as gender, .55 as stat1, 40 as stat2, .23 as stat3 union all
select 'male' as gender, .68 as stat1, 41 as stat2, .33 as stat3 union all
select 'male' as gender, .56 as stat1, 36 as stat2, .32 as stat3 union all
select 'female' as gender, .80 as stat1, 32 as stat2, .42 as stat3 union all
select 'female' as gender, .82 as stat1, 24 as stat2, .43 as stat3 union all
select 'female' as gender, .73 as stat1, 26 as stat2, .33 as stat3 union all
select 'female' as gender, .85 as stat1, 27 as stat2, .55 as stat3 union all
select 'female' as gender, .91 as stat1, 29 as stat2, .53 as stat3 union all
select 'female' as gender, .88 as stat1, 13 as stat2, .51 as stat3 union all
select 'female' as gender, .86 as stat1, 38 as stat2, .49 as stat3 union all
select 'female' as gender, .77 as stat1, 35 as stat2, .40 as stat3 union all
select 'female' as gender, .74 as stat1, 15 as stat2, .58 as stat3 union all
select 'female' as gender, .95 as stat1, 17 as stat2, .59 as stat3
),
-- we create rank columns
stats_with_ranks as (
select
*
,row_number() over (partition by gender) as rank
,rank() over (partition by gender order by stat1 desc) as stat1rk
,rank() over (partition by gender order by stat2 desc) as stat2rk
,rank() over (partition by gender order by stat3 desc) as stat3rk
from stats
)
select * from stats_with_ranks where gender = 'female' order by rank asc
这不是我们问题的正确 table。我们希望每个指标都在其自己的列中排序,以便其值对应于单个 rank
列。例如,stat1
、stat2
、stat3
中每一个的最大值将与 rank = 1
位于同一行。像这样:
我们想出了以下方法:
select
a.gender
,a.rank
,b.stat1 as stat1
,c.stat2 as stat2
,d.stat3 as stat3
from stats_with_ranks as a
left join stats_with_ranks as b on a.gender = b.gender and a.rank = b.stat1rk
left join stats_with_ranks as c on a.gender = c.gender and a.rank = c.stat2rk
left join stats_with_ranks as d on a.gender = d.gender and a.rank = d.stat3rk
order by gender asc, rank asc
但是 这种方法无法扩展到我们更大的数据集,其中我们正在排名的 table 中有 200 多个统计数据。当有 200 多个这样的左连接时,我们很快就会得到错误 Resources exceeded during query execution: Not enough resources for query planning - too many subqueries or query is too complex.
。这就是为什么我们正在寻找一种在列内排序的解决方案。
以下答案假设所有统计列都具有相同的数据类型并且命名约定为 statN
execute immediate (select '''
select * from (
select *, row_number() over(partition by gender, col order by stat desc) rank
from stats
unpivot (stat for col in (''' || col_list || '''))
)
pivot (any_value(stat) for col in (''' || val_list || '''))
order by gender, rank
'''
from (
select
string_agg('"stat' || pos || '"') val_list,
string_agg('stat' || pos) col_list
from unnest(generate_array(1,3)) pos
)
)
如果应用于您问题中的样本数据(所有 float64 值)
We don't mind listing out all 200+ stats so long as the query resources aren't exceeded ...
create temp function extract_keys(input string) returns array<string> language js as "return Object.keys(JSON.parse(input));";
create temp function extract_values(input string) returns array<string> language js as "return Object.values(JSON.parse(input));";
select * from (
select gender, col, cast(stat as float64) stat, row_number() over(partition by gender, col order by cast(stat as float64) desc) rank
from stats t,
unnest([struct(to_json_string((select as struct * except(gender) from unnest([t]))) as json)]),
unnest(extract_keys(json)) col with offset
join unnest(extract_values(json)) stat with offset
using(offset)
)
pivot (any_value(stat) for col in ('stat1','stat2','stat3'))
如果应用于您问题中的示例数据 - 输出为
是否可以在 bigquery 的 table 中对每一列单独排序?我们有一个 table 的统计数据,我们想对分区内的每一列分别进行排序。我们有 3 个统计数据 stat1
、stat2
、stat3
并且 gender
是我们的分区键:
with
stats as (
select 'male' as gender, .60 as stat1, 23 as stat2, .10 as stat3 union all
select 'male' as gender, .62 as stat1, 28 as stat2, .12 as stat3 union all
select 'male' as gender, .57 as stat1, 21 as stat2, .16 as stat3 union all
select 'male' as gender, .51 as stat1, 18 as stat2, .14 as stat3 union all
select 'male' as gender, .53 as stat1, 17 as stat2, .18 as stat3 union all
select 'male' as gender, .46 as stat1, 31 as stat2, .08 as stat3 union all
select 'male' as gender, .49 as stat1, 32 as stat2, .07 as stat3 union all
select 'male' as gender, .55 as stat1, 40 as stat2, .23 as stat3 union all
select 'male' as gender, .68 as stat1, 41 as stat2, .33 as stat3 union all
select 'male' as gender, .56 as stat1, 36 as stat2, .32 as stat3 union all
select 'female' as gender, .80 as stat1, 32 as stat2, .42 as stat3 union all
select 'female' as gender, .82 as stat1, 24 as stat2, .43 as stat3 union all
select 'female' as gender, .73 as stat1, 26 as stat2, .33 as stat3 union all
select 'female' as gender, .85 as stat1, 27 as stat2, .55 as stat3 union all
select 'female' as gender, .91 as stat1, 29 as stat2, .53 as stat3 union all
select 'female' as gender, .88 as stat1, 13 as stat2, .51 as stat3 union all
select 'female' as gender, .86 as stat1, 38 as stat2, .49 as stat3 union all
select 'female' as gender, .77 as stat1, 35 as stat2, .40 as stat3 union all
select 'female' as gender, .74 as stat1, 15 as stat2, .58 as stat3 union all
select 'female' as gender, .95 as stat1, 17 as stat2, .59 as stat3
),
-- we create rank columns
stats_with_ranks as (
select
*
,row_number() over (partition by gender) as rank
,rank() over (partition by gender order by stat1 desc) as stat1rk
,rank() over (partition by gender order by stat2 desc) as stat2rk
,rank() over (partition by gender order by stat3 desc) as stat3rk
from stats
)
select * from stats_with_ranks where gender = 'female' order by rank asc
这不是我们问题的正确 table。我们希望每个指标都在其自己的列中排序,以便其值对应于单个 rank
列。例如,stat1
、stat2
、stat3
中每一个的最大值将与 rank = 1
位于同一行。像这样:
我们想出了以下方法:
select
a.gender
,a.rank
,b.stat1 as stat1
,c.stat2 as stat2
,d.stat3 as stat3
from stats_with_ranks as a
left join stats_with_ranks as b on a.gender = b.gender and a.rank = b.stat1rk
left join stats_with_ranks as c on a.gender = c.gender and a.rank = c.stat2rk
left join stats_with_ranks as d on a.gender = d.gender and a.rank = d.stat3rk
order by gender asc, rank asc
但是 这种方法无法扩展到我们更大的数据集,其中我们正在排名的 table 中有 200 多个统计数据。当有 200 多个这样的左连接时,我们很快就会得到错误 Resources exceeded during query execution: Not enough resources for query planning - too many subqueries or query is too complex.
。这就是为什么我们正在寻找一种在列内排序的解决方案。
以下答案假设所有统计列都具有相同的数据类型并且命名约定为 statN
execute immediate (select '''
select * from (
select *, row_number() over(partition by gender, col order by stat desc) rank
from stats
unpivot (stat for col in (''' || col_list || '''))
)
pivot (any_value(stat) for col in (''' || val_list || '''))
order by gender, rank
'''
from (
select
string_agg('"stat' || pos || '"') val_list,
string_agg('stat' || pos) col_list
from unnest(generate_array(1,3)) pos
)
)
如果应用于您问题中的样本数据(所有 float64 值)
We don't mind listing out all 200+ stats so long as the query resources aren't exceeded ...
create temp function extract_keys(input string) returns array<string> language js as "return Object.keys(JSON.parse(input));";
create temp function extract_values(input string) returns array<string> language js as "return Object.values(JSON.parse(input));";
select * from (
select gender, col, cast(stat as float64) stat, row_number() over(partition by gender, col order by cast(stat as float64) desc) rank
from stats t,
unnest([struct(to_json_string((select as struct * except(gender) from unnest([t]))) as json)]),
unnest(extract_keys(json)) col with offset
join unnest(extract_values(json)) stat with offset
using(offset)
)
pivot (any_value(stat) for col in ('stat1','stat2','stat3'))
如果应用于您问题中的示例数据 - 输出为