在 BigQuery 中用最少数量的桶装下所有元素
Fewest number of buckets to bag elements in bigquery
我有一个由桶和元素组成的矩阵,如下所示。如果某个元素可以放入某个桶,则对应单元格的值为 1。
例如:如图(即下面的示例数据)所示,元素 x 可以放入桶 a、b、c,但不能放入桶 d 和 e。
我想找出能装下全部元素的最少的桶。在这个例子中,只需桶 c 和桶 d 这两个桶就能装下所有元素。
有没有办法在 BigQuery 中动态且高效地实现这一点?实际数据远没有这么简单。
-- Sample input: one row per element, one 0/1 flag column per bucket
-- (1 = the element fits into that bucket). Column names are taken from
-- the first SELECT of the UNION ALL chain.
SELECT "element-x" AS element, 1 AS bucketa, 1 AS bucketb, 1 AS bucketc, 0 AS bucketd, 0 AS buckete
UNION ALL SELECT "element-y", 0, 0, 1, 0, 0
UNION ALL SELECT "element-z", 1, 0, 1, 0, 0
UNION ALL SELECT "element-p", 0, 0, 1, 0, 0
UNION ALL SELECT "element-q", 1, 0, 0, 1, 0
UNION ALL SELECT "element-r", 0, 1, 0, 1, 1
考虑以下解决方案——显然,您需要确保在 matrix CTE 中提供准确的数据,
并且还需要相应地调整 buckets_elements CTE,使其涵盖矩阵中的所有桶。
其余的 CTE 和最终查询会为您完成剩下的工作!
-- Brute-force search for the bucket combination that covers the most
-- distinct elements using the fewest buckets. Every non-empty subset of
-- buckets is enumerated (2^n - 1 subsets for n buckets), so this is only
-- practical for a small number of bucket columns.
WITH matrix AS (
  -- Input: one row per element, one 0/1 flag column per bucket.
  SELECT "element-x" AS element, 1 AS bucketa, 1 AS bucketb, 1 AS bucketc, 0 AS bucketd, 0 AS buckete
  UNION ALL SELECT "element-y", 0, 0, 1, 0, 0
  UNION ALL SELECT "element-z", 1, 0, 1, 0, 0
  UNION ALL SELECT "element-p", 0, 0, 1, 0, 0
  UNION ALL SELECT "element-q", 1, 0, 0, 1, 0
  UNION ALL SELECT "element-r", 0, 1, 0, 1, 1
),
buckets_elements AS (
  -- Single row: per bucket, the array of elements that fit into it.
  -- Must list every bucket column of matrix, in matrix column order,
  -- because buckets are later addressed by position.
  -- NOTE(review): the ranking step reads each struct's field as `a`, so this
  -- relies on the array literal taking its field name from the first struct.
  SELECT ARRAY[STRUCT(a), STRUCT(b), STRUCT(c), STRUCT(d), STRUCT(e)] AS buckets
  FROM (
    SELECT
      ARRAY_AGG(CASE WHEN bucketa = 1 THEN element END IGNORE NULLS) AS a,
      ARRAY_AGG(CASE WHEN bucketb = 1 THEN element END IGNORE NULLS) AS b,
      ARRAY_AGG(CASE WHEN bucketc = 1 THEN element END IGNORE NULLS) AS c,
      ARRAY_AGG(CASE WHEN bucketd = 1 THEN element END IGNORE NULLS) AS d,
      ARRAY_AGG(CASE WHEN buckete = 1 THEN element END IGNORE NULLS) AS e
    FROM matrix
  )
),
columns_names AS (
  -- Bucket column names, pulled from the JSON keys of one matrix row
  -- (the element column is excluded; the flag values are unquoted numbers,
  -- so only the keys match the quoted-string pattern).
  SELECT
    REGEXP_EXTRACT_ALL(
      TO_JSON_STRING((SELECT AS STRUCT * EXCEPT (element) FROM UNNEST([t]))),
      r'"([^"]+)"'
    ) AS cols
  FROM matrix AS t
  LIMIT 1
),
columns_index AS (
  -- Zero-based positions of the bucket columns: [0, 1, ..., n - 1].
  SELECT GENERATE_ARRAY(0, ARRAY_LENGTH(cols) - 1) AS bucket_positions
  FROM columns_names
),
buckets_combinations AS (
  -- One row per non-empty subset of bucket positions. Each integer mask in
  -- 1 .. 2^n - 1 selects the positions whose bit is set.
  SELECT
    (
      SELECT
        ARRAY_AGG(
          CASE WHEN (mask & (1 << bit)) <> 0 THEN bucket_positions[OFFSET(bit)] END
          IGNORE NULLS
        )
      FROM UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(bucket_positions) - 1)) AS bit
    ) AS combo
  FROM columns_index
  CROSS JOIN UNNEST(
    GENERATE_ARRAY(1, CAST(POWER(2, ARRAY_LENGTH(bucket_positions)) - 1 AS INT64))
  ) AS mask
),
combo_buckets AS (
  -- For each combination, collect the element arrays of its buckets.
  -- Grouping by the formatted row keeps one group per distinct combination.
  SELECT
    ANY_VALUE(bc).combo,
    ARRAY_AGG(buckets[OFFSET(bucket_pos)]) AS chosen
  FROM buckets_combinations AS bc
  CROSS JOIN UNNEST(bc.combo) AS bucket_pos
  CROSS JOIN buckets_elements AS be
  GROUP BY FORMAT('%t', bc)
),
ranked_combos AS (
  -- Best combination first: most distinct elements covered, then fewest buckets.
  SELECT
    combo,
    RANK() OVER (
      ORDER BY
        (
          SELECT COUNT(DISTINCT member)
          FROM UNNEST(chosen) AS bucket
          CROSS JOIN UNNEST(bucket.a) AS member
        ) DESC,
        ARRAY_LENGTH(combo)
    ) AS coverage_rank
  FROM combo_buckets
)
-- Translate winning positions back to bucket column names. RANK() keeps
-- ties, so several equally good combinations may be returned.
SELECT
  ARRAY(
    SELECT cols[OFFSET(bucket_pos)]
    FROM columns_names
    CROSS JOIN UNNEST(combo) AS bucket_pos
  ) AS winners
FROM ranked_combos
WHERE coverage_rank = 1
输出结果如下:winners 数组包含 bucketc 和 bucketd。
我有一个由桶和元素组成的矩阵,如下所示。如果某个元素可以放入某个桶,则对应单元格的值为 1。
例如:如图(即下面的示例数据)所示,元素 x 可以放入桶 a、b、c,但不能放入桶 d 和 e。
我想找出能装下全部元素的最少的桶。在这个例子中,只需桶 c 和桶 d 这两个桶就能装下所有元素。
有没有办法在 BigQuery 中动态且高效地实现这一点?实际数据远没有这么简单。
-- Sample input: one row per element, one 0/1 flag column per bucket
-- (1 = the element fits into that bucket). Column names are taken from
-- the first SELECT of the UNION ALL chain.
SELECT "element-x" AS element, 1 AS bucketa, 1 AS bucketb, 1 AS bucketc, 0 AS bucketd, 0 AS buckete
UNION ALL SELECT "element-y", 0, 0, 1, 0, 0
UNION ALL SELECT "element-z", 1, 0, 1, 0, 0
UNION ALL SELECT "element-p", 0, 0, 1, 0, 0
UNION ALL SELECT "element-q", 1, 0, 0, 1, 0
UNION ALL SELECT "element-r", 0, 1, 0, 1, 1
考虑以下解决方案——显然,您需要确保在 matrix CTE 中提供准确的数据,
并且还需要相应地调整 buckets_elements CTE,使其涵盖矩阵中的所有桶。
其余的 CTE 和最终查询会为您完成剩下的工作!
-- Brute-force search for the bucket combination that covers the most
-- distinct elements using the fewest buckets. Every non-empty subset of
-- buckets is enumerated (2^n - 1 subsets for n buckets), so this is only
-- practical for a small number of bucket columns.
WITH matrix AS (
  -- Input: one row per element, one 0/1 flag column per bucket.
  SELECT "element-x" AS element, 1 AS bucketa, 1 AS bucketb, 1 AS bucketc, 0 AS bucketd, 0 AS buckete
  UNION ALL SELECT "element-y", 0, 0, 1, 0, 0
  UNION ALL SELECT "element-z", 1, 0, 1, 0, 0
  UNION ALL SELECT "element-p", 0, 0, 1, 0, 0
  UNION ALL SELECT "element-q", 1, 0, 0, 1, 0
  UNION ALL SELECT "element-r", 0, 1, 0, 1, 1
),
buckets_elements AS (
  -- Single row: per bucket, the array of elements that fit into it.
  -- Must list every bucket column of matrix, in matrix column order,
  -- because buckets are later addressed by position.
  -- NOTE(review): the ranking step reads each struct's field as `a`, so this
  -- relies on the array literal taking its field name from the first struct.
  SELECT ARRAY[STRUCT(a), STRUCT(b), STRUCT(c), STRUCT(d), STRUCT(e)] AS buckets
  FROM (
    SELECT
      ARRAY_AGG(CASE WHEN bucketa = 1 THEN element END IGNORE NULLS) AS a,
      ARRAY_AGG(CASE WHEN bucketb = 1 THEN element END IGNORE NULLS) AS b,
      ARRAY_AGG(CASE WHEN bucketc = 1 THEN element END IGNORE NULLS) AS c,
      ARRAY_AGG(CASE WHEN bucketd = 1 THEN element END IGNORE NULLS) AS d,
      ARRAY_AGG(CASE WHEN buckete = 1 THEN element END IGNORE NULLS) AS e
    FROM matrix
  )
),
columns_names AS (
  -- Bucket column names, pulled from the JSON keys of one matrix row
  -- (the element column is excluded; the flag values are unquoted numbers,
  -- so only the keys match the quoted-string pattern).
  SELECT
    REGEXP_EXTRACT_ALL(
      TO_JSON_STRING((SELECT AS STRUCT * EXCEPT (element) FROM UNNEST([t]))),
      r'"([^"]+)"'
    ) AS cols
  FROM matrix AS t
  LIMIT 1
),
columns_index AS (
  -- Zero-based positions of the bucket columns: [0, 1, ..., n - 1].
  SELECT GENERATE_ARRAY(0, ARRAY_LENGTH(cols) - 1) AS bucket_positions
  FROM columns_names
),
buckets_combinations AS (
  -- One row per non-empty subset of bucket positions. Each integer mask in
  -- 1 .. 2^n - 1 selects the positions whose bit is set.
  SELECT
    (
      SELECT
        ARRAY_AGG(
          CASE WHEN (mask & (1 << bit)) <> 0 THEN bucket_positions[OFFSET(bit)] END
          IGNORE NULLS
        )
      FROM UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(bucket_positions) - 1)) AS bit
    ) AS combo
  FROM columns_index
  CROSS JOIN UNNEST(
    GENERATE_ARRAY(1, CAST(POWER(2, ARRAY_LENGTH(bucket_positions)) - 1 AS INT64))
  ) AS mask
),
combo_buckets AS (
  -- For each combination, collect the element arrays of its buckets.
  -- Grouping by the formatted row keeps one group per distinct combination.
  SELECT
    ANY_VALUE(bc).combo,
    ARRAY_AGG(buckets[OFFSET(bucket_pos)]) AS chosen
  FROM buckets_combinations AS bc
  CROSS JOIN UNNEST(bc.combo) AS bucket_pos
  CROSS JOIN buckets_elements AS be
  GROUP BY FORMAT('%t', bc)
),
ranked_combos AS (
  -- Best combination first: most distinct elements covered, then fewest buckets.
  SELECT
    combo,
    RANK() OVER (
      ORDER BY
        (
          SELECT COUNT(DISTINCT member)
          FROM UNNEST(chosen) AS bucket
          CROSS JOIN UNNEST(bucket.a) AS member
        ) DESC,
        ARRAY_LENGTH(combo)
    ) AS coverage_rank
  FROM combo_buckets
)
-- Translate winning positions back to bucket column names. RANK() keeps
-- ties, so several equally good combinations may be returned.
SELECT
  ARRAY(
    SELECT cols[OFFSET(bucket_pos)]
    FROM columns_names
    CROSS JOIN UNNEST(combo) AS bucket_pos
  ) AS winners
FROM ranked_combos
WHERE coverage_rank = 1
输出结果如下:winners 数组包含 bucketc 和 bucketd。