在 Bigquery 中取消嵌套多个数组并再次聚合
Unnest multiple arrays in Bigquery and aggregate again
我正在尝试取消嵌套 Bigquery 中的多个嵌套数组,过滤它们并将新数组放回一起。我的问题是,我最终得到了重复的值。
示例数据:
图像有两个示例数据行,数组 "vendor" 包含两个数组 "topic" 和 "categories"
我想根据 vendor.topic.score >= 0.8、vendor.categories.score >= 0.8 进行过滤,同时删除 vendor.topic.position 列。
结果应该是这样的:
首先,我尝试用每个数组的多个 unnest 来解决它,但这在新创建的数组中给了我重复的值:
-- First attempt: unnest both nested arrays in a single FROM clause.
-- The two UNNESTs are comma (cross) joined, so every source row is expanded
-- into one row per (topic, category) pair. After filtering, each surviving
-- topic is repeated once per surviving category (and vice versa), which is
-- why both aggregated arrays contain duplicated elements.
SELECT
  id,
  ARRAY_AGG(STRUCT(vendor_topics.label AS topics_label,
                   vendor_topics.score AS topics_score)),
  ARRAY_AGG(STRUCT(vendor_categories.label AS category_label,
                   vendor_categories.score AS category_score))
FROM
  `source_table`,
  UNNEST(vendor.topics) vendor_topics,
  UNNEST(vendor.categories) vendor_categories
WHERE
  -- A dangling leading AND directly after WHERE is a syntax error; removed.
  vendor_categories.score >= 0.8
  AND vendor_topics.score >= 0.8
GROUP BY
  id
LIMIT
  10
接下来我尝试使用子查询,结果是 "API limit exceeded: Unable to return a row that exceeds the API limits. To retrieve the row, export the table."
-- Second attempt: one scalar subquery per nested array.
-- Each subquery lists `source_table` again in its own FROM clause, so the
-- `vendor` it unnests belongs to that inner scan, not to the outer row.
-- The ARRAY_AGG therefore collects the matching elements of every row in the
-- table into one array, and that same array is returned for each id.
-- NOTE(review): that table-wide array is presumably what makes the row exceed
-- the API limits reported for this query - confirm.
SELECT
id,
(
SELECT
ARRAY_AGG(STRUCT(vendor_topics.label AS topics_label,
vendor_topics.score AS topics_score))
FROM
`source_table` articles,
UNNEST(vendor.topics) vendor_topics
WHERE
vendor_topics.score >= 0.8),
(
SELECT
ARRAY_AGG(STRUCT(vendor_categories.label AS category_label,
vendor_categories.score AS category_score))
FROM
`source_table`,
UNNEST(vendor.categories) vendor_categories
WHERE
vendor_categories.score >= 0.8)
FROM
`source_table`
GROUP BY
1
现在我没主意了,希望有人能帮我解决这个问题。
我也通过两种方式构建了您的样本数据,因为不确定 vendor 是数组还是不是数组。您可能会因此而出现并发症。
第一个示例供应商是数组
#standardSQL
-- Example 1: `vendor` is an ARRAY of STRUCT<topic, categories>, where both
-- fields are themselves arrays of structs.
WITH `yourTable` AS (
  -- Inline sample rows standing in for the real table.
  SELECT
    111 AS id,
    ARRAY(
      SELECT
        STRUCT(
          ARRAY(
            SELECT STRUCT('A' AS label, 0.1 AS score, 2 AS position)
            UNION ALL SELECT STRUCT('B' AS label, 0.9 AS score, 5 AS position)
            UNION ALL SELECT STRUCT('C' AS label, 0.9 AS score, 8 AS position)
          ) AS topic,
          ARRAY(
            SELECT STRUCT('Cat1' AS label, 0.8 AS score)
            UNION ALL SELECT STRUCT('Cat2' AS label, 0.3 AS score)
          ) AS categories
        )
    ) AS vendor
  UNION ALL
  SELECT
    222 AS id,
    ARRAY(
      SELECT
        STRUCT(
          ARRAY(
            SELECT STRUCT('X' AS label, 0.3 AS score, 2 AS position)
            UNION ALL SELECT STRUCT('Y' AS label, 0.9 AS score, 3 AS position)
          ) AS topic,
          ARRAY(
            SELECT STRUCT('Cat33' AS label, 0.9 AS score)
            UNION ALL SELECT STRUCT('Cat99' AS label, 0.4 AS score)
            UNION ALL SELECT STRUCT('Cat44' AS label, 0.85 AS score)
          ) AS categories
        )
    ) AS vendor
)
-- Rebuild `vendor` as `vendor2`, one output struct per input vendor element.
-- Each nested array is filtered by its own correlated subquery, so topics and
-- categories are never cross joined and no duplicates can appear.
SELECT
  id,
  ARRAY(
    SELECT AS STRUCT
      -- Project only label and score: this is what drops `position`.
      ARRAY(
        SELECT AS STRUCT t.label, t.score
        FROM UNNEST(v.topic) AS t
        WHERE t.score >= 0.8
      ) AS topic,
      ARRAY(
        SELECT AS STRUCT c.label, c.score
        FROM UNNEST(v.categories) AS c
        WHERE c.score >= 0.8
      ) AS categories
    FROM UNNEST(vendor) AS v
  ) AS vendor2
  -- ARRAY(subquery) yields an empty array when no element passes the filter,
  -- whereas ARRAY_AGG over zero rows would yield NULL.
FROM `yourTable`
这个returns:
基本上您需要阅读的方式是:
- 您正在选择包含 id
和 vendor2
的行
- 本质上 vendor2
是一个数组(第二个例子跳过这个)
- 然后你需要两个键作为结构 topic
和 categories
- topic
或 categories
是一个结构数组。
第二个例子(供应商不是数组):
#standardSQL
-- Example 2: `vendor` is a single STRUCT<topic, categories> (not an array),
-- where both fields are arrays of structs.
WITH `yourTable` AS (
  -- Inline sample rows standing in for the real table.
  SELECT
    111 AS id,
    STRUCT(
      ARRAY(
        SELECT STRUCT('A' AS label, 0.1 AS score, 2 AS position)
        UNION ALL SELECT STRUCT('B' AS label, 0.9 AS score, 5 AS position)
        UNION ALL SELECT STRUCT('C' AS label, 0.9 AS score, 8 AS position)
      ) AS topic,
      ARRAY(
        SELECT STRUCT('Cat1' AS label, 0.8 AS score)
        UNION ALL SELECT STRUCT('Cat2' AS label, 0.3 AS score)
      ) AS categories
    ) AS vendor
  UNION ALL
  SELECT
    222 AS id,
    STRUCT(
      ARRAY(
        SELECT STRUCT('X' AS label, 0.3 AS score, 2 AS position)
        UNION ALL SELECT STRUCT('Y' AS label, 0.9 AS score, 3 AS position)
      ) AS topic,
      ARRAY(
        SELECT STRUCT('Cat33' AS label, 0.9 AS score)
        UNION ALL SELECT STRUCT('Cat99' AS label, 0.4 AS score)
        UNION ALL SELECT STRUCT('Cat44' AS label, 0.85 AS score)
      ) AS categories
    ) AS vendor
)
-- Rebuild `vendor` as `vendor2`. Each nested array is filtered by its own
-- correlated subquery, so topics and categories are never cross joined.
SELECT
  id,
  STRUCT(
    -- Project only label and score: this is what drops `position`.
    ARRAY(
      SELECT AS STRUCT t.label, t.score
      FROM UNNEST(vendor.topic) AS t
      WHERE t.score >= 0.8
    ) AS topic,
    ARRAY(
      SELECT AS STRUCT c.label, c.score
      FROM UNNEST(vendor.categories) AS c
      WHERE c.score >= 0.8
    ) AS categories
  ) AS vendor2
  -- ARRAY(subquery) yields an empty array when no element passes the filter,
  -- whereas ARRAY_AGG over zero rows would yield NULL.
FROM `yourTable`
我正在尝试取消嵌套 Bigquery 中的多个嵌套数组,过滤它们并将新数组放回一起。我的问题是,我最终得到了重复的值。
示例数据:
图像有两个示例数据行,数组 "vendor" 包含两个数组 "topic" 和 "categories"
我想根据 vendor.topic.score >= 0.8、vendor.categories.score >= 0.8 进行过滤,同时删除 vendor.topic.position 列。
结果应该是这样的:
首先,我尝试用每个数组的多个 unnest 来解决它,但这在新创建的数组中给了我重复的值:
-- First attempt: unnest both nested arrays in a single FROM clause.
-- The two UNNESTs are comma (cross) joined, so every source row is expanded
-- into one row per (topic, category) pair. After filtering, each surviving
-- topic is repeated once per surviving category (and vice versa), which is
-- why both aggregated arrays contain duplicated elements.
SELECT
  id,
  ARRAY_AGG(STRUCT(vendor_topics.label AS topics_label,
                   vendor_topics.score AS topics_score)),
  ARRAY_AGG(STRUCT(vendor_categories.label AS category_label,
                   vendor_categories.score AS category_score))
FROM
  `source_table`,
  UNNEST(vendor.topics) vendor_topics,
  UNNEST(vendor.categories) vendor_categories
WHERE
  -- A dangling leading AND directly after WHERE is a syntax error; removed.
  vendor_categories.score >= 0.8
  AND vendor_topics.score >= 0.8
GROUP BY
  id
LIMIT
  10
接下来我尝试使用子查询,结果是 "API limit exceeded: Unable to return a row that exceeds the API limits. To retrieve the row, export the table."
-- Second attempt: one scalar subquery per nested array.
-- Each subquery lists `source_table` again in its own FROM clause, so the
-- `vendor` it unnests belongs to that inner scan, not to the outer row.
-- The ARRAY_AGG therefore collects the matching elements of every row in the
-- table into one array, and that same array is returned for each id.
-- NOTE(review): that table-wide array is presumably what makes the row exceed
-- the API limits reported for this query - confirm.
SELECT
id,
(
SELECT
ARRAY_AGG(STRUCT(vendor_topics.label AS topics_label,
vendor_topics.score AS topics_score))
FROM
`source_table` articles,
UNNEST(vendor.topics) vendor_topics
WHERE
vendor_topics.score >= 0.8),
(
SELECT
ARRAY_AGG(STRUCT(vendor_categories.label AS category_label,
vendor_categories.score AS category_score))
FROM
`source_table`,
UNNEST(vendor.categories) vendor_categories
WHERE
vendor_categories.score >= 0.8)
FROM
`source_table`
GROUP BY
1
现在我没主意了,希望有人能帮我解决这个问题。
我也通过两种方式构建了您的样本数据,因为不确定 vendor 是数组还是不是数组。您可能会因此而出现并发症。
第一个示例供应商是数组
#standardSQL
-- Example 1: `vendor` is an ARRAY of STRUCT<topic, categories>, where both
-- fields are themselves arrays of structs.
WITH `yourTable` AS (
  -- Inline sample rows standing in for the real table.
  SELECT
    111 AS id,
    ARRAY(
      SELECT
        STRUCT(
          ARRAY(
            SELECT STRUCT('A' AS label, 0.1 AS score, 2 AS position)
            UNION ALL SELECT STRUCT('B' AS label, 0.9 AS score, 5 AS position)
            UNION ALL SELECT STRUCT('C' AS label, 0.9 AS score, 8 AS position)
          ) AS topic,
          ARRAY(
            SELECT STRUCT('Cat1' AS label, 0.8 AS score)
            UNION ALL SELECT STRUCT('Cat2' AS label, 0.3 AS score)
          ) AS categories
        )
    ) AS vendor
  UNION ALL
  SELECT
    222 AS id,
    ARRAY(
      SELECT
        STRUCT(
          ARRAY(
            SELECT STRUCT('X' AS label, 0.3 AS score, 2 AS position)
            UNION ALL SELECT STRUCT('Y' AS label, 0.9 AS score, 3 AS position)
          ) AS topic,
          ARRAY(
            SELECT STRUCT('Cat33' AS label, 0.9 AS score)
            UNION ALL SELECT STRUCT('Cat99' AS label, 0.4 AS score)
            UNION ALL SELECT STRUCT('Cat44' AS label, 0.85 AS score)
          ) AS categories
        )
    ) AS vendor
)
-- Rebuild `vendor` as `vendor2`, one output struct per input vendor element.
-- Each nested array is filtered by its own correlated subquery, so topics and
-- categories are never cross joined and no duplicates can appear.
SELECT
  id,
  ARRAY(
    SELECT AS STRUCT
      -- Project only label and score: this is what drops `position`.
      ARRAY(
        SELECT AS STRUCT t.label, t.score
        FROM UNNEST(v.topic) AS t
        WHERE t.score >= 0.8
      ) AS topic,
      ARRAY(
        SELECT AS STRUCT c.label, c.score
        FROM UNNEST(v.categories) AS c
        WHERE c.score >= 0.8
      ) AS categories
    FROM UNNEST(vendor) AS v
  ) AS vendor2
  -- ARRAY(subquery) yields an empty array when no element passes the filter,
  -- whereas ARRAY_AGG over zero rows would yield NULL.
FROM `yourTable`
这个returns:
基本上您需要阅读的方式是:
- 您正在选择包含 id
和 vendor2
的行
- 本质上 vendor2
是一个数组(第二个例子跳过这个)
- 然后你需要两个键作为结构 topic
和 categories
- topic
或 categories
是一个结构数组。
第二个例子(供应商不是数组):
#standardSQL
-- Example 2: `vendor` is a single STRUCT<topic, categories> (not an array),
-- where both fields are arrays of structs.
WITH `yourTable` AS (
  -- Inline sample rows standing in for the real table.
  SELECT
    111 AS id,
    STRUCT(
      ARRAY(
        SELECT STRUCT('A' AS label, 0.1 AS score, 2 AS position)
        UNION ALL SELECT STRUCT('B' AS label, 0.9 AS score, 5 AS position)
        UNION ALL SELECT STRUCT('C' AS label, 0.9 AS score, 8 AS position)
      ) AS topic,
      ARRAY(
        SELECT STRUCT('Cat1' AS label, 0.8 AS score)
        UNION ALL SELECT STRUCT('Cat2' AS label, 0.3 AS score)
      ) AS categories
    ) AS vendor
  UNION ALL
  SELECT
    222 AS id,
    STRUCT(
      ARRAY(
        SELECT STRUCT('X' AS label, 0.3 AS score, 2 AS position)
        UNION ALL SELECT STRUCT('Y' AS label, 0.9 AS score, 3 AS position)
      ) AS topic,
      ARRAY(
        SELECT STRUCT('Cat33' AS label, 0.9 AS score)
        UNION ALL SELECT STRUCT('Cat99' AS label, 0.4 AS score)
        UNION ALL SELECT STRUCT('Cat44' AS label, 0.85 AS score)
      ) AS categories
    ) AS vendor
)
-- Rebuild `vendor` as `vendor2`. Each nested array is filtered by its own
-- correlated subquery, so topics and categories are never cross joined.
SELECT
  id,
  STRUCT(
    -- Project only label and score: this is what drops `position`.
    ARRAY(
      SELECT AS STRUCT t.label, t.score
      FROM UNNEST(vendor.topic) AS t
      WHERE t.score >= 0.8
    ) AS topic,
    ARRAY(
      SELECT AS STRUCT c.label, c.score
      FROM UNNEST(vendor.categories) AS c
      WHERE c.score >= 0.8
    ) AS categories
  ) AS vendor2
  -- ARRAY(subquery) yields an empty array when no element passes the filter,
  -- whereas ARRAY_AGG over zero rows would yield NULL.
FROM `yourTable`