group_concat 函数删除重复项
group_concat function removing duplicates
在 BigQuery 中创建了以下查询:
-- Legacy SQL: for each (date, userId) sum the visits and concatenate the
-- device category of every inner row into one comma-separated string.
SELECT
date,
userId,
SUM(totals.visits) totalvisits,
-- GROUP_CONCAT does not de-duplicate: a device category that occurs in
-- several inner rows is repeated in the output string.
GROUP_CONCAT(device.deviceCategory) sequentialdevice
FROM (
-- Inner query: one row per (date, visitStartTime, userId, visits, deviceCategory).
SELECT
date,
visitStartTime,
customDimensions.value userId,
totals.visits,
device.deviceCategory
FROM
TABLE_DATE_RANGE([164345793.ga_sessions_], TIMESTAMP('20171127'), CURRENT_TIMESTAMP())
WHERE
-- Keep only custom dimension index 1 whose value contains "hip|".
customDimensions.index = 1
AND customDimensions.value CONTAINS "hip|"
GROUP BY
date,
visitStartTime,
userId,
totals.visits,
device.deviceCategory
HAVING
-- Restricts the result to a single user id.
userId="hip|7e4fbce9-bbfb-4677-aab0-dcd02851fdb4"
ORDER BY
-- NOTE(review): the outer GROUP_CONCAT carries no ordering of its own;
-- presumably this sort is what produces the session order in the
-- concatenated string -- confirm that this is guaranteed.
date ASC,
visitStartTime ASC)
GROUP BY
date,
userId
作为临时措施,我正在使用 HAVING 子句对其进行测试(在生产环境中会将其删除)。查询输出如下:
这一切都很好并且按预期工作,以适当的顺序输出设备(tablet,tablet,tablet,mobile,desktop)。但是,我想从中删除重复项,使结果变为 "tablet,mobile,desktop"
我尝试使用 Unique() 函数,这会删除重复项,但是不会保留顺序,因此输出变为 "desktop,mobile,tablet"
如有任何帮助,我们将不胜感激!
更新
我将查询更新为标准 SQL,现在我在使用 string_agg() 函数时遇到另一个问题:
-- Standard SQL variant: for each (date, userId, totalsvisits) build a
-- de-duplicated, ordered list of device categories.
-- NOTE: as written this statement is rejected by BigQuery. STRING_AGG combines
-- DISTINCT with an ORDER BY on columns (date, vstime) that are not arguments
-- of the function; the error text is quoted right after this query.
SELECT
date,
userId,
totalsvisits,
STRING_AGG(DISTINCT devicecategory ORDER BY date ASC, vstime ASC) deviceAgg
FROM (
-- Inner query: one row per (date, visitStartTime, userId, visits, deviceCategory).
SELECT
date,
visitStartTime vstime,
cd.value userId,
totals.visits totalsvisits,
device.deviceCategory devicecategory
FROM
`12314124123123.ga_sessions_*`,
UNNEST(customDimensions) AS cd
WHERE
-- Keep only custom dimension index 1 with a non-NULL value.
cd.index=1
AND cd.value IS NOT NULL
GROUP BY
date,
visitStartTime,
userId,
totals.visits,
device.deviceCategory
HAVING
-- Restricts the result to a single user id.
userId="hip|7e4fbce9-bbfb-4677-aab0-dcd02851fdb4"
ORDER BY
date ASC,
visitStartTime ASC)
GROUP BY
date,
userId,
totalsvisits
返回的错误是"An aggregate function that has both DISTINCT and ORDER BY arguments can only ORDER BY columns that are arguments to the function"
显然,如果我们从 string_agg 中删除 distinct 或 order by 子句,这是有效的,但我们需要这两个操作。
对于更新后的问题,以下查询会产生相同的错误:
-- Minimal reproduction of the DISTINCT + ORDER BY restriction.
-- This statement is expected to FAIL: STRING_AGG uses DISTINCT together with
-- an ORDER BY on `c`, which is not an argument of the function.
SELECT
  age_midpoint,
  STRING_AGG(DISTINCT country ORDER BY c DESC)
FROM (
  -- Row count per (country, age_midpoint).
  SELECT
    country,
    age_midpoint,
    COUNT(*) AS c
  FROM `fh-bigquery.stackoverflow.survey_results_2016`
  WHERE age_midpoint IS NOT NULL
    AND country LIKE '%u%'
  GROUP BY country, age_midpoint
)
GROUP BY age_midpoint
ORDER BY age_midpoint
这个限制是有道理的,因为一旦执行了 DISTINCT,你就无法再看到想要用来排序的那个变量了。
试试这个:
#standardSQL
-- Workaround for "DISTINCT + ORDER BY on a non-argument column":
-- collect (country, c) pairs into an array per age_midpoint, then
-- de-duplicate and sort them inside an ARRAY subquery before joining
-- the countries into one comma-separated string.
SELECT
  age_midpoint,
  ARRAY_TO_STRING(ARRAY(
    SELECT country
    FROM (
      -- GROUP BY removes duplicate (country, c) pairs.
      SELECT country, c
      FROM UNNEST(arr)
      GROUP BY country, c
    )
    -- `country` is a tie-breaker so equal counts sort deterministically.
    ORDER BY c DESC, country
  ), ',')
FROM (
  SELECT
    age_midpoint,
    ARRAY_AGG(STRUCT(country, c)) AS arr
  FROM (
    -- Row count per (country, age_midpoint).
    SELECT
      country,
      age_midpoint,
      COUNT(*) AS c
    FROM `fh-bigquery.stackoverflow.survey_results_2016`
    WHERE age_midpoint IS NOT NULL
      AND country LIKE '%u%'
    GROUP BY country, age_midpoint
  )
  GROUP BY age_midpoint
)
ORDER BY age_midpoint
LIMIT 1000
(参见https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#creating-arrays-from-subqueries)
感谢 Felipe,这是完成的查询!
-- Per (date, value): total visits, the distinct per-session device strings
-- joined with commas, and the number of distinct device strings.
-- `value` is the custom dimension (index 1) value, filtered to the "hip|" prefix.
SELECT
  date,
  value,
  SUM(visits) AS visits,
  -- NOTE(review): STRING_AGG(DISTINCT ...) has no ORDER BY here, so the order
  -- of the concatenated device strings is not guaranteed -- confirm that the
  -- output order is acceptable for the intended "sequential device" use.
  STRING_AGG(DISTINCT seqdevice) AS seqdevice,
  COUNT(DISTINCT seqdevice) AS countseqdevice
FROM (
  -- One row per (date, visitStartTime, value, visits) with its device
  -- categories joined into a string, most frequent first.
  SELECT
    date,
    value,
    visits,
    ARRAY_TO_STRING(ARRAY(
      SELECT deviceCategory
      FROM (
        -- GROUP BY removes duplicate (deviceCategory, c) pairs.
        SELECT deviceCategory, c
        FROM UNNEST(arr)
        GROUP BY deviceCategory, c
      )
      ORDER BY c DESC
    ), ',') AS seqdevice
  FROM (
    SELECT
      date,
      visitStartTime,
      value,
      visits,
      ARRAY_AGG(STRUCT(deviceCategory, c)) AS arr
    FROM (
      -- Row count per (date, visitStartTime, value, visits, deviceCategory).
      SELECT
        date,
        visitStartTime,
        cd.value AS value,
        totals.visits AS visits,
        device.deviceCategory AS deviceCategory,
        COUNT(*) AS c
      FROM `xxxxxxxxxx`, UNNEST(customDimensions) AS cd
      WHERE cd.index = 1
        AND STARTS_WITH(cd.value, "hip|")
      GROUP BY date, visitStartTime, value, visits, deviceCategory
    )
    GROUP BY date, visitStartTime, value, visits
  )
)
-- Single-user filter applied as a row filter (WHERE) so it runs before the
-- aggregation; `value` is a grouping key, so the result is the same as
-- filtering the groups afterwards.
WHERE value = "hip|7e4fbce9-bbfb-4677-aab0-dcd02851fdb4"
GROUP BY date, value
ORDER BY countseqdevice DESC
在 BigQuery 中创建了以下查询:
-- Legacy SQL: for each (date, userId) sum the visits and concatenate the
-- device category of every inner row into one comma-separated string.
SELECT
date,
userId,
SUM(totals.visits) totalvisits,
-- GROUP_CONCAT does not de-duplicate: a device category that occurs in
-- several inner rows is repeated in the output string.
GROUP_CONCAT(device.deviceCategory) sequentialdevice
FROM (
-- Inner query: one row per (date, visitStartTime, userId, visits, deviceCategory).
SELECT
date,
visitStartTime,
customDimensions.value userId,
totals.visits,
device.deviceCategory
FROM
TABLE_DATE_RANGE([164345793.ga_sessions_], TIMESTAMP('20171127'), CURRENT_TIMESTAMP())
WHERE
-- Keep only custom dimension index 1 whose value contains "hip|".
customDimensions.index = 1
AND customDimensions.value CONTAINS "hip|"
GROUP BY
date,
visitStartTime,
userId,
totals.visits,
device.deviceCategory
HAVING
-- Restricts the result to a single user id.
userId="hip|7e4fbce9-bbfb-4677-aab0-dcd02851fdb4"
ORDER BY
-- NOTE(review): the outer GROUP_CONCAT carries no ordering of its own;
-- presumably this sort is what produces the session order in the
-- concatenated string -- confirm that this is guaranteed.
date ASC,
visitStartTime ASC)
GROUP BY
date,
userId
作为临时措施,我正在使用 HAVING 子句对其进行测试(在生产环境中会将其删除)。查询输出如下:
这一切都很好并且按预期工作,以适当的顺序输出设备(tablet,tablet,tablet,mobile,desktop)。但是,我想从中删除重复项,使结果变为 "tablet,mobile,desktop"
我尝试使用 Unique() 函数,这会删除重复项,但是不会保留顺序,因此输出变为 "desktop,mobile,tablet"
如有任何帮助,我们将不胜感激!
更新
我将查询更新为标准 SQL,现在我在使用 string_agg() 函数时遇到另一个问题:
-- Standard SQL variant: for each (date, userId, totalsvisits) build a
-- de-duplicated, ordered list of device categories.
-- NOTE: as written this statement is rejected by BigQuery. STRING_AGG combines
-- DISTINCT with an ORDER BY on columns (date, vstime) that are not arguments
-- of the function; the error text is quoted right after this query.
SELECT
date,
userId,
totalsvisits,
STRING_AGG(DISTINCT devicecategory ORDER BY date ASC, vstime ASC) deviceAgg
FROM (
-- Inner query: one row per (date, visitStartTime, userId, visits, deviceCategory).
SELECT
date,
visitStartTime vstime,
cd.value userId,
totals.visits totalsvisits,
device.deviceCategory devicecategory
FROM
`12314124123123.ga_sessions_*`,
UNNEST(customDimensions) AS cd
WHERE
-- Keep only custom dimension index 1 with a non-NULL value.
cd.index=1
AND cd.value IS NOT NULL
GROUP BY
date,
visitStartTime,
userId,
totals.visits,
device.deviceCategory
HAVING
-- Restricts the result to a single user id.
userId="hip|7e4fbce9-bbfb-4677-aab0-dcd02851fdb4"
ORDER BY
date ASC,
visitStartTime ASC)
GROUP BY
date,
userId,
totalsvisits
返回的错误是"An aggregate function that has both DISTINCT and ORDER BY arguments can only ORDER BY columns that are arguments to the function"
显然,如果我们从 string_agg 中删除 distinct 或 order by 子句,这是有效的,但我们需要这两个操作。
对于更新后的问题,以下查询会产生相同的错误:
-- Minimal reproduction of the DISTINCT + ORDER BY restriction.
-- This statement is expected to FAIL: STRING_AGG uses DISTINCT together with
-- an ORDER BY on `c`, which is not an argument of the function.
SELECT
  age_midpoint,
  STRING_AGG(DISTINCT country ORDER BY c DESC)
FROM (
  -- Row count per (country, age_midpoint).
  SELECT
    country,
    age_midpoint,
    COUNT(*) AS c
  FROM `fh-bigquery.stackoverflow.survey_results_2016`
  WHERE age_midpoint IS NOT NULL
    AND country LIKE '%u%'
  GROUP BY country, age_midpoint
)
GROUP BY age_midpoint
ORDER BY age_midpoint
这个限制是有道理的,因为一旦执行了 DISTINCT,你就无法再看到想要用来排序的那个变量了。
试试这个:
#standardSQL
-- Workaround for "DISTINCT + ORDER BY on a non-argument column":
-- collect (country, c) pairs into an array per age_midpoint, then
-- de-duplicate and sort them inside an ARRAY subquery before joining
-- the countries into one comma-separated string.
SELECT
  age_midpoint,
  ARRAY_TO_STRING(ARRAY(
    SELECT country
    FROM (
      -- GROUP BY removes duplicate (country, c) pairs.
      SELECT country, c
      FROM UNNEST(arr)
      GROUP BY country, c
    )
    -- `country` is a tie-breaker so equal counts sort deterministically.
    ORDER BY c DESC, country
  ), ',')
FROM (
  SELECT
    age_midpoint,
    ARRAY_AGG(STRUCT(country, c)) AS arr
  FROM (
    -- Row count per (country, age_midpoint).
    SELECT
      country,
      age_midpoint,
      COUNT(*) AS c
    FROM `fh-bigquery.stackoverflow.survey_results_2016`
    WHERE age_midpoint IS NOT NULL
      AND country LIKE '%u%'
    GROUP BY country, age_midpoint
  )
  GROUP BY age_midpoint
)
ORDER BY age_midpoint
LIMIT 1000
(参见https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#creating-arrays-from-subqueries)
感谢 Felipe,这是完成的查询!
-- Per (date, value): total visits, the distinct per-session device strings
-- joined with commas, and the number of distinct device strings.
-- `value` is the custom dimension (index 1) value, filtered to the "hip|" prefix.
SELECT
  date,
  value,
  SUM(visits) AS visits,
  -- NOTE(review): STRING_AGG(DISTINCT ...) has no ORDER BY here, so the order
  -- of the concatenated device strings is not guaranteed -- confirm that the
  -- output order is acceptable for the intended "sequential device" use.
  STRING_AGG(DISTINCT seqdevice) AS seqdevice,
  COUNT(DISTINCT seqdevice) AS countseqdevice
FROM (
  -- One row per (date, visitStartTime, value, visits) with its device
  -- categories joined into a string, most frequent first.
  SELECT
    date,
    value,
    visits,
    ARRAY_TO_STRING(ARRAY(
      SELECT deviceCategory
      FROM (
        -- GROUP BY removes duplicate (deviceCategory, c) pairs.
        SELECT deviceCategory, c
        FROM UNNEST(arr)
        GROUP BY deviceCategory, c
      )
      ORDER BY c DESC
    ), ',') AS seqdevice
  FROM (
    SELECT
      date,
      visitStartTime,
      value,
      visits,
      ARRAY_AGG(STRUCT(deviceCategory, c)) AS arr
    FROM (
      -- Row count per (date, visitStartTime, value, visits, deviceCategory).
      SELECT
        date,
        visitStartTime,
        cd.value AS value,
        totals.visits AS visits,
        device.deviceCategory AS deviceCategory,
        COUNT(*) AS c
      FROM `xxxxxxxxxx`, UNNEST(customDimensions) AS cd
      WHERE cd.index = 1
        AND STARTS_WITH(cd.value, "hip|")
      GROUP BY date, visitStartTime, value, visits, deviceCategory
    )
    GROUP BY date, visitStartTime, value, visits
  )
)
-- Single-user filter applied as a row filter (WHERE) so it runs before the
-- aggregation; `value` is a grouping key, so the result is the same as
-- filtering the groups afterwards.
WHERE value = "hip|7e4fbce9-bbfb-4677-aab0-dcd02851fdb4"
GROUP BY date, value
ORDER BY countseqdevice DESC