获取跨多个列的不同值的数量
Get amount of distinct values across multiple columns
我正在使用 BigQuery(#standardSQL),希望能够跨多个列统计唯一值的数量。
我想针对每个 visit_id 计算用户漏斗中不同值的数量,这些值存储在以下列中:first_pgroup、second、third、forth、fifth、sixth、seventh、eighth、ninth、tenth。
我有以下表:
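| visit_id | first_pgroup | second | third | forth | fifth | sixth | seventh | eighth | ninth | tenth |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | cloth | cloth | cloth | cloth | cloth | cloth | cloth | cloth | cloth | cloth |
| 2 | shoes | cloth | beauty | | | | | | | |
| 3 | beauty | | | | | | | | | |
| 4 | home&living | cloth | home&living | shoes | accessories | | | | | |
| 5 | shoes | shoes | shoes | shoes | shoes | shoes | | | | |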
我的目标是创建一个新列,给出每行中唯一值的数量。
期望的输出是:
这是我的查询:
> -- For every joined "view" event, collects the product group of that event and
> -- of the next nine events of the same visit into ten columns, then counts the
> -- distinct values per row.
> WITH
> -- config: one-row CTE holding the date range that filters both source tables.
> config AS (
> SELECT
> -- Time frame for the analysis; used as an inclusive BETWEEN range below.
> DATE "2022-01-01" AS start_date,
> DATE "2022-01-01" AS end_date ),
>
>
> -- PDP_table: distinct "view" events with a non-null pid inside the date range.
> -- product_group is the first '/'-separated segment of pgroup.
> PDP_table AS (
> SELECT
> DISTINCT
> request_id AS request_id_PDP,
> eventType,
> SAFE_CAST(pid AS STRING) AS pid,
> SPLIT(pgroup , '/')[SAFE_OFFSET(0)] as product_group
>
> FROM
> `project.table`,
> config
> WHERE
> DATE(PARTITIONTIME) BETWEEN config.start_date
> AND config.end_date
> AND pid IS NOT NULL
> and eventType = "view" ),
>
> -- table: timestamp, visit_id and request_id of rows inside the date range
> -- whose site is 'Live2' or 'App' and whose country is 'de'.
> table AS ( SELECT
> timestamp,
> visit_id,
> request_id, FROM
> `project.table.2`,
> config WHERE
> DATE(PARTITIONTIME) BETWEEN config.start_date
> AND config.end_date
> AND site IN ('Live2',
> 'App'
> )AND country = 'de'),
> -- raw: joins both CTEs on the request id. LEAD(product_group, k) reads the
> -- product group k rows ahead within the same visit_id (ordered by request_id),
> -- so each row carries its own group plus the following nine.
> -- Only the ten product-group columns are selected; visit_id is not part of the output.
> raw as ( SELECT
> cast((b.product_group) as string) AS first_pgroup,
> cast(LEAD( b.product_group, 1) OVER (PARTITION BY visit_id ORDER BY request_id) as string) AS second, cast( LEAD(
> b.product_group, 2) OVER (PARTITION BY visit_id ORDER BY request_id)as
> string) AS third,
> cast(LEAD( b.product_group, 3) OVER (PARTITION BY visit_id ORDER BY request_id)as string) AS forth,
> cast(LEAD( b.product_group, 4) OVER (PARTITION BY visit_id ORDER BY request_id) as string) AS fifth, cast( LEAD(
> b.product_group, 5) OVER (PARTITION BY visit_id ORDER BY request_id)
> as string) AS sixth, cast( LEAD( b.product_group, 6) OVER (PARTITION
> BY visit_id ORDER BY request_id) as string) AS seventh, cast( LEAD(
> b.product_group, 7) OVER (PARTITION BY visit_id ORDER BY request_id
> )as string) AS eighth, cast( LEAD( b.product_group, 8) OVER (PARTITION
> BY visit_id ORDER BY request_id) as string) AS ninth,
> cast(LEAD( b.product_group, 9) OVER (PARTITION BY visit_id ORDER BY request_id) as string) AS tenth FROM
> table a JOIN
> PDP_table b ON
> b.request_id_PDP = a.request_id
> )
> -- Final step: serialize the row to JSON, turn its values into an array and
> -- count the distinct entries that are not the string 'null'.
> -- NOTE(review): `values` is not defined inside this query and `raw` is given no
> -- alias `t` here; presumably a temp function and `from raw t` are intended - confirm.
> -- NOTE(review): raw exposes no id column, so the `- 1` has nothing to
> -- compensate for and may under-count by one - verify.
> select *, (
> select count(distinct category) - 1
> from unnest(values(replace(to_json_string(t), 'null', '"null"'))) category
> where category != 'null' ) as category_count from raw
考虑以下方法
-- values(input): takes one row serialized as a JSON object and returns the
-- object's values as an array, skipping JSON nulls.
-- Nulls are dropped inside the UDF instead of rewriting the JSON text with
-- replace(..., 'null', '"null"'): that textual replace also hit the substring
-- "null" inside real values and produced invalid JSON for JSON.parse.
create temp function values(input string) returns array<string> language js as """
return Object.values(JSON.parse(input)).filter(function (v) { return v !== null; });
""";
-- category_count: number of distinct non-null values across the columns of each row.
-- to_json_string(t) serializes every column of the row, so the array also
-- contains visit_id; the "- 1" presumably compensates for that single extra
-- value (assumes the id never equals a category name - confirm for your data).
select *, (
  select count(distinct category) - 1
  from unnest(values(to_json_string(t))) category
) as category_count
from your_table t
如果应用于问题中的示例数据,输出如下:
如果您只想使用纯 SQL 来解决这个问题,下面的方法非常简单,并且适用于多种数据库:
-- Portable variant without a UDF: unpivot the funnel columns into
-- (visit_id, group_value) rows, count the distinct values per visit, and attach
-- the count to the original rows.
-- Only seven columns are unpivoted; add one "union all" branch per extra column.
with main_table
as
(
  -- union all keeps duplicates on purpose; count(distinct ...) removes them below.
  select visit_id, first_group as group_value from table
  union all
  select visit_id, second_group from table
  union all
  select visit_id, third_group from table
  union all
  select visit_id, fourth_group from table
  union all
  select visit_id, fifth_group from table
  union all
  select visit_id, sixth_group from table
  union all
  select visit_id, seventh_group from table
),
distinct_counts
as
(
  -- Aggregate in its own step: selecting s.* next to count(distinct ...) without
  -- a group by is rejected by most engines. count(distinct ...) ignores NULLs.
  select visit_id, count(distinct group_value) as distint_count
  from main_table
  group by visit_id
)
select s.*, d.distint_count
from table s
left join distinct_counts d
  on d.visit_id = s.visit_id
以下方法基于 Mikhail 的回答:
-- cat_count: number of distinct product groups among the ten funnel columns of
-- each row of raw. The != 'null' filter drops the literal string 'null'; SQL
-- NULLs are dropped as well because the comparison is not true for them.
SELECT
    *,
    (
        SELECT COUNT(DISTINCT funnel_value)
        FROM UNNEST([
            first_pgroup, second, third, forth, fifth,
            sixth, seventh, eighth, ninth, tenth
        ]) AS funnel_value
        WHERE funnel_value != 'null'
    ) AS cat_count
FROM raw
我正在使用 BigQuery(#standardSQL),希望能够跨多个列统计唯一值的数量。
我想针对每个 visit_id 计算用户漏斗中不同值的数量,这些值存储在以下列中:first_pgroup、second、third、forth、fifth、sixth、seventh、eighth、ninth、tenth。
我有以下表:
visit_id | first_pgroup | second | third | forth | fifth | sixth | seventh | eighth | ninth | tenth |
---|---|---|---|---|---|---|---|---|---|---|
1 | cloth | cloth | cloth | cloth | cloth | cloth | cloth | cloth | cloth | cloth |
2 | shoes | cloth | beauty | | | | | | | |
3 | beauty | | | | | | | | | |
4 | home&living | cloth | home&living | shoes | accessories | | | | | |
5 | shoes | shoes | shoes | shoes | shoes | shoes | | | | |
我的目标是创建一个新列,给出每行中唯一值的数量。
期望的输出是:
这是我的查询:
> -- For every joined "view" event, collects the product group of that event and
> -- of the next nine events of the same visit into ten columns, then counts the
> -- distinct values per row.
> WITH
> -- config: one-row CTE holding the date range that filters both source tables.
> config AS (
> SELECT
> -- Time frame for the analysis; used as an inclusive BETWEEN range below.
> DATE "2022-01-01" AS start_date,
> DATE "2022-01-01" AS end_date ),
>
>
> -- PDP_table: distinct "view" events with a non-null pid inside the date range.
> -- product_group is the first '/'-separated segment of pgroup.
> PDP_table AS (
> SELECT
> DISTINCT
> request_id AS request_id_PDP,
> eventType,
> SAFE_CAST(pid AS STRING) AS pid,
> SPLIT(pgroup , '/')[SAFE_OFFSET(0)] as product_group
>
> FROM
> `project.table`,
> config
> WHERE
> DATE(PARTITIONTIME) BETWEEN config.start_date
> AND config.end_date
> AND pid IS NOT NULL
> and eventType = "view" ),
>
> -- table: timestamp, visit_id and request_id of rows inside the date range
> -- whose site is 'Live2' or 'App' and whose country is 'de'.
> table AS ( SELECT
> timestamp,
> visit_id,
> request_id, FROM
> `project.table.2`,
> config WHERE
> DATE(PARTITIONTIME) BETWEEN config.start_date
> AND config.end_date
> AND site IN ('Live2',
> 'App'
> )AND country = 'de'),
> -- raw: joins both CTEs on the request id. LEAD(product_group, k) reads the
> -- product group k rows ahead within the same visit_id (ordered by request_id),
> -- so each row carries its own group plus the following nine.
> -- Only the ten product-group columns are selected; visit_id is not part of the output.
> raw as ( SELECT
> cast((b.product_group) as string) AS first_pgroup,
> cast(LEAD( b.product_group, 1) OVER (PARTITION BY visit_id ORDER BY request_id) as string) AS second, cast( LEAD(
> b.product_group, 2) OVER (PARTITION BY visit_id ORDER BY request_id)as
> string) AS third,
> cast(LEAD( b.product_group, 3) OVER (PARTITION BY visit_id ORDER BY request_id)as string) AS forth,
> cast(LEAD( b.product_group, 4) OVER (PARTITION BY visit_id ORDER BY request_id) as string) AS fifth, cast( LEAD(
> b.product_group, 5) OVER (PARTITION BY visit_id ORDER BY request_id)
> as string) AS sixth, cast( LEAD( b.product_group, 6) OVER (PARTITION
> BY visit_id ORDER BY request_id) as string) AS seventh, cast( LEAD(
> b.product_group, 7) OVER (PARTITION BY visit_id ORDER BY request_id
> )as string) AS eighth, cast( LEAD( b.product_group, 8) OVER (PARTITION
> BY visit_id ORDER BY request_id) as string) AS ninth,
> cast(LEAD( b.product_group, 9) OVER (PARTITION BY visit_id ORDER BY request_id) as string) AS tenth FROM
> table a JOIN
> PDP_table b ON
> b.request_id_PDP = a.request_id
> )
> -- Final step: serialize the row to JSON, turn its values into an array and
> -- count the distinct entries that are not the string 'null'.
> -- NOTE(review): `values` is not defined inside this query and `raw` is given no
> -- alias `t` here; presumably a temp function and `from raw t` are intended - confirm.
> -- NOTE(review): raw exposes no id column, so the `- 1` has nothing to
> -- compensate for and may under-count by one - verify.
> select *, (
> select count(distinct category) - 1
> from unnest(values(replace(to_json_string(t), 'null', '"null"'))) category
> where category != 'null' ) as category_count from raw
考虑以下方法
-- values(input): takes one row serialized as a JSON object and returns the
-- object's values as an array, skipping JSON nulls.
-- Nulls are dropped inside the UDF instead of rewriting the JSON text with
-- replace(..., 'null', '"null"'): that textual replace also hit the substring
-- "null" inside real values and produced invalid JSON for JSON.parse.
create temp function values(input string) returns array<string> language js as """
return Object.values(JSON.parse(input)).filter(function (v) { return v !== null; });
""";
-- category_count: number of distinct non-null values across the columns of each row.
-- to_json_string(t) serializes every column of the row, so the array also
-- contains visit_id; the "- 1" presumably compensates for that single extra
-- value (assumes the id never equals a category name - confirm for your data).
select *, (
  select count(distinct category) - 1
  from unnest(values(to_json_string(t))) category
) as category_count
from your_table t
如果应用于问题中的示例数据,输出如下:
如果您只想使用纯 SQL 来解决这个问题,下面的方法非常简单,并且适用于多种数据库:
-- Portable variant without a UDF: unpivot the funnel columns into
-- (visit_id, group_value) rows, count the distinct values per visit, and attach
-- the count to the original rows.
-- Only seven columns are unpivoted; add one "union all" branch per extra column.
with main_table
as
(
  -- union all keeps duplicates on purpose; count(distinct ...) removes them below.
  select visit_id, first_group as group_value from table
  union all
  select visit_id, second_group from table
  union all
  select visit_id, third_group from table
  union all
  select visit_id, fourth_group from table
  union all
  select visit_id, fifth_group from table
  union all
  select visit_id, sixth_group from table
  union all
  select visit_id, seventh_group from table
),
distinct_counts
as
(
  -- Aggregate in its own step: selecting s.* next to count(distinct ...) without
  -- a group by is rejected by most engines. count(distinct ...) ignores NULLs.
  select visit_id, count(distinct group_value) as distint_count
  from main_table
  group by visit_id
)
select s.*, d.distint_count
from table s
left join distinct_counts d
  on d.visit_id = s.visit_id
以下方法基于 Mikhail 的回答:
-- cat_count: number of distinct product groups among the ten funnel columns of
-- each row of raw. The != 'null' filter drops the literal string 'null'; SQL
-- NULLs are dropped as well because the comparison is not true for them.
SELECT
    *,
    (
        SELECT COUNT(DISTINCT funnel_value)
        FROM UNNEST([
            first_pgroup, second, third, forth, fifth,
            sixth, seventh, eighth, ninth, tenth
        ]) AS funnel_value
        WHERE funnel_value != 'null'
    ) AS cat_count
FROM raw