在大查询中进行分层抽样的有效方法
efficient way to do stratified sampling in big query
我有一个名为 TableA 的 Google 大查询 table,它有大约 3M 条记录。有一个名为 DimA(维度 A)的列,它有 20 个值 - 1 到 20。DimA 的每个值的计数显示在在下面的 总计 列中汇总 table。我做了一些分析并确定了我应该从 DimA 的每个值中抽取多少随机样本,它显示在列 Sample 中。 DimA 的每个值抽取的样本百分比显示在列 DimA_value_perc 中。我知道如何使用 table 下面的代码通过蛮力进行采样。但是,随着 DimA 值数量的增加以及存在其他维度,此代码不可扩展。有没有更有效的分层抽样方法?谢谢
DimA
Total
Sample
DimA_value_perc
1
115,623
3,077
3%
2
108,203
3,943
4%
3
153,477
6,802
4%
4
232,252
12,426
5%
5
223,004
14,052
6%
6
242,386
17,589
7%
7
121,519
9,783
8%
8
371,342
34,026
9%
9
147,683
15,400
10%
10
281,101
32,775
12%
11
93,380
12,075
13%
12
181,293
25,675
14%
13
122,206
19,344
16%
14
140,559
25,141
18%
15
95,576
19,498
20%
16
94,319
21,969
23%
17
108,282
30,054
28%
18
94,920
33,228
35%
19
82,764
39,700
48%
20
28,417
23,442
82%
Grand Total
3,038,306
400,000
SELECT *
FROM tableA
where DimA = 1
order by rand()
limit 3077
union all
SELECT *
FROM tableA
where DimA = 2
order by rand()
limit 3943
etc
考虑以下方法
with samples as (
select 1 DimA, 115623 Total, 3077 Sample, 3 DimA_value_perc union all
select 2, 108203, 3943, 4 union all
select 3, 153477, 6802, 4 union all
select 4, 232252, 12426, 5 union all
select 5, 223004, 14052, 6 union all
select 6, 242386, 17589, 7 union all
select 7, 121519, 9783, 8 union all
select 8, 371342, 34026, 9 union all
select 9, 147683, 15400, 10 union all
select 10, 281101, 32775, 12 union all
select 11, 93380, 12075, 13 union all
select 12, 181293, 25675, 14 union all
select 13, 122206, 19344, 16 union all
select 14, 140559, 25141, 18 union all
select 15, 95576, 19498, 20 union all
select 16, 94319, 21969, 23 union all
select 17, 108282, 30054, 28 union all
select 18, 94920, 33228, 35 union all
select 19, 82764, 39700, 48 union all
select 20, 28417, 23442, 82
)
select a.* except(pos) from (
select *, row_number() over(partition by DimA order by rand()) pos
from tableA
) a
join samples
using(DimA)
where pos <= Sample
我有一个名为 TableA 的 Google 大查询 table,它有大约 3M 条记录。有一个名为 DimA(维度 A)的列,它有 20 个值 - 1 到 20。DimA 的每个值的计数显示在在下面的 总计 列中汇总 table。我做了一些分析并确定了我应该从 DimA 的每个值中抽取多少随机样本,它显示在列 Sample 中。 DimA 的每个值抽取的样本百分比显示在列 DimA_value_perc 中。我知道如何使用 table 下面的代码通过蛮力进行采样。但是,随着 DimA 值数量的增加以及存在其他维度,此代码不可扩展。有没有更有效的分层抽样方法?谢谢
DimA | Total | Sample | DimA_value_perc |
---|---|---|---|
1 | 115,623 | 3,077 | 3% |
2 | 108,203 | 3,943 | 4% |
3 | 153,477 | 6,802 | 4% |
4 | 232,252 | 12,426 | 5% |
5 | 223,004 | 14,052 | 6% |
6 | 242,386 | 17,589 | 7% |
7 | 121,519 | 9,783 | 8% |
8 | 371,342 | 34,026 | 9% |
9 | 147,683 | 15,400 | 10% |
10 | 281,101 | 32,775 | 12% |
11 | 93,380 | 12,075 | 13% |
12 | 181,293 | 25,675 | 14% |
13 | 122,206 | 19,344 | 16% |
14 | 140,559 | 25,141 | 18% |
15 | 95,576 | 19,498 | 20% |
16 | 94,319 | 21,969 | 23% |
17 | 108,282 | 30,054 | 28% |
18 | 94,920 | 33,228 | 35% |
19 | 82,764 | 39,700 | 48% |
20 | 28,417 | 23,442 | 82% |
Grand Total | 3,038,306 | 400,000 |
SELECT *
FROM tableA
where DimA = 1
order by rand()
limit 3077
union all
SELECT *
FROM tableA
where DimA = 2
order by rand()
limit 3943
etc
考虑以下方法
with samples as (
select 1 DimA, 115623 Total, 3077 Sample, 3 DimA_value_perc union all
select 2, 108203, 3943, 4 union all
select 3, 153477, 6802, 4 union all
select 4, 232252, 12426, 5 union all
select 5, 223004, 14052, 6 union all
select 6, 242386, 17589, 7 union all
select 7, 121519, 9783, 8 union all
select 8, 371342, 34026, 9 union all
select 9, 147683, 15400, 10 union all
select 10, 281101, 32775, 12 union all
select 11, 93380, 12075, 13 union all
select 12, 181293, 25675, 14 union all
select 13, 122206, 19344, 16 union all
select 14, 140559, 25141, 18 union all
select 15, 95576, 19498, 20 union all
select 16, 94319, 21969, 23 union all
select 17, 108282, 30054, 28 union all
select 18, 94920, 33228, 35 union all
select 19, 82764, 39700, 48 union all
select 20, 28417, 23442, 82
)
select a.* except(pos) from (
select *, row_number() over(partition by DimA order by rand()) pos
from tableA
) a
join samples
using(DimA)
where pos <= Sample