Greenplum 数据分布
Greenplum distribution
有一张采用随机分布(DISTRIBUTED RANDOMLY)的表:
-- Source table from the question: three nullable integer columns, stored
-- append-only, column-oriented and zstd-compressed, with no distribution
-- key (rows are spread over the segments at random).
-- NOTE(review): "schema.table" reads as a placeholder name; TABLE is a
-- reserved word in PostgreSQL-derived dialects, so substitute real
-- identifiers before running this.
CREATE TABLE schema.table (
    col1 integer NULL,
    col2 integer NULL,
    col3 integer NULL
)
WITH (
    appendonly = true,
    compresstype = zstd,
    orientation = column
)
DISTRIBUTED RANDOMLY;
我们需要选出一个字段作为分布键,使各行的分布尽可能均匀(数据倾斜最小)。为此,我们可以创建若干测试表:
-- Test table for measuring skew: append-only, column-oriented,
-- zstd-compressed, and hash-distributed on one candidate column.
-- The original snippet wrote DISTRIBUTED BY (col_i), but no column of that
-- name is declared here; it is read as a placeholder for "the i-th column".
-- col_1 is used below -- recreate the table with col_2 / col_3 to compare
-- the other candidates.
CREATE TABLE schema.test_table (
    col_1 int4 NULL,
    col_2 int4 NULL,
    col_3 int4 NULL
)
WITH (
    appendonly = true,
    compresstype = zstd,
    orientation = column
)
DISTRIBUTED BY (col_1);

-- Copy every row from the randomly distributed source table. Both column
-- lists are spelled out so the mapping does not silently depend on the
-- declaration order of either table.
INSERT INTO schema.test_table (col_1, col_2, col_3)
SELECT col1, col2, col3
FROM schema.table;
然后检查这些测试表是否存在数据倾斜,例如执行:
-- Report the skew coefficient of the test table (one row: relation OID and
-- coefficient).
-- NOTE(review): the Greenplum docs describe this value as a coefficient of
-- variation where lower means more even -- confirm for your version.
SELECT skcoid, skccoeff
FROM gp_toolkit.gp_skew_coefficient('schema.test_table'::regclass);
问题是:我们希望在不创建测试表的情况下,检查该表按某个字段分布时是否会出现数据倾斜。这能做到吗?如果能,该怎么做?
如果您确实不想创建新表,可以采用下面的做法(直接在原表上重新分布数据);不过,我不知道有什么办法能在不实际(重新)分布数据的情况下模拟分布结果。
foo=# create table foo(a int, b int, c int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE
foo=# insert into foo values (generate_series(1,100), generate_series(101,200), generate_series(2001, 2100));
INSERT 0 100
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 18.460769214742921763000
(1 row)
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 17
1 | 18
2 | 23
3 | 17
4 | 15
5 | 10
(6 rows)
foo=# ALTER TABLE foo SET
foo-# WITH (REORGANIZE=true)
foo-# DISTRIBUTED BY (a);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 18.460769214742921763000
(1 row)
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 21
1 | 18
2 | 12
3 | 15
4 | 18
5 | 16
(6 rows)
foo=#
foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (b);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 27.011108825814611346000
(1 row)
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 12
1 | 14
2 | 20
3 | 24
4 | 16
5 | 14
(6 rows)
foo=#
foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (c);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 30.983866769659334938000
(1 row)
foo=#
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 19
1 | 10
2 | 20
3 | 23
4 | 11
5 | 17
(6 rows)
foo=#
有一张采用随机分布(DISTRIBUTED RANDOMLY)的表:
-- Source table from the question: three nullable integer columns, stored
-- append-only, column-oriented and zstd-compressed, with no distribution
-- key (rows are spread over the segments at random).
-- NOTE(review): "schema.table" reads as a placeholder name; TABLE is a
-- reserved word in PostgreSQL-derived dialects, so substitute real
-- identifiers before running this.
CREATE TABLE schema.table (
    col1 integer NULL,
    col2 integer NULL,
    col3 integer NULL
)
WITH (
    appendonly = true,
    compresstype = zstd,
    orientation = column
)
DISTRIBUTED RANDOMLY;
我们需要选出一个字段作为分布键,使各行的分布尽可能均匀(数据倾斜最小)。为此,我们可以创建若干测试表:
-- Test table for measuring skew: append-only, column-oriented,
-- zstd-compressed, and hash-distributed on one candidate column.
-- The original snippet wrote DISTRIBUTED BY (col_i), but no column of that
-- name is declared here; it is read as a placeholder for "the i-th column".
-- col_1 is used below -- recreate the table with col_2 / col_3 to compare
-- the other candidates.
CREATE TABLE schema.test_table (
    col_1 int4 NULL,
    col_2 int4 NULL,
    col_3 int4 NULL
)
WITH (
    appendonly = true,
    compresstype = zstd,
    orientation = column
)
DISTRIBUTED BY (col_1);

-- Copy every row from the randomly distributed source table. Both column
-- lists are spelled out so the mapping does not silently depend on the
-- declaration order of either table.
INSERT INTO schema.test_table (col_1, col_2, col_3)
SELECT col1, col2, col3
FROM schema.table;
然后检查这些测试表是否存在数据倾斜,例如执行:
-- Report the skew coefficient of the test table (one row: relation OID and
-- coefficient).
-- NOTE(review): the Greenplum docs describe this value as a coefficient of
-- variation where lower means more even -- confirm for your version.
SELECT skcoid, skccoeff
FROM gp_toolkit.gp_skew_coefficient('schema.test_table'::regclass);
问题是:我们希望在不创建测试表的情况下,检查该表按某个字段分布时是否会出现数据倾斜。这能做到吗?如果能,该怎么做?
如果您确实不想创建新表,可以采用下面的做法(直接在原表上重新分布数据);不过,我不知道有什么办法能在不实际(重新)分布数据的情况下模拟分布结果。
foo=# create table foo(a int, b int, c int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE
foo=# insert into foo values (generate_series(1,100), generate_series(101,200), generate_series(2001, 2100));
INSERT 0 100
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 18.460769214742921763000
(1 row)
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 17
1 | 18
2 | 23
3 | 17
4 | 15
5 | 10
(6 rows)
foo=# ALTER TABLE foo SET
foo-# WITH (REORGANIZE=true)
foo-# DISTRIBUTED BY (a);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 18.460769214742921763000
(1 row)
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 21
1 | 18
2 | 12
3 | 15
4 | 18
5 | 16
(6 rows)
foo=#
foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (b);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 27.011108825814611346000
(1 row)
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 12
1 | 14
2 | 20
3 | 24
4 | 16
5 | 14
(6 rows)
foo=#
foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (c);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
skcoid | skccoeff
--------+--------------------------
76788 | 30.983866769659334938000
(1 row)
foo=#
foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
gp_segment_id | count
---------------+-------
0 | 19
1 | 10
2 | 20
3 | 23
4 | 11
5 | 17
(6 rows)
foo=#