Optimizing for a timestamp column in a Snowflake table
I have this table with 103.4B rows (roughly 1 TB) of data:
create or replace TABLE SPOT_DIFFDEPTH cluster by (date_trunc('hour', timestamp)) (
    TRADE_PAIR VARCHAR(25) NOT NULL,
    TIMESTAMP TIMESTAMP_NTZ(9) NOT NULL,
    ORDERBOOK_SIDE VARCHAR(5) NOT NULL,
    PRICE NUMBER(38,8) NOT NULL,
    QUANTITY NUMBER(38,8),
    primary key (TRADE_PAIR, TIMESTAMP, ORDERBOOK_SIDE, PRICE)
);
Note: the primary key was set up by a former data engineer at my workplace. I only recently added the clustering (via ALTER TABLE SPOT_DIFFDEPTH CLUSTER BY (date_trunc('hour', timestamp));) to try to speed up the slow query below.
For pipeline monitoring, I want to run a query like the following frequently (every 3 hours) and efficiently:
select date_trunc('HOUR', timestamp) as time,
count(timestamp) as count_
from SPOT_DIFFDEPTH
WHERE datediff(hour, timestamp, current_timestamp()) < 5
group by time;
But right now it takes more than an hour on an X-Small Snowflake warehouse. The EXPLAIN output for the query above is:
| step | id | parent | operation | objects | alias | expressions | partitionsTotal | partitionsAssigned | bytesAssigned |
|------+----+--------+-------------+----------------+-------+---------------------------------------------------------------------------------------------------------------+-----------------+--------------------+---------------|
| | | | GlobalStats | | | | 63899 | 63899 | 1028495659520 |
| 1 | 0 | | Result | | | TRUNCTIMESTAMPTOHOUR(SPOT_DIFFDEPTH.TIMESTAMP), COUNT(SPOT_DIFFDEPTH.TIMESTAMP) | | | |
| 1 | 1 | 0 | Aggregate | | | aggExprs: [COUNT(SPOT_DIFFDEPTH.TIMESTAMP)], groupKeys: [TRUNCTIMESTAMPTOHOUR(SPOT_DIFFDEPTH.TIMESTAMP)] | | | |
| 1 | 2 | 1 | Filter | | | (DATE_DIFFTIMESTAMPINHOURS(TO_TIMESTAMP_LTZ(SPOT_DIFFDEPTH.TIMESTAMP), '2022-04-06 12:49:54.064000000Z')) < 5 | | | |
| 1 | 3 | 2 | TableScan | SPOT_DIFFDEPTH | | TIMESTAMP | 63899 | 63899 | 1028495659520 |
Any ideas for improving performance? Is it possible to have a partition covering just the last 24 hours, so that I don't have to scan the entire TB of data (which, by the way, is growing quickly) on every run?
Data is inserted into this table via a Task + Stream. Maybe I could add a similar task that writes to a separate table holding only the latest 24 hours of data? I'm not sure how to discard the older data from that table, though. A rough sketch of what I have in mind is below.
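All names in this sketch (SPOT_DIFFDEPTH_RECENT, PRUNE_SPOT_DIFFDEPTH_RECENT, MY_WH) are placeholders, and the existing Task + Stream would have to insert into the new table as well:

create or replace table SPOT_DIFFDEPTH_RECENT like SPOT_DIFFDEPTH;

-- Scheduled task that trims anything older than 24 hours, keeping
-- SPOT_DIFFDEPTH_RECENT as a small rolling window. The warehouse
-- name and the 60-minute schedule are assumptions, not requirements.
create or replace task PRUNE_SPOT_DIFFDEPTH_RECENT
    warehouse = MY_WH
    schedule = '60 minute'
as
    delete from SPOT_DIFFDEPTH_RECENT
    where timestamp < dateadd(hour, -24, current_timestamp());

alter task PRUNE_SPOT_DIFFDEPTH_RECENT resume;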
Clustering info
Result of SELECT SYSTEM$CLUSTERING_INFORMATION('SPOT_DIFFDEPTH'):
{
  "cluster_by_keys" : "LINEAR(date_trunc('hour', timestamp))",
  "total_partition_count" : 63867,
  "total_constant_partition_count" : 56576,
  "average_overlaps" : 0.8238,
  "average_depth" : 1.5217,
  "partition_depth_histogram" : {
    "00000" : 0,
    "00001" : 56314,
    "00002" : 509,
    "00003" : 998,
    "00004" : 1343,
    "00005" : 1487,
    "00006" : 1159,
    "00007" : 963,
    "00008" : 483,
    "00009" : 307,
    "00010" : 113,
    "00011" : 53,
    "00012" : 39,
    "00013" : 11,
    "00014" : 0,
    "00015" : 0,
    "00016" : 88
  }
}
I think your SQL WHERE clause needs to reflect your clustering/partitioning so that Snowflake can prune partitions effectively. With the column wrapped inside datediff(), the scan reads every micro-partition (your EXPLAIN shows all 63899 assigned); adding a direct range predicate on the clustering expression should let Snowflake skip the old ones, e.g.
select date_trunc('HOUR', timestamp) as time,
       count(timestamp) as count_
from SPOT_DIFFDEPTH
WHERE date_trunc('hour', timestamp) >= dateadd(day, -1, current_date()) -- coarse filter for the last 2 days that aligns to the partitions
AND datediff(hour, timestamp, current_timestamp()) < 5 -- refining filter that delivers the subset of data actually required
group by time;
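To verify that the pruning actually kicks in, it may be worth re-running EXPLAIN on the revised query and checking that partitionsAssigned drops well below partitionsTotal (currently all 63899 of 63899 are scanned). A minimal check:

explain
select date_trunc('HOUR', timestamp) as time,
       count(timestamp) as count_
from SPOT_DIFFDEPTH
WHERE date_trunc('hour', timestamp) >= dateadd(day, -1, current_date())
AND datediff(hour, timestamp, current_timestamp()) < 5
group by time;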