Impala partition queries running slow
So I am trying to partition an Impala table on the 'file' column, which has 1500 distinct values. That means 1500 partitions. I first ran a query like this that returns the partition insert queries:
SELECT DISTINCT
concat('insert into partitioned_table partition (year=',
cast(year as string),', month=',cast(month as string),
') select c1, c2, c3 from raw_data where year=',
cast(year as string),' and month=',cast(month as string),';') AS command
FROM raw_data;
That gave me 1500 queries to run.
Here is the screenshot
Now the problem: each query can take around 3 minutes to finish, so 1500 queries could take days, which is far too long. To save time I have already made a few tweaks: running COMPUTE STATS to gather statistics and converting the table to Parquet. My question is: is there any way to speed this process up? Something like maximizing the number of executors, as in Hive?
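For reference, the tweaks mentioned above boil down to something like the following sketch; raw_data is the table from the query above, and raw_data_parquet is just an illustrative name for the Parquet copy:
-- gather table and column statistics so the planner has accurate row counts
COMPUTE STATS raw_data;

-- one possible way to convert the raw table to Parquet (creates a copy)
CREATE TABLE raw_data_parquet STORED AS PARQUET AS
SELECT * FROM raw_data;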
You can use dynamic partitioning:
insert into partitioned_table partition (year,month)
select c1, c2, c3, year, month
from raw_data
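Note that with dynamic partitioning the trailing expressions in the select list (year and month here) map positionally onto the partition columns, and partitioned_table must already be declared with those partition columns. The question does not show its DDL, so the column types in this sketch are only assumptions:
-- illustrative DDL for the target table; adjust column types to match raw_data
create table partitioned_table (c1 int, c2 string, c3 string)
partitioned by (year int, month int)
stored as parquet;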
Demo
create table t (i int) partitioned by (year string,month string);
insert into t partition (year,month) values
( 1,'2015','02')
,( 2,'2017','01')
,( 3,'2016','02')
,( 4,'2013','09')
,( 5,'2015','07')
,( 6,'2012','03')
,( 7,'2012','12')
,( 8,'2017','12')
,( 9,'2015','11')
,(10,'2015','02')
;
select * from t order by year,month,i;
+----+------+-------+
| i | year | month |
+----+------+-------+
| 6 | 2012 | 03 |
| 7 | 2012 | 12 |
| 4 | 2013 | 09 |
| 1 | 2015 | 02 |
| 10 | 2015 | 02 |
| 5 | 2015 | 07 |
| 9 | 2015 | 11 |
| 3 | 2016 | 02 |
| 2 | 2017 | 01 |
| 8 | 2017 | 12 |
+----+------+-------+
hdfs dfs -ls -R /user/hive/warehouse/t;
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2012
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2012/month=03
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2012/month=03/174c30c4e1edc236-b57504ce4afd76a2_1891304442_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2012/month=12
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2012/month=12/174c30c4e1edc236-b57504ce4afd76a2_798564417_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2013
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2013/month=09
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2013/month=09/174c30c4e1edc236-b57504ce4afd76a2_432428758_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2015
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2015/month=02
-rw-r--r-- 1 impala supergroup 5 2017-02-07 13:45 /user/hive/warehouse/t/year=2015/month=02/174c30c4e1edc236-b57504ce4afd76a2_768620898_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2015/month=07
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2015/month=07/174c30c4e1edc236-b57504ce4afd76a2_2029099237_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2015/month=11
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2015/month=11/174c30c4e1edc236-b57504ce4afd76a2_974618320_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2016
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2016/month=02
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2016/month=02/174c30c4e1edc236-b57504ce4afd76a2_502842645_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2017
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2017/month=01
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2017/month=01/174c30c4e1edc236-b57504ce4afd76a2_2014291428_data.0.
drwxr-xr-x - impala supergroup 0 2017-02-07 13:45 /user/hive/warehouse/t/year=2017/month=12
-rw-r--r-- 1 impala supergroup 2 2017-02-07 13:45 /user/hive/warehouse/t/year=2017/month=12/174c30c4e1edc236-b57504ce4afd76a2_1693475255_data.0.
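The same layout can also be checked from inside Impala; assuming the demo table above (show files requires a reasonably recent Impala version):
-- list the partitions Impala knows about for the demo table
show partitions t;

-- list the data files per partition
show files in t;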