如何在配置单元的分区数据中创建 table?
How to create table in partition data in hive?
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/_impala_insert_staging
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI
[mgupta@sjc-dev-binn01 ~]$ hadoop fs -ls /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI
Found 27 items
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201602
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201603
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201604
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201605
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201606
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201607
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201608
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201609
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201610
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201611
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201612
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201701
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201702
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201703
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201704
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201705
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201706
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201707
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201708
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201709
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201710
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201711
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201712
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201801
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201802
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201803
[mgupta@sjc-dev-binn01 ~]$ hadoop fs -ls /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601
Found 3 items
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=0
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=38527
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=__HIVE_DEFAULT_PARTITION__
[mgupta@sjc-dev-binn01 ~]$ hadoop fs -ls /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=0
Found 1 items
-rw-r--r-- 3 mgupta supergroup 2069014 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=0/f9466a0068b906cf-6ace7f8500000049_294515768_data.0.parq
[mgupta@sjc-dev-binn01 ~]$
您可以尝试下面给出的步骤。
方法一
识别架构(列名和类型,包括分区列)
创建分区的配置单元table(确保添加分区列和分隔符信息)
将数据加载到分区的 table。(在这种情况下,加载文件将没有分区列,因为您将通过 hard-code 它通过 load
命令)
create table <table_name> (col1 data_type1, col2 data_type2..)
partitioned by(part_col data_type3)
row format delimited
fields terminated by '<field_delimiter_in_your_data>'
load data inpath '/hdfs/loc/file1' into table <table_name>
partition (<part_col>='201601');
load data inpath '/hdfs/loc/file2' into table <table_name>
partition (<part_col>='201602')
load data inpath '/hdfs/loc/file3' into table <table_name>
partition (<part_col>='201603')
等等。
方法二
创建一个暂存 table(临时 table),其架构与主 table 相同,但没有任何分区
将您的全部数据加载到此 table(确保您将“分区列”作为这些文件中的字段之一)
使用动态分区插入[=64=从分段table将数据加载到主table ].
create table <staging_table> (col1 data_type1, col2 data_type2..)
row format delimited
fields terminated by '<field_delimiter_in_your_data>'
create table <main_table> (col1 data_type1, col2 data_type2..)
partitioned by(part_col data_type3);
load data inpath '/hdfs/loc/directory/' into table <staging_table>;
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
insert into table <main_table>
partition(part_col)
select col1,col2,....part_col from <staging_table>;
方法 2 的关键方面是:
- 使“part_col”作为加载文件中的字段可用
- 在最后的插入语句中,将“part_col”作为 select 子句的最后一个字段。
让我们创建一个 table,在年和月上有一个分区,在 table 中有一个时间戳:
CREATE TABLE `mypart_p`(
`id` bigint,
`open_ts` string
)
PARTITIONED BY (YEAR INT, MONTH INT)
现在我必须改变 table。
ALTER TABLE mypart_p ADD PARTITION (YEAR=2020, MONTH=1)
我每年和每个月都必须这样做,在 python 中循环进行。现在让我们用数据填充它并分配数据属于哪个分区:
INSERT into mypart_p PARTITION (YEAR=2020, MONTH=1)
select id,
open_ts
FROM some_other_table
WHERE substring(open_ts,0,4) = '2020'
AND substring(open_ts,6,2) = '01'
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/_impala_insert_staging
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI
[mgupta@sjc-dev-binn01 ~]$ hadoop fs -ls /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI
Found 27 items
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201602
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201603
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201604
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201605
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201606
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201607
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201608
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201609
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201610
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201611
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201612
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201701
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201702
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201703
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201704
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201705
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201706
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:17 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201707
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201708
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201709
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201710
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201711
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201712
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201801
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201802
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:18 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201803
[mgupta@sjc-dev-binn01 ~]$ hadoop fs -ls /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601
Found 3 items
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=0
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=38527
drwxr-xr-x - mgupta supergroup 0 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=__HIVE_DEFAULT_PARTITION__
[mgupta@sjc-dev-binn01 ~]$ hadoop fs -ls /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=0
Found 1 items
-rw-r--r-- 3 mgupta supergroup 2069014 2018-03-26 22:16 /kylin/retailer/qi_basket_brand_bucket_fact/product_hierarchy_type=CI/month_id=201601/company_sid=0/f9466a0068b906cf-6ace7f8500000049_294515768_data.0.parq
[mgupta@sjc-dev-binn01 ~]$
您可以尝试下面给出的步骤。
方法一
识别架构(列名和类型,包括分区列)
创建分区的配置单元table(确保添加分区列和分隔符信息)
将数据加载到分区的 table。(在这种情况下,加载文件将没有分区列,因为您将通过 hard-code 它通过
load
命令)create table <table_name> (col1 data_type1, col2 data_type2..) partitioned by(part_col data_type3) row format delimited fields terminated by '<field_delimiter_in_your_data>' load data inpath '/hdfs/loc/file1' into table <table_name> partition (<part_col>='201601'); load data inpath '/hdfs/loc/file2' into table <table_name> partition (<part_col>='201602') load data inpath '/hdfs/loc/file3' into table <table_name> partition (<part_col>='201603')
等等。
方法二
创建一个暂存 table(临时 table),其架构与主 table 相同,但没有任何分区
将您的全部数据加载到此 table(确保您将“分区列”作为这些文件中的字段之一)
使用动态分区插入[=64=从分段table将数据加载到主table ].
create table <staging_table> (col1 data_type1, col2 data_type2..) row format delimited fields terminated by '<field_delimiter_in_your_data>' create table <main_table> (col1 data_type1, col2 data_type2..) partitioned by(part_col data_type3); load data inpath '/hdfs/loc/directory/' into table <staging_table>; SET hive.exec.dynamic.partition=true; SET hive.exec.dynamic.partition.mode=nonstrict; insert into table <main_table> partition(part_col) select col1,col2,....part_col from <staging_table>;
方法 2 的关键方面是:
- 使“part_col”作为加载文件中的字段可用
- 在最后的插入语句中,将“part_col”作为 select 子句的最后一个字段。
让我们创建一个 table,在年和月上有一个分区,在 table 中有一个时间戳:
CREATE TABLE `mypart_p`(
`id` bigint,
`open_ts` string
)
PARTITIONED BY (YEAR INT, MONTH INT)
现在我必须改变 table。
ALTER TABLE mypart_p ADD PARTITION (YEAR=2020, MONTH=1)
我每年和每个月都必须这样做,在 python 中循环进行。现在让我们用数据填充它并分配数据属于哪个分区:
INSERT into mypart_p PARTITION (YEAR=2020, MONTH=1)
select id,
open_ts
FROM some_other_table
WHERE substring(open_ts,0,4) = '2020'
AND substring(open_ts,6,2) = '01'