Hive 写入分区和 DEFAULT

Hive writes to partition and DEFAULT

我有日期时间 table、事实 table 和分区 table。目标是将 date_time 连接到事实并插入分区 table。它正在工作,因为我可以验证 /apps/hive/warehouse/dbname.db/p_tbl/p_year=2016/p_month=01/p_day=01 以及其他几个年、月、日文件夹是否存在。然而,我也有 p_year=__HIVE_DEFAULT_PARTITION__/p_month=__HIVE_DEFAULT_PARTITION__/p_day=__HIVE_DEFAULT_PARTITION__ 充满了应该有自己的分区的数据。为什么某些年、月、日分区有效,而对于其他日期,它们被写入 HIVE_DEFAULT?是的,我将 post 重命名为通用名称,而不是真正的 db、tbl、col 名称...

CREATE EXTERNAL TABLE ${DB_NAME_HIVE}.date_time(
date_time_key bigint,
label_yyyy varchar(32),
label_mm varchar(32),
label_dd varchar(32)
)
row format delimited
fields terminated by ','
lines terminated by '\n'
LOCATION '${TARGET_PATH}date_time'
;

CREATE EXTERNAL TABLE ${DB_NAME_HIVE}.p_table(
date_time_key bigint,
column_0 varchar(50),
column_1 char(32),
column_2 timestamp,
column_3 bigint,
column_4 tinyint
)
PARTITIONED BY(p_year string, p_month string, p_day string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '${TARGET_PATH}my_table'
;


USE ${DB_NAME_HIVE};
INSERT INTO TABLE ${DB_NAME_HIVE}.p_tbl PARTITION(p_year, p_month, p_day)
SELECT
mt.date_time_key,
mt.col_0,
mt.col_1,
mt.col_2,
mt.col_3,
mt.col_4,
dt.label_yyyy,
dt.label_mm,
dt.label_dd
FROM my_tbl mt
LEFT OUTER JOIN date_time dt on (mt.date_time_key = dt.date_time_key)
;

INFO  :          Time taken for load dynamic partitions : 12557
INFO  :         Loading partition {p_year=2016, p_month=11, p_day=15}
INFO  :         Loading partition {p_year=2016, p_month=07, p_day=20}
INFO  :         Loading partition {p_year=2014, p_month=08, p_day=06}
INFO  :         Loading partition {p_year=2015, p_month=04, p_day=05}
INFO  :         Loading partition {p_year=2012, p_month=10, p_day=31}
INFO  :         Loading partition {p_year=2014, p_month=04, p_day=16}
INFO  :         Loading partition {p_year=2014, p_month=09, p_day=06}
INFO  :         Loading partition {p_year=2014, p_month=09, p_day=09}
INFO  :         Loading partition {p_year=2012, p_month=11, p_day=22}
INFO  :         Loading partition {p_year=2014, p_month=11, p_day=19}
INFO  :         Loading partition {p_year=2014, p_month=09, p_day=03}
INFO  :         Loading partition {p_year=2013, p_month=12, p_day=24}
INFO  :         Loading partition {p_year=2014, p_month=10, p_day=29}
INFO  :         Loading partition {p_year=2015, p_month=09, p_day=04}
INFO  :         Loading partition {p_year=2015, p_month=05, p_day=11}
INFO  :         Loading partition {p_year=2016, p_month=06, p_day=13}
INFO  :         Loading partition {p_year=2014, p_month=09, p_day=24}
INFO  :         Loading partition {p_year=2014, p_month=10, p_day=21}
INFO  :         Loading partition {p_year=2016, p_month=01, p_day=06}
INFO  :         Loading partition {p_year=2014, p_month=11, p_day=05}
INFO  :         Loading partition {p_year=2012, p_month=12, p_day=04}
INFO  :         Loading partition {p_year=2016, p_month=11, p_day=25}
INFO  :         Loading partition {p_year=2014, p_month=10, p_day=13}
INFO  :         Loading partition {p_year=2013, p_month=06, p_day=21}
INFO  :         Loading partition {p_year=2013, p_month=06, p_day=27}
INFO  :         Loading partition {p_year=2014, p_month=10, p_day=22}
INFO  :         Loading partition {p_year=2016, p_month=11, p_day=22}
INFO  :         Loading partition {p_year=2012, p_month=10, p_day=26}
INFO  :         Loading partition {p_year=2014, p_month=08, p_day=28}
INFO  :         Loading partition {p_year=2013, p_month=10, p_day=21}
INFO  :         Loading partition {p_year=2014, p_month=09, p_day=04}
INFO  :         Loading partition {p_year=2013, p_month=11, p_day=14}
INFO  :         Loading partition {p_year=2013, p_month=10, p_day=22}
INFO  :         Loading partition {p_year=2014, p_month=08, p_day=26}
INFO  :         Loading partition {p_year=2012, p_month=10, p_day=30}
INFO  :         Loading partition {p_year=2014, p_month=05, p_day=23}
INFO  :         Loading partition {p_year=2012, p_month=10, p_day=29}
INFO  :         Loading partition {p_year=2014, p_month=09, p_day=02}
INFO  :         Loading partition {p_year=__HIVE_DEFAULT_PARTITION__, p_month=__HIVE_DEFAULT_PARTITION__, p_day=__HIVE_DEFAULT_PARTITION__}
INFO  :         Loading partition {p_year=2014, p_month=09, p_day=05}
INFO  :         Loading partition {p_year=2016, p_month=07, p_day=25}
INFO  :         Loading partition {p_year=2016, p_month=08, p_day=10}

在 Hive 中使用动态分区,如果存在空值,则将它们写入 Hive 默认分区。因此,为了解决此问题,您可以清理源数据以填充或删除空值。

此问题已由 NOT 使用 DATE_TIME table 和内置 HIVE 函数 YEAR();MONTH();DAY(); 解决针对数据中的 TIMESTAMP 列。这些函数 return 一个 INT 传递给分区。

示例

CREATE EXTERNAL TABLE ${DB_NAME_HIVE}.my_tbl(
unique_id char(32),
some_name varchar(50),
some_group varchar(50),
in_time_start timestamp,
in_time_finish timestamp
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '${TARGET_PATH}my_tbl';

CREATE EXTERNAL TABLE ${DB_NAME_HIVE}.my_tbl_p(
unique_id char(32),
some_name varchar(50),
some_group varchar(50),
in_time_start timestamp,
in_time_finish timestamp
)
PARTITIONED BY(p_year int, p_month int, p_day int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC
LOCATION '${TARGET_PATH}my_tbl_p'
;

INSERT INTO TABLE ${DB_NAME_HIVE}.my_tbl_p PARTITION(p_year, p_month, p_day)
SELECT
unique_id,
some_name,
some_group,
in_time_start,
in_time_finish,
year(in_time_start) as p_year,
month(in_time_start) as p_month,
day(in_time_start) as p_day
FROM my_tbl
;