如何在 Athena 中使用日期分区创建视图?
How do I create a VIEW using date partitions in Athena?
我想创建一个视图,使其始终返回最近 2 个 Athena 分区中最近 1 小时的数据。
我使用以下 Amazon Athena DDL 建表,其中分区列名为 'datehour',类型为 varchar。
-- JSON-backed external table partitioned by an hourly `datehour` string.
-- Partition projection is enabled, so partitions are derived from the
-- table properties: `datehour` is a date-type projection formatted as
-- yyyy/MM/dd/HH, stepping 1 HOURS from 2020/11/01/00 up to NOW, and each
-- value is substituted into the S3 storage location template.
CREATE EXTERNAL TABLE mydb.table_foo (
    `account_id`            string,
    `account_email_address` string,
    `record_timestamp`      timestamp
)
PARTITIONED BY (
    `datehour` string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS
    INPUTFORMAT  'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://data-here-9191919191/table_foo'
TBLPROPERTIES (
    'projection.enabled'                = 'true',
    'projection.datehour.type'          = 'date',
    'projection.datehour.format'        = 'yyyy/MM/dd/HH',
    'projection.datehour.range'         = '2020/11/01/00,NOW',
    'projection.datehour.interval'      = '1',
    'projection.datehour.interval.unit' = 'HOURS',
    'storage.location.template'         = 's3://data-here-9191919191/table_foo/${datehour}',
    'transient_lastDdlTime'             = '1604013447'
)
这是我想用来创建视图的查询,该视图应始终返回最近 2 个分区中最近 1 小时的数据。
-- Asker's original attempt: build the `datehour` value for the current hour
-- and for the previous hour from individual date parts, then keep only rows
-- whose record_timestamp falls within the last hour.
-- NOTE(review): this is the problematic logic the question is about. The
-- integer parts are cast to varchar without zero-padding, and the
-- "previous hour" value reuses the year/month/day of current_timestamp, so
-- it points at the wrong partition when the previous hour belongs to the
-- previous day (e.g. just after midnight).
SELECT *
FROM mydb.table_foo
WHERE datehour IN (
          -- partition string for the current hour
          CAST(year(current_timestamp) AS varchar) || '/'
              || CAST(month(current_timestamp) AS varchar) || '/'
              || CAST(day(current_timestamp) AS varchar) || '/'
              || CAST(hour(current_timestamp) AS varchar),
          -- partition string for the previous hour (date parts taken from "now")
          CAST(year(current_timestamp) AS varchar) || '/'
              || CAST(month(current_timestamp) AS varchar) || '/'
              || CAST(day(current_timestamp) AS varchar) || '/'
              || CAST(hour(current_timestamp - interval '1' hour) AS varchar)
      )
  AND record_timestamp BETWEEN (current_timestamp - interval '1' hour) AND current_timestamp
我想改为动态生成的示例 SQL 如下:
-- Static example of the target query: read the two most recent hourly
-- partitions and keep only rows from the last hour.
-- The partition filter is written as an IN list so that the record_timestamp
-- predicate applies to BOTH partitions. AND binds tighter than OR, so the
-- unparenthesized form `a OR b AND c` would return every row of the first
-- partition regardless of its timestamp.
SELECT *
FROM mydb.table_foo
WHERE datehour IN ('2020/11/17/23', '2020/11/18/00')
  AND record_timestamp BETWEEN (current_timestamp - interval '1' hour) AND current_timestamp
动态 WHERE 逻辑在各个字段(月、日、年)发生变化时会出现问题。我在第一天——也就是今天午夜前后——就遇到了这个问题。如何动态生成我的 datehour 分区值?
解决方案应该是将 current_timestamp - interval '1' hour 转换为格式正确的 varchar。假设数据按照表定义中的描述以 yyyy/MM/dd/HH 格式分区:
-- Rows from the last hour, read only from the previous and current hourly
-- partitions. Each bound is produced by truncating a full timestamp to the
-- hour and formatting it as '%Y/%m/%d/%H', so the one-hour subtraction is
-- applied to the whole timestamp before the partition string is built.
SELECT *
FROM mydb.table_foo
WHERE datehour >= date_format(date_trunc('hour', current_timestamp - interval '1' hour), '%Y/%m/%d/%H')
  AND datehour <= date_format(date_trunc('hour', current_timestamp), '%Y/%m/%d/%H')
  -- inclusive on both ends, same as BETWEEN
  AND record_timestamp >= (current_timestamp - interval '1' hour)
  AND record_timestamp <= current_timestamp
我想创建一个视图,使其始终返回最近 2 个 Athena 分区中最近 1 小时的数据。
我使用以下 Amazon Athena DDL 建表,其中分区列名为 'datehour',类型为 varchar。
-- JSON-backed external table partitioned by an hourly `datehour` string.
-- Partition projection is enabled, so partitions are derived from the
-- table properties: `datehour` is a date-type projection formatted as
-- yyyy/MM/dd/HH, stepping 1 HOURS from 2020/11/01/00 up to NOW, and each
-- value is substituted into the S3 storage location template.
CREATE EXTERNAL TABLE mydb.table_foo (
    `account_id`            string,
    `account_email_address` string,
    `record_timestamp`      timestamp
)
PARTITIONED BY (
    `datehour` string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS
    INPUTFORMAT  'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://data-here-9191919191/table_foo'
TBLPROPERTIES (
    'projection.enabled'                = 'true',
    'projection.datehour.type'          = 'date',
    'projection.datehour.format'        = 'yyyy/MM/dd/HH',
    'projection.datehour.range'         = '2020/11/01/00,NOW',
    'projection.datehour.interval'      = '1',
    'projection.datehour.interval.unit' = 'HOURS',
    'storage.location.template'         = 's3://data-here-9191919191/table_foo/${datehour}',
    'transient_lastDdlTime'             = '1604013447'
)
这是我想用来创建视图的查询,该视图应始终返回最近 2 个分区中最近 1 小时的数据。
-- Asker's original attempt: build the `datehour` value for the current hour
-- and for the previous hour from individual date parts, then keep only rows
-- whose record_timestamp falls within the last hour.
-- NOTE(review): this is the problematic logic the question is about. The
-- integer parts are cast to varchar without zero-padding, and the
-- "previous hour" value reuses the year/month/day of current_timestamp, so
-- it points at the wrong partition when the previous hour belongs to the
-- previous day (e.g. just after midnight).
SELECT *
FROM mydb.table_foo
WHERE datehour IN (
          -- partition string for the current hour
          CAST(year(current_timestamp) AS varchar) || '/'
              || CAST(month(current_timestamp) AS varchar) || '/'
              || CAST(day(current_timestamp) AS varchar) || '/'
              || CAST(hour(current_timestamp) AS varchar),
          -- partition string for the previous hour (date parts taken from "now")
          CAST(year(current_timestamp) AS varchar) || '/'
              || CAST(month(current_timestamp) AS varchar) || '/'
              || CAST(day(current_timestamp) AS varchar) || '/'
              || CAST(hour(current_timestamp - interval '1' hour) AS varchar)
      )
  AND record_timestamp BETWEEN (current_timestamp - interval '1' hour) AND current_timestamp
我想改为动态生成的示例 SQL 如下:
-- Static example of the target query: read the two most recent hourly
-- partitions and keep only rows from the last hour.
-- The partition filter is written as an IN list so that the record_timestamp
-- predicate applies to BOTH partitions. AND binds tighter than OR, so the
-- unparenthesized form `a OR b AND c` would return every row of the first
-- partition regardless of its timestamp.
SELECT *
FROM mydb.table_foo
WHERE datehour IN ('2020/11/17/23', '2020/11/18/00')
  AND record_timestamp BETWEEN (current_timestamp - interval '1' hour) AND current_timestamp
动态 WHERE 逻辑在各个字段(月、日、年)发生变化时会出现问题。我在第一天——也就是今天午夜前后——就遇到了这个问题。如何动态生成我的 datehour 分区值?
解决方案应该是将 current_timestamp - interval '1' hour 转换为格式正确的 varchar。假设数据按照表定义中的描述以 yyyy/MM/dd/HH 格式分区:
-- Rows from the last hour, read only from the previous and current hourly
-- partitions. Each bound is produced by truncating a full timestamp to the
-- hour and formatting it as '%Y/%m/%d/%H', so the one-hour subtraction is
-- applied to the whole timestamp before the partition string is built.
SELECT *
FROM mydb.table_foo
WHERE datehour >= date_format(date_trunc('hour', current_timestamp - interval '1' hour), '%Y/%m/%d/%H')
  AND datehour <= date_format(date_trunc('hour', current_timestamp), '%Y/%m/%d/%H')
  -- inclusive on both ends, same as BETWEEN
  AND record_timestamp >= (current_timestamp - interval '1' hour)
  AND record_timestamp <= current_timestamp