需要将查询从 Redshift 转换为 Presto
Need to convert a query from Redshift to Presto
给出以下用 AWS Redshift 编写的查询:
SELECT session_date,'min' as stats,mini as value,product,endpoint  -- Redshift source query; the outer select exposes only the mini column, labelled 'min'
from
(select  -- inner query: per-day join_time percentiles computed as window functions
distinct trunc(joinstart_ev_timestamp) as session_date,  -- DISTINCT collapses the per-row window output to unique rows
PERCENTILE_DISC(0.02) WITHIN GROUP (ORDER BY join_time) over(partition by  -- 2nd percentile (not a true minimum); column 36 of this line is the '(' after WITHIN GROUP
trunc(joinstart_ev_timestamp))/1000 as mini,  -- /1000: presumably milliseconds -> seconds (TODO confirm units)
PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY join_time) OVER (partition by
trunc(joinstart_ev_timestamp))/1000 as first_quartile,
median(join_time) over(partition by trunc(joinstart_ev_timestamp))/1000 as  -- median over the same per-day partition
jt,
PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY join_time) OVER (partition by
trunc(joinstart_ev_timestamp))/1000 as third_quartile,
PERCENTILE_DISC(0.98) WITHIN GROUP (ORDER BY join_time) over(partition by  -- 98th percentile (aliased maxi, not a true maximum)
trunc(joinstart_ev_timestamp))/1000 as maxi,
product_name as product,
endpoint as endpoint
from qe_datawarehouse.join_session_fact
where
trunc(joinstart_ev_timestamp) between '2018-01-18' and '2018-01-30'  -- inclusive range on the truncated timestamp
and lower(product_name) LIKE 'gotowebinar%'  -- case-insensitive prefix match
and join_time>0 and join_time <= 600000 and join_time is not null  -- the IS NOT NULL test is already implied by join_time>0
and audio_connect_time >= 0
and (entrypoint_access_time >= 0 or entrypoint_access_time is null)  -- negative values rejected, missing values allowed
and (panel_connect_time >= 0 or panel_connect_time is null) and version =
'V2');
我需要将其转换成相应的 Presto 查询。
我试过以下:
- 当我在 Presto(基于 Hive)中运行上述查询时,我收到错误消息:
" 查询失败 (#20180212_044343_00014_jb834): 行 5:36: 在 '(' 处缺少 'BY' "
- 我知道我必须以某种方式在 presto 中使用 "approx_percentile()" 但实际上无法使用它。
注意:
在 Redshift 查询中,每一列都被视为字符串,但在 Presto 中,数据类型如下所示:
-- Hive DDL for the join_session_fact table that the queries in this document read.
-- The table is external, stored as ORC, and partitioned by data_input_date; its
-- location is assembled from the hiveconf variables s3bucket, fact_path and
-- join_session_fact.
create external table if not exists join_session_fact (
join_session_fact_id string
,session_tracking_id string
,user_id string
,participant_id string
,meeting_id string
,session_mcs_id string
,browser_name string
,browser_version string
,endpoint string
,entrypoint string
,build_number string
,model_id string
,model_name string
,hardware_net string
,ip_address string
,country string
,region string
,city string
,os_type string
,os_architecture string
,os_locale string
,os_timezone string
,product_name string
,product_version string
,product_tier string
,participant_role string
,timezone string
,joinstart_ev_timestamp timestamp
,joinLaunch_ev_timestamp timestamp
,joinSession_ev_timestamp timestamp
,joinTime_ev_timestamp timestamp
,audioConnect_ev_timestamp timestamp
,connection_type string
,download_start_timestamp timestamp
,download_end_timestamp timestamp
,install_start_timestamp timestamp
,install_end_timestamp timestamp
,password_start_timestamp timestamp
,password_end_timestamp timestamp
,login_start_timestamp timestamp
,login_end_timestamp timestamp
,audioWait_start_timestamp timestamp
,audioWait_end_timestamp timestamp
,hallway_start_timestamp timestamp
,hallway_end_timestamp timestamp
,entrypoint_access_time double
,endpoint_access_time double
,panel_connect_time double
,audio_connect_time double
,install_time_endpoint double
,download_time_endpoint double
,install_time_launcher double
,download_time_launcher double
,join_time double
,process_data_timestamp timestamp
,source_date timestamp
,version string
,event_date timestamp
)
PARTITIONED BY (data_input_date string)
stored as orc
location '${hiveconf:s3bucket}/${hiveconf:fact_path}/${hiveconf:join_session_fact}/'
-- Property keys and values must be delimited by plain ASCII quotes; typographic
-- quotes are not string delimiters. The codec name is written in upper case,
-- the spelling used by the Hive ORC documentation (NONE, ZLIB, SNAPPY).
TBLPROPERTIES ("orc.compress"="SNAPPY");
请注意,当我在 Presto 中运行下面的查询时,它工作正常:
-- Presto: per-day approximate median of join_time (divided by 1000), computed as a
-- window aggregate; DISTINCT reduces the per-row output to unique
-- (day, median, product, endpoint) rows.
SELECT DISTINCT
    CAST(joinstart_ev_timestamp AS date) AS session_date,
    approx_percentile(CAST(join_time AS double), 0.50)
        OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS jt,
    product_name AS product,
    endpoint AS endpoint
FROM datawarehouse.join_session_fact
WHERE CAST(joinstart_ev_timestamp AS date) BETWEEN date '2018-01-18' AND date '2018-01-30'
    AND lower(product_name) LIKE 'gotowebinar%'
    AND join_time > 0
    AND join_time <= 600000
    AND join_time IS NOT NULL
    AND audio_connect_time >= 0
    AND (entrypoint_access_time >= 0 OR entrypoint_access_time IS NULL)
    AND (panel_connect_time >= 0 OR panel_connect_time IS NULL)
    AND version = 'V2'
问题可能出在 WITHIN GROUP 上。据我所知,Presto 不支持这些百分位函数;该错误很可能是因为 Presto 的语法解析器无法识别 WITHIN GROUP (...) 子句。
我找到了到 Presto 的正确转换:
-- Presto translation of the Redshift join-time percentile report.
-- Inner query: for each calendar day of joinstart_ev_timestamp, computes the
-- 2nd/25th/50th/75th/98th percentiles of join_time (each divided by 1000) as
-- window aggregates; DISTINCT collapses the per-row window output to one row per
-- (day, product, endpoint). Outer query: exposes only the 2nd percentile,
-- labelled 'min'.
-- Note: approx_percentile returns an approximation, whereas Redshift's
-- PERCENTILE_DISC returns an exact member of the set and MEDIAN interpolates,
-- so small numeric differences from the Redshift output are expected.
SELECT
    session_date,
    'min' AS stats,
    mini AS value,
    product,
    endpoint
FROM (
    SELECT DISTINCT
        CAST(joinstart_ev_timestamp AS date) AS session_date,
        approx_percentile(CAST(join_time AS double), 0.02)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS mini,
        approx_percentile(CAST(join_time AS double), 0.25)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS first_quartile,
        approx_percentile(CAST(join_time AS double), 0.50)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS jt,
        approx_percentile(CAST(join_time AS double), 0.75)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS third_quartile,
        approx_percentile(CAST(join_time AS double), 0.98)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS maxi,
        product_name AS product,
        endpoint AS endpoint
    FROM datawarehouse.join_session_fact
    -- Same fixed, inclusive day range as the Redshift source. Typed DATE literals keep
    -- the comparison date-to-date; bounds derived from now() are timestamps with time
    -- zone, so they would shift with the time of day at which the query runs.
    -- For a rolling window use, e.g.:
    --   BETWEEN current_date - interval '16' day AND current_date - interval '1' day
    WHERE CAST(joinstart_ev_timestamp AS date) BETWEEN date '2018-01-18' AND date '2018-01-30'
        AND lower(product_name) LIKE 'gotowebinar%'
        -- The IS NOT NULL test is implied by join_time > 0; kept to mirror the source query.
        AND join_time > 0
        AND join_time <= 600000
        AND join_time IS NOT NULL
        AND audio_connect_time >= 0
        AND (entrypoint_access_time >= 0 OR entrypoint_access_time IS NULL)
        AND (panel_connect_time >= 0 OR panel_connect_time IS NULL)
        AND version = 'V2'
)
给出以下用 AWS Redshift 编写的查询:
SELECT session_date,'min' as stats,mini as value,product,endpoint  -- Redshift source query; the outer select exposes only the mini column, labelled 'min'
from
(select  -- inner query: per-day join_time percentiles computed as window functions
distinct trunc(joinstart_ev_timestamp) as session_date,  -- DISTINCT collapses the per-row window output to unique rows
PERCENTILE_DISC(0.02) WITHIN GROUP (ORDER BY join_time) over(partition by  -- 2nd percentile (not a true minimum); column 36 of this line is the '(' after WITHIN GROUP
trunc(joinstart_ev_timestamp))/1000 as mini,  -- /1000: presumably milliseconds -> seconds (TODO confirm units)
PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY join_time) OVER (partition by
trunc(joinstart_ev_timestamp))/1000 as first_quartile,
median(join_time) over(partition by trunc(joinstart_ev_timestamp))/1000 as  -- median over the same per-day partition
jt,
PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY join_time) OVER (partition by
trunc(joinstart_ev_timestamp))/1000 as third_quartile,
PERCENTILE_DISC(0.98) WITHIN GROUP (ORDER BY join_time) over(partition by  -- 98th percentile (aliased maxi, not a true maximum)
trunc(joinstart_ev_timestamp))/1000 as maxi,
product_name as product,
endpoint as endpoint
from qe_datawarehouse.join_session_fact
where
trunc(joinstart_ev_timestamp) between '2018-01-18' and '2018-01-30'  -- inclusive range on the truncated timestamp
and lower(product_name) LIKE 'gotowebinar%'  -- case-insensitive prefix match
and join_time>0 and join_time <= 600000 and join_time is not null  -- the IS NOT NULL test is already implied by join_time>0
and audio_connect_time >= 0
and (entrypoint_access_time >= 0 or entrypoint_access_time is null)  -- negative values rejected, missing values allowed
and (panel_connect_time >= 0 or panel_connect_time is null) and version =
'V2');
我需要将其转换成相应的 Presto 查询。
我试过以下:
- 当我在 Presto(基于 Hive)中运行上述查询时,我收到错误消息: " 查询失败 (#20180212_044343_00014_jb834): 行 5:36: 在 '(' 处缺少 'BY' "
- 我知道我必须以某种方式在 presto 中使用 "approx_percentile()" 但实际上无法使用它。
注意: 在 Redshift 查询中,每一列都被视为字符串,但在 Presto 中,数据类型如下所示:
-- Hive DDL for the join_session_fact table that the queries in this document read.
-- The table is external, stored as ORC, and partitioned by data_input_date; its
-- location is assembled from the hiveconf variables s3bucket, fact_path and
-- join_session_fact.
create external table if not exists join_session_fact (
join_session_fact_id string
,session_tracking_id string
,user_id string
,participant_id string
,meeting_id string
,session_mcs_id string
,browser_name string
,browser_version string
,endpoint string
,entrypoint string
,build_number string
,model_id string
,model_name string
,hardware_net string
,ip_address string
,country string
,region string
,city string
,os_type string
,os_architecture string
,os_locale string
,os_timezone string
,product_name string
,product_version string
,product_tier string
,participant_role string
,timezone string
,joinstart_ev_timestamp timestamp
,joinLaunch_ev_timestamp timestamp
,joinSession_ev_timestamp timestamp
,joinTime_ev_timestamp timestamp
,audioConnect_ev_timestamp timestamp
,connection_type string
,download_start_timestamp timestamp
,download_end_timestamp timestamp
,install_start_timestamp timestamp
,install_end_timestamp timestamp
,password_start_timestamp timestamp
,password_end_timestamp timestamp
,login_start_timestamp timestamp
,login_end_timestamp timestamp
,audioWait_start_timestamp timestamp
,audioWait_end_timestamp timestamp
,hallway_start_timestamp timestamp
,hallway_end_timestamp timestamp
,entrypoint_access_time double
,endpoint_access_time double
,panel_connect_time double
,audio_connect_time double
,install_time_endpoint double
,download_time_endpoint double
,install_time_launcher double
,download_time_launcher double
,join_time double
,process_data_timestamp timestamp
,source_date timestamp
,version string
,event_date timestamp
)
PARTITIONED BY (data_input_date string)
stored as orc
location '${hiveconf:s3bucket}/${hiveconf:fact_path}/${hiveconf:join_session_fact}/'
-- Property keys and values must be delimited by plain ASCII quotes; typographic
-- quotes are not string delimiters. The codec name is written in upper case,
-- the spelling used by the Hive ORC documentation (NONE, ZLIB, SNAPPY).
TBLPROPERTIES ("orc.compress"="SNAPPY");
请注意,当我在 Presto 中运行下面的查询时,它工作正常:
-- Presto: per-day approximate median of join_time (divided by 1000), computed as a
-- window aggregate; DISTINCT reduces the per-row output to unique
-- (day, median, product, endpoint) rows.
SELECT DISTINCT
    CAST(joinstart_ev_timestamp AS date) AS session_date,
    approx_percentile(CAST(join_time AS double), 0.50)
        OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS jt,
    product_name AS product,
    endpoint AS endpoint
FROM datawarehouse.join_session_fact
WHERE CAST(joinstart_ev_timestamp AS date) BETWEEN date '2018-01-18' AND date '2018-01-30'
    AND lower(product_name) LIKE 'gotowebinar%'
    AND join_time > 0
    AND join_time <= 600000
    AND join_time IS NOT NULL
    AND audio_connect_time >= 0
    AND (entrypoint_access_time >= 0 OR entrypoint_access_time IS NULL)
    AND (panel_connect_time >= 0 OR panel_connect_time IS NULL)
    AND version = 'V2'
问题可能出在 WITHIN GROUP 上。据我所知,Presto 不支持这些百分位函数;该错误很可能是因为 Presto 的语法解析器无法识别 WITHIN GROUP (...) 子句。
我找到了到 Presto 的正确转换:
-- Presto translation of the Redshift join-time percentile report.
-- Inner query: for each calendar day of joinstart_ev_timestamp, computes the
-- 2nd/25th/50th/75th/98th percentiles of join_time (each divided by 1000) as
-- window aggregates; DISTINCT collapses the per-row window output to one row per
-- (day, product, endpoint). Outer query: exposes only the 2nd percentile,
-- labelled 'min'.
-- Note: approx_percentile returns an approximation, whereas Redshift's
-- PERCENTILE_DISC returns an exact member of the set and MEDIAN interpolates,
-- so small numeric differences from the Redshift output are expected.
SELECT
    session_date,
    'min' AS stats,
    mini AS value,
    product,
    endpoint
FROM (
    SELECT DISTINCT
        CAST(joinstart_ev_timestamp AS date) AS session_date,
        approx_percentile(CAST(join_time AS double), 0.02)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS mini,
        approx_percentile(CAST(join_time AS double), 0.25)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS first_quartile,
        approx_percentile(CAST(join_time AS double), 0.50)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS jt,
        approx_percentile(CAST(join_time AS double), 0.75)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS third_quartile,
        approx_percentile(CAST(join_time AS double), 0.98)
            OVER (PARTITION BY CAST(joinstart_ev_timestamp AS date)) / 1000 AS maxi,
        product_name AS product,
        endpoint AS endpoint
    FROM datawarehouse.join_session_fact
    -- Same fixed, inclusive day range as the Redshift source. Typed DATE literals keep
    -- the comparison date-to-date; bounds derived from now() are timestamps with time
    -- zone, so they would shift with the time of day at which the query runs.
    -- For a rolling window use, e.g.:
    --   BETWEEN current_date - interval '16' day AND current_date - interval '1' day
    WHERE CAST(joinstart_ev_timestamp AS date) BETWEEN date '2018-01-18' AND date '2018-01-30'
        AND lower(product_name) LIKE 'gotowebinar%'
        -- The IS NOT NULL test is implied by join_time > 0; kept to mirror the source query.
        AND join_time > 0
        AND join_time <= 600000
        AND join_time IS NOT NULL
        AND audio_connect_time >= 0
        AND (entrypoint_access_time >= 0 OR entrypoint_access_time IS NULL)
        AND (panel_connect_time >= 0 OR panel_connect_time IS NULL)
        AND version = 'V2'
)