在 Hive 1.2.1 的视图中使用子查询
sub queries with view in hive 1.2.1
如何在 Hive 视图中实现子查询?据我所知,Hive 查询不允许使用子查询,只能通过连接(JOIN)或联合(UNION)来实现。
但我的场景不同,这两种方式都无法使用。我所有的表都按 updatedate 和 type 列分区。
type 将作为输入参数传入,查询时我必须取最大的 updatedate。
下面是查询
-- Joins table1..table5 on their id columns and tries to restrict each table
-- to the max(updatedate) of the requested type via uncorrelated IN subqueries.
select ..........
From table1 t1
JOIN table2 t2 ON t1.t1id = t2.t2id
JOIN table3 t3 ON t1.t1id = t3.t3id
JOIN table4 t4 ON t1.t1id = t4.t4id
JOIN table5 t5 ON t1.t1id = t5.t5id
where
-- NOTE(review): WHERE is followed directly by AND; presumably a leading
-- predicate was elided from the post -- confirm.
-- NOTE(review): the subqueries read from t1/t3/t4/t5, which are aliases in
-- the outer FROM (table1..table5); presumably the table names were meant -- confirm.
AND t1.updatedate IN (select max(updatedate) updatedate from t1 where type = '${hiveconf:inputtype}' )
-- NOTE(review): this predicate filters t2 but its subquery reads from t1;
-- possibly t2 was intended -- confirm.
AND t2.updatedate IN (select max(updatedate) updatedate from t1 where type = '${hiveconf:inputtype}' )
AND t3.updatedate IN (select max(updatedate) updatedate from t3 where type = '${hiveconf:inputtype}' )
AND t4.updatedate IN (select max(updatedate) updatedate from t4 where type = '${hiveconf:inputtype}' )
AND t5.updatedate IN (select max(updatedate) updatedate from t5 where type = '${hiveconf:inputtype}' )
-- ## Query is not working, it throws exception
我尝试了如下写法,它可以运行,但这里我按 id 做了 GROUP BY。
-- Variant the asker reports as working: tables 2-5 are replaced by derived
-- tables that return max(updatedate) per id for the requested type, while
-- table1 is still filtered by a single IN subquery on its max(updatedate).
select ..........
From table1 t1
-- Each derived table yields one row per id (GROUP BY on the id column) and
-- exposes only that id and its max(updatedate).
JOIN (select max(updatedate) updatedate, t2id from t2 where type = '${hiveconf:inputtype}' group by t2id) t2 ON t1.t1id = t2.t2id
JOIN (select max(updatedate) updatedate, t3id from t3 where type = '${hiveconf:inputtype}' group by t3id) t3 ON t1.t1id = t3.t3id
JOIN (select max(updatedate) updatedate, t4id from t4 where type = '${hiveconf:inputtype}' group by t4id) t4 ON t1.t1id = t4.t4id
JOIN (select max(updatedate) updatedate, t5id from t5 where type = '${hiveconf:inputtype}' group by t5id) t5 ON t1.t1id = t5.t5id
where
-- NOTE(review): the subquery reads from t1, which is the outer alias for
-- table1; presumably the table name was meant -- confirm.
t1.updatedate IN (select max(updatedate) updatedate from t1 where type = '${hiveconf:inputtype}' )
有没有更好的方式来实现这一目标?
当然可以。使用窗口函数(window function):
-- Keep only the most recent row per t2id: number each id's rows by
-- updatedate (newest first) and join on the row numbered 1.
SELECT ..........
FROM table1 t1 JOIN
    (SELECT t2.*,
            -- Order by updatedate, the partition column used by the rest of
            -- the queries (the earlier spelling "updateddate" does not exist).
            ROW_NUMBER() OVER (PARTITION BY t2.t2id ORDER BY t2.updatedate DESC) as seqnum
     FROM table2 t2
     -- Restrict to the requested type before ranking, matching the filter
     -- applied in the asker's derived tables.
     WHERE t2.type = '${hiveconf:inputtype}'
    ) t2
    ON t1.t1id = t2.t2id AND t2.seqnum = 1 JOIN
. . .
对其余的表按同样的方式继续处理。
如何在 Hive 视图中实现子查询?据我所知,Hive 查询不允许使用子查询,只能通过连接(JOIN)或联合(UNION)来实现。 但我的场景不同,这两种方式都无法使用。我所有的表都按 updatedate 和 type 列分区。 type 将作为输入参数传入,查询时我必须取最大的 updatedate。
下面是查询
-- Joins table1..table5 on their id columns and tries to restrict each table
-- to the max(updatedate) of the requested type via uncorrelated IN subqueries.
select ..........
From table1 t1
JOIN table2 t2 ON t1.t1id = t2.t2id
JOIN table3 t3 ON t1.t1id = t3.t3id
JOIN table4 t4 ON t1.t1id = t4.t4id
JOIN table5 t5 ON t1.t1id = t5.t5id
where
-- NOTE(review): WHERE is followed directly by AND; presumably a leading
-- predicate was elided from the post -- confirm.
-- NOTE(review): the subqueries read from t1/t3/t4/t5, which are aliases in
-- the outer FROM (table1..table5); presumably the table names were meant -- confirm.
AND t1.updatedate IN (select max(updatedate) updatedate from t1 where type = '${hiveconf:inputtype}' )
-- NOTE(review): this predicate filters t2 but its subquery reads from t1;
-- possibly t2 was intended -- confirm.
AND t2.updatedate IN (select max(updatedate) updatedate from t1 where type = '${hiveconf:inputtype}' )
AND t3.updatedate IN (select max(updatedate) updatedate from t3 where type = '${hiveconf:inputtype}' )
AND t4.updatedate IN (select max(updatedate) updatedate from t4 where type = '${hiveconf:inputtype}' )
AND t5.updatedate IN (select max(updatedate) updatedate from t5 where type = '${hiveconf:inputtype}' )
-- ## Query is not working, it throws exception
我尝试了如下写法,它可以运行,但这里我按 id 做了 GROUP BY。
-- Variant the asker reports as working: tables 2-5 are replaced by derived
-- tables that return max(updatedate) per id for the requested type, while
-- table1 is still filtered by a single IN subquery on its max(updatedate).
select ..........
From table1 t1
-- Each derived table yields one row per id (GROUP BY on the id column) and
-- exposes only that id and its max(updatedate).
JOIN (select max(updatedate) updatedate, t2id from t2 where type = '${hiveconf:inputtype}' group by t2id) t2 ON t1.t1id = t2.t2id
JOIN (select max(updatedate) updatedate, t3id from t3 where type = '${hiveconf:inputtype}' group by t3id) t3 ON t1.t1id = t3.t3id
JOIN (select max(updatedate) updatedate, t4id from t4 where type = '${hiveconf:inputtype}' group by t4id) t4 ON t1.t1id = t4.t4id
JOIN (select max(updatedate) updatedate, t5id from t5 where type = '${hiveconf:inputtype}' group by t5id) t5 ON t1.t1id = t5.t5id
where
-- NOTE(review): the subquery reads from t1, which is the outer alias for
-- table1; presumably the table name was meant -- confirm.
t1.updatedate IN (select max(updatedate) updatedate from t1 where type = '${hiveconf:inputtype}' )
有没有更好的方式来实现这一目标?
当然可以。使用窗口函数(window function):
-- Keep only the most recent row per t2id: number each id's rows by
-- updatedate (newest first) and join on the row numbered 1.
SELECT ..........
FROM table1 t1 JOIN
    (SELECT t2.*,
            -- Order by updatedate, the partition column used by the rest of
            -- the queries (the earlier spelling "updateddate" does not exist).
            ROW_NUMBER() OVER (PARTITION BY t2.t2id ORDER BY t2.updatedate DESC) as seqnum
     FROM table2 t2
     -- Restrict to the requested type before ranking, matching the filter
     -- applied in the asker's derived tables.
     WHERE t2.type = '${hiveconf:inputtype}'
    ) t2
    ON t1.t1id = t2.t2id AND t2.seqnum = 1 JOIN
. . .
对其余的表按同样的方式继续处理。