从 Informix 到 HiveQL

Informix to HiveQL

我有一个为 Informix 编写的 SQL 查询:
-- Original Informix query from the question. OUTER(...) is the
-- Informix-specific outer-join notation; the join predicates live in WHERE.
-- `cols` is a placeholder for the real select-list.
SELECT cols
FROM table1 t1, OUTER(table2 t2, table3 t3)
WHERE t1.id = t2.id
  AND t2.type = t3.type

我们在 Hive 上有相同的表和数据,我想将其转换为 HiveQL

使用 LEFT JOIN 而不是 OUTER

-- HiveQL rewrite: table2 and table3 are inner-joined in a derived table first,
-- and that result is then left-joined to table1 so every table1 row is kept.
-- `cols` / `other_cols` are placeholders for the real column lists.
SELECT cols
FROM table1 t1
LEFT JOIN (
    SELECT
        t2.id AS join_key,
        other_cols
    FROM table2 t2
    INNER JOIN table3 t3
        ON t2.type = t3.type
) s
    ON s.join_key = t1.id

Informix 风格的外连接非常有趣、独特,而且完全不符合标准。而 Informix 对 ANSI(ISO)标准 SQL 外连接的实现则应该平淡无奇,与其他 DBMS 的行为相同。

这是一个 SQL 脚本,它创建并填充 3 个表并对这些表执行 5 个不同的查询。

-- table1: the preserved (left-hand) table in every outer-join example below.
-- The primary key is declared as a table-level constraint.
CREATE TABLE table1
(
    id   SERIAL      NOT NULL,
    data VARCHAR(32) NOT NULL,
    PRIMARY KEY (id)
);

-- table2: joined to table1 on id and to table3 on type; keyed by (id, type).
-- No foreign keys are declared here: the sample rows in this script include
-- an id (107) and a type (400) that have no matching row in table1 / table3.
CREATE TABLE table2
(
    id   INTEGER     NOT NULL,
    type INTEGER     NOT NULL,
    info VARCHAR(32) NOT NULL,
    PRIMARY KEY (id, type)
);

-- table3: one row per type, joined from table2 on type.
-- The primary key is declared as a table-level constraint.
CREATE TABLE table3
(
    type INTEGER     NOT NULL,
    name VARCHAR(32) NOT NULL,
    PRIMARY KEY (type)
);

-- Sample rows for table1 (ids 100-104). Ids 102 and 104 have no rows in
-- table2, so they exercise the "unmatched outer row" case.
-- Explicit column lists keep the inserts correct if the column order changes.
INSERT INTO table1 (id, data) VALUES (100, 'Table 1 - ID 100');
INSERT INTO table1 (id, data) VALUES (101, 'Table 1 - ID 101');
INSERT INTO table1 (id, data) VALUES (102, 'Table 1 - ID 102');
INSERT INTO table1 (id, data) VALUES (103, 'Table 1 - ID 103');
INSERT INTO table1 (id, data) VALUES (104, 'Table 1 - ID 104');

-- Sample rows for table2. Type 400 has no row in table3 and id 107 has no row
-- in table1, so these rows distinguish the different join formulations.
-- Explicit column lists keep the inserts correct if the column order changes.
INSERT INTO table2 (id, type, info) VALUES (100, 300, 'Table 2 - ID 100, Type 300');
INSERT INTO table2 (id, type, info) VALUES (100, 301, 'Table 2 - ID 100, Type 301');
INSERT INTO table2 (id, type, info) VALUES (100, 302, 'Table 2 - ID 100, Type 302');
INSERT INTO table2 (id, type, info) VALUES (101, 301, 'Table 2 - ID 101, Type 301');
INSERT INTO table2 (id, type, info) VALUES (101, 400, 'Table 2 - ID 101, Type 400');
INSERT INTO table2 (id, type, info) VALUES (101, 302, 'Table 2 - ID 101, Type 302');
INSERT INTO table2 (id, type, info) VALUES (103, 302, 'Table 2 - ID 103, Type 302');
INSERT INTO table2 (id, type, info) VALUES (103, 303, 'Table 2 - ID 103, Type 303');
INSERT INTO table2 (id, type, info) VALUES (103, 300, 'Table 2 - ID 103, Type 300');
INSERT INTO table2 (id, type, info) VALUES (107, 300, 'Table 2 - ID 107, Type 300');
INSERT INTO table2 (id, type, info) VALUES (107, 400, 'Table 2 - ID 107, Type 400');

-- Sample rows for table3 (types 300-305). Types 304 and 305 are not used by
-- any table2 row in this script.
-- Explicit column lists keep the inserts correct if the column order changes.
INSERT INTO table3 (type, name) VALUES (300, 'Table 3 - Type 300');
INSERT INTO table3 (type, name) VALUES (301, 'Table 3 - Type 301');
INSERT INTO table3 (type, name) VALUES (302, 'Table 3 - Type 302');
INSERT INTO table3 (type, name) VALUES (303, 'Table 3 - Type 303');
INSERT INTO table3 (type, name) VALUES (304, 'Table 3 - Type 304');
INSERT INTO table3 (type, name) VALUES (305, 'Table 3 - Type 305');
-- Query 1: the Informix-style outer join from the question, with the
-- select-list spelled out.
SELECT
    t1.id   AS t1_id,
    t1.data,
    t2.id   AS t2_id,
    t2.type AS t2_type,
    t2.info,
    t3.type AS t3_type,
    t3.name
FROM table1 t1, OUTER(table2 t2, table3 t3)
WHERE t1.id = t2.id
  AND t2.type = t3.type;

-- Query 2: variant of the Informix-style query with an extra OUTER keyword
-- in front of table3 inside the OUTER(...) group.
SELECT
    t1.id   AS t1_id,
    t1.data,
    t2.id   AS t2_id,
    t2.type AS t2_type,
    t2.info,
    t3.type AS t3_type,
    t3.name
FROM table1 t1, OUTER(table2 t2, OUTER table3 t3)
WHERE t1.id = t2.id
  AND t2.type = t3.type;

-- Query 3: ANSI form using two chained LEFT JOINs
-- (table1 -> table2 -> table3).
SELECT
    t1.id   AS t1_id,
    t1.data,
    t2.id   AS t2_id,
    t2.type AS t2_type,
    t2.info,
    t3.type AS t3_type,
    t3.name
FROM table1 t1
LEFT JOIN table2 t2
    ON t1.id = t2.id
LEFT JOIN table3 t3
    ON t2.type = t3.type;

-- Query 4: table2 and table3 are inner-joined in a derived table (s), which
-- is then left-joined to table1 on the table2 id (exposed as join_key).
SELECT
    t1.id      AS t1_id,
    t1.data,
    s.join_key AS t2_id,
    s.t2_type,
    s.info,
    s.t3_type,
    s.name
FROM table1 t1
LEFT JOIN (
    SELECT
        t2.id   AS join_key,
        t2.info,
        t3.name,
        t2.type AS t2_type,
        t3.type AS t3_type
    FROM table2 t2
    INNER JOIN table3 t3
        ON t2.type = t3.type
) s
    ON s.join_key = t1.id;

-- Query 5: LEFT JOIN to table2 followed by a plain inner join to table3,
-- without a derived table. Not equivalent to Query 4: the inner join is
-- applied after the LEFT JOIN, so rows whose table2 columns are NULL cannot
-- satisfy t2.type = t3.type and are dropped.
SELECT
    t1.id   AS t1_id,
    t1.data,
    t2.id   AS t2_id,
    t2.type AS t2_type,
    t2.info,
    t3.type AS t3_type,
    t3.name
FROM table1 t1
LEFT JOIN table2 t2
    ON t1.id = t2.id
INNER JOIN table3 t3
    ON t2.type = t3.type;

第一个查询是问题中的查询,指定了选定的列名。列名列表在每个查询中都是相同的。

第二个查询是一个变体,使用了额外的 OUTER 关键字;它对应于第三个查询。第三个查询是 @leftjoin 给出的第一个答案。第四个查询是“当前答案”(严格来说,是第三次修订)。第五个查询是第四个查询的变体——它产生不同的结果,因此并不等价。

这些是查询的输出。

查询 1

-- Query 1 as executed (Informix OUTER(...) form from the question). In the
-- rows listed below, table1 ids 102 and 104 appear with no table2/table3 values.
SELECT t1.id as t1_id, t1.data, t2.id as t2_id, t2.type as t2_type, t2.info, t3.type as t3_type, t3.name
  FROM table1 t1, OUTER(table2 t2, table3 t3)
 WHERE t1.id = t2.id AND t2.type = t3.type;
t1_id data t2_id t2_type info t3_type name
100 Table 1 - ID 100 100 300 Table 2 - ID 100, Type 300 300 Table 3 - Type 300
100 Table 1 - ID 100 100 301 Table 2 - ID 100, Type 301 301 Table 3 - Type 301
100 Table 1 - ID 100 100 302 Table 2 - ID 100, Type 302 302 Table 3 - Type 302
101 Table 1 - ID 101 101 301 Table 2 - ID 101, Type 301 301 Table 3 - Type 301
101 Table 1 - ID 101 101 302 Table 2 - ID 101, Type 302 302 Table 3 - Type 302
102 Table 1 - ID 102
103 Table 1 - ID 103 103 300 Table 2 - ID 103, Type 300 300 Table 3 - Type 300
103 Table 1 - ID 103 103 302 Table 2 - ID 103, Type 302 302 Table 3 - Type 302
103 Table 1 - ID 103 103 303 Table 2 - ID 103, Type 303 303 Table 3 - Type 303
104 Table 1 - ID 104

查询 2

-- Query 2 as executed (extra OUTER keyword before table3). The rows listed
-- below include the table2 row (101, 400) with no table3 values.
SELECT t1.id as t1_id, t1.data, t2.id as t2_id, t2.type as t2_type, t2.info, t3.type as t3_type, t3.name
  FROM table1 t1, OUTER(table2 t2, OUTER table3 t3)
 WHERE t1.id = t2.id AND t2.type = t3.type;
t1_id data t2_id t2_type info t3_type name
100 Table 1 - ID 100 100 300 Table 2 - ID 100, Type 300 300 Table 3 - Type 300
100 Table 1 - ID 100 100 301 Table 2 - ID 100, Type 301 301 Table 3 - Type 301
100 Table 1 - ID 100 100 302 Table 2 - ID 100, Type 302 302 Table 3 - Type 302
101 Table 1 - ID 101 101 301 Table 2 - ID 101, Type 301 301 Table 3 - Type 301
101 Table 1 - ID 101 101 302 Table 2 - ID 101, Type 302 302 Table 3 - Type 302
101 Table 1 - ID 101 101 400 Table 2 - ID 101, Type 400
102 Table 1 - ID 102
103 Table 1 - ID 103 103 300 Table 2 - ID 103, Type 300 300 Table 3 - Type 300
103 Table 1 - ID 103 103 302 Table 2 - ID 103, Type 302 302 Table 3 - Type 302
103 Table 1 - ID 103 103 303 Table 2 - ID 103, Type 303 303 Table 3 - Type 303
104 Table 1 - ID 104

查询 3

-- Query 3 as executed (two chained LEFT JOINs). The rows listed below include
-- the table2 row (101, 400) with no table3 values.
SELECT t1.id as t1_id, t1.data, t2.id as t2_id, t2.type as t2_type, t2.info, t3.type as t3_type, t3.name
  FROM table1 t1
  LEFT JOIN table2 t2 ON t1.id = t2.id
  LEFT JOIN table3 t3 ON t2.type = t3.type;
t1_id data t2_id t2_type info t3_type name
100 Table 1 - ID 100 100 300 Table 2 - ID 100, Type 300 300 Table 3 - Type 300
100 Table 1 - ID 100 100 301 Table 2 - ID 100, Type 301 301 Table 3 - Type 301
100 Table 1 - ID 100 100 302 Table 2 - ID 100, Type 302 302 Table 3 - Type 302
101 Table 1 - ID 101 101 301 Table 2 - ID 101, Type 301 301 Table 3 - Type 301
101 Table 1 - ID 101 101 302 Table 2 - ID 101, Type 302 302 Table 3 - Type 302
101 Table 1 - ID 101 101 400 Table 2 - ID 101, Type 400
102 Table 1 - ID 102
103 Table 1 - ID 103 103 300 Table 2 - ID 103, Type 300 300 Table 3 - Type 300
103 Table 1 - ID 103 103 302 Table 2 - ID 103, Type 302 302 Table 3 - Type 302
103 Table 1 - ID 103 103 303 Table 2 - ID 103, Type 303 303 Table 3 - Type 303
104 Table 1 - ID 104

查询 4

-- Query 4 as executed (LEFT JOIN to a derived table that inner-joins table2
-- and table3). In the rows listed below, table1 ids 102 and 104 appear with
-- no table2/table3 values.
SELECT t1.id as t1_id, t1.data, s.join_key as t2_id, s.t2_type, s.info, s.t3_type, s.name
  FROM table1 t1
  LEFT JOIN 
       (SELECT t2.id AS join_key, t2.info, t3.name, t2.type as t2_type, t3.type as t3_type
          FROM table2 t2
          JOIN table3 t3 ON t2.type = t3.type
       ) s ON s.join_key = t1.id;
t1_id data t2_id t2_type info t3_type name
100 Table 1 - ID 100 100 300 Table 2 - ID 100, Type 300 300 Table 3 - Type 300
100 Table 1 - ID 100 100 301 Table 2 - ID 100, Type 301 301 Table 3 - Type 301
100 Table 1 - ID 100 100 302 Table 2 - ID 100, Type 302 302 Table 3 - Type 302
101 Table 1 - ID 101 101 301 Table 2 - ID 101, Type 301 301 Table 3 - Type 301
101 Table 1 - ID 101 101 302 Table 2 - ID 101, Type 302 302 Table 3 - Type 302
102 Table 1 - ID 102
103 Table 1 - ID 103 103 300 Table 2 - ID 103, Type 300 300 Table 3 - Type 300
103 Table 1 - ID 103 103 302 Table 2 - ID 103, Type 302 302 Table 3 - Type 302
103 Table 1 - ID 103 103 303 Table 2 - ID 103, Type 303 303 Table 3 - Type 303
104 Table 1 - ID 104

查询 5

-- Query 5 as executed (LEFT JOIN to table2, then a plain JOIN to table3).
-- The rows listed below contain no entries for table1 ids 102 and 104.
SELECT t1.id as t1_id, t1.data, t2.id as t2_id, t2.type as t2_type, t2.info, t3.type as t3_type, t3.name
  FROM table1 t1
  LEFT JOIN table2 t2 ON t1.id = t2.id
  JOIN table3 t3 ON t2.type = t3.type;
t1_id data t2_id t2_type info t3_type name
100 Table 1 - ID 100 100 300 Table 2 - ID 100, Type 300 300 Table 3 - Type 300
103 Table 1 - ID 103 103 300 Table 2 - ID 103, Type 300 300 Table 3 - Type 300
100 Table 1 - ID 100 100 301 Table 2 - ID 100, Type 301 301 Table 3 - Type 301
101 Table 1 - ID 101 101 301 Table 2 - ID 101, Type 301 301 Table 3 - Type 301
100 Table 1 - ID 100 100 302 Table 2 - ID 100, Type 302 302 Table 3 - Type 302
101 Table 1 - ID 101 101 302 Table 2 - ID 101, Type 302 302 Table 3 - Type 302
103 Table 1 - ID 103 103 302 Table 2 - ID 103, Type 302 302 Table 3 - Type 302
103 Table 1 - ID 103 103 303 Table 2 - ID 103, Type 303 303 Table 3 - Type 303

除非我遗漏了什么,Q1 和 Q4 的输出是相同的(所以 @leftjoin 的回答是正确的);从上面列出的结果看,Q2 和 Q3 的输出彼此相同,但与 Q1 和 Q4 不同(多出 ID 101、Type 400 这一行);Q5 的输出则与其他所有查询都不同。