MySQL 按最接近的日期时间进行一对一连接
MySQL join 1-to-1 by closest datetime
我需要按近似日期连接两个表,使 `old` 表中的每一行仅与 `new` 表中日期最接近的那一行相匹配——一对一。
`new` 表中的行不允许被重复使用——每一行只能匹配一次,以取得最小的时间差。
这里有一些例子可以尝试:
-- Sample data for the question: rows of `old` are to be paired with rows of
-- `new` that share the same ID and have the closest `date`.
CREATE TABLE `new` (
    `ID`   int(2),
    `date` datetime,
    `new`  varchar(1)
);

INSERT INTO `new` (`ID`, `date`, `new`)
VALUES (1, '2016-03-02 12:20:00', 't')
     , (1, '2016-03-07 12:20:00', 'u')
     , (1, '2016-04-02 12:20:00', 'v')
     , (2, '2016-04-12 11:03:00', 'x');

CREATE TABLE `old` (
    `ID`   int(2),
    `date` datetime,
    `old`  varchar(1)
);

INSERT INTO `old` (`ID`, `date`, `old`)
VALUES (1, '2016-03-07 12:20:00', 'a')
     , (1, '2016-04-02 12:20:00', 'b')
     , (1, '2016-03-01 10:09:00', 'c')
     , (1, '2015-04-12 10:09:00', 'd')
     , (1, '2016-03-03 12:20:00', 'e');
我期望的输出是这样的:
ID old.date old new.date new
1 2016-03-07 12:20:00 a 2016-03-07 12:20:00 u
1 2016-04-02 12:20:00 b 2016-04-02 12:20:00 v
1 2016-03-01 10:09:00 c NULL NULL
1 2015-04-12 10:09:00 d NULL NULL
1 2016-03-03 12:20:00 e 2016-03-02 12:20:00 t
2 NULL NULL 2016-04-12 11:03:00 x
我可以通过以下方式稍微接近这个:
-- Windowed attempt: pair rows that share an ID and whose dates are less than
-- two whole days apart. The LEFT JOIN / RIGHT JOIN pair combined with UNION
-- emulates a full outer join. Every row inside the window matches, not only
-- the closest one.
SELECT *
FROM old AS A
LEFT JOIN new AS B
    ON  A.ID = B.ID
    AND ABS(TIMESTAMPDIFF(DAY, A.date, B.date)) < 2
UNION
SELECT *
FROM old AS A
RIGHT JOIN new AS B
    ON  A.ID = B.ID
    AND ABS(TIMESTAMPDIFF(DAY, A.date, B.date)) < 2
ORDER BY old
但很明显,这样会在指定的时间窗口内匹配到多行,而不仅仅是最佳匹配。调整天数对我来说并不是解决办法,因为实际中我要连接的是两个非常大的表,所用的时间窗口内许多行都会有多个匹配项。
考虑以下...
-- Rebuild both sample tables from scratch so the script can be re-run.
-- Both tables are dropped first: dropping only `new` makes CREATE TABLE old
-- fail whenever `old` already exists.
DROP TABLE IF EXISTS new;
DROP TABLE IF EXISTS old;

-- (ID, date) is the primary key, so both key columns are NOT NULL.
CREATE TABLE new
(ID INT NOT NULL
,date DATETIME NOT NULL
,new CHAR(1)
,PRIMARY KEY(ID,date)
);

INSERT INTO new (ID, date, new) VALUES
(1, '2016-03-02 12:20:00', 't'),
(1, '2016-03-07 12:20:00', 'u'),
(1, '2016-04-02 12:20:00', 'v'),
(2, '2016-04-12 11:03:00', 'x')
;

CREATE TABLE old
(ID INT NOT NULL
,date DATETIME NOT NULL
,old CHAR(1)
,PRIMARY KEY(ID,date)
);

INSERT INTO old (ID, date, old) VALUES
(1, '2016-03-07 12:20:00', 'a'),
(1, '2016-04-02 12:20:00', 'b'),
(1, '2016-03-01 10:09:00', 'c'),
(1, '2015-04-12 10:09:00', 'd'),
(1, '2016-03-03 12:20:00', 'e');
-- Closest-date match between `old` and `new`, emulating a full outer join
-- with a LEFT JOIN branch and a RIGHT JOIN branch combined by UNION.
--
-- Derived table `a`: each `old` row plus `delta`, the smallest absolute
-- difference (in seconds) to any `new` row with the same id. It is built with
-- an inner join, so `old` rows whose id has no `new` row do not appear.
--
-- A `new` row can be the closest one for several `old` rows, so this result
-- is not yet one-to-one.
SELECT
    a.id   AS old_id,
    a.date AS old_date,
    a.old,
    b.id   AS new_id,
    b.date AS new_date,
    b.new
FROM (
    SELECT
        old.*,
        MIN(ABS(UNIX_TIMESTAMP(old.date) - UNIX_TIMESTAMP(new.date))) AS delta
    FROM old
    INNER JOIN new
        ON new.id = old.id
    GROUP BY
        old.id,
        old.date
) AS a
LEFT JOIN new AS b
    ON  b.id = a.id
    AND ABS(UNIX_TIMESTAMP(a.date) - UNIX_TIMESTAMP(b.date)) = a.delta
UNION
-- Second branch: keeps every `new` row, including those that are not the
-- closest match of any `old` row (their old_* columns are NULL).
SELECT
    a.id   AS old_id,
    a.date AS old_date,
    a.old,
    b.id   AS new_id,
    b.date AS new_date,
    b.new
FROM (
    SELECT
        old.*,
        MIN(ABS(UNIX_TIMESTAMP(old.date) - UNIX_TIMESTAMP(new.date))) AS delta
    FROM old
    INNER JOIN new
        ON new.id = old.id
    GROUP BY
        old.id,
        old.date
) AS a
RIGHT JOIN new AS b
    ON  b.id = a.id
    AND ABS(UNIX_TIMESTAMP(a.date) - UNIX_TIMESTAMP(b.date)) = a.delta
-- Rows without an `old` value sort last.
ORDER BY
    old IS NULL,
    old
;
+--------+---------------------+------+--------+---------------------+------+
| old_id | old_date | old | new_id | new_date | new |
+--------+---------------------+------+--------+---------------------+------+
| 1 | 2016-03-07 12:20:00 | a | 1 | 2016-03-07 12:20:00 | u |
| 1 | 2016-04-02 12:20:00 | b | 1 | 2016-04-02 12:20:00 | v |
| 1 | 2016-03-01 10:09:00 | c | 1 | 2016-03-02 12:20:00 | t |
| 1 | 2015-04-12 10:09:00 | d | 1 | 2016-03-02 12:20:00 | t |
| 1 | 2016-03-03 12:20:00 | e | 1 | 2016-03-02 12:20:00 | t |
| NULL | NULL | NULL | 2 | 2016-04-12 11:03:00 | x |
+--------+---------------------+------+--------+---------------------+------+
对于这个难题的最后一部分 - 删除重复项,我可能会在应用程序代码中处理它。
看来,要实现近似的一对一连接,唯一的办法是在存储过程中使用游标(cursor)。
感谢 @Strawberry 为我指明了正确的方向——你会看到下面重用了你的代码片段。这是最终对我有效的解决方案。它输出的记录顺序有所不同,但至少实现了真正的一对一匹配。
-- amerge(): greedy one-to-one match of `old` rows to `new` rows that share an
-- ID, by closest `date` (difference in whole hours, via TIMESTAMPDIFF).
--
-- `old` rows are processed in ascending order of their smallest difference.
-- Each row claims its closest `new` row if that row is still free; the
-- claimed `new` row is then removed from the candidate list.
--
-- Result set columns: id_1, date_1, id_2, date_2, old, new, diff
--   * one row per `old` row; id_2 / date_2 / new are NULL when its closest
--     `new` row was already claimed, or when no `new` row shares its ID
--   * one row per `new` row that was never claimed (old-side columns NULL)
--
-- Side effect: temporary tables t1, t2 and t3 remain in the session after
-- the call. They are dropped and rebuilt at the start of every call.
DROP PROCEDURE IF EXISTS amerge;
DELIMITER //
CREATE PROCEDURE amerge()
BEGIN
  /* Declarations: MySQL requires variables, then cursors, then handlers */
  DECLARE o_ID INT DEFAULT 0;
  DECLARE o_date DATETIME;         -- same type as old.date
  DECLARE o_old VARCHAR(2);
  DECLARE o_mdiff BIGINT;          -- whole hours; compared to t2.diff exactly
  DECLARE ct INT DEFAULT 0;
  DECLARE done INT DEFAULT FALSE;
  -- The greedy pass depends on this order, so it is stated explicitly here
  -- instead of relying on the physical row order of t1.
  DECLARE cursor1 CURSOR FOR
    SELECT ID, date, old, mdiff
    FROM t1
    ORDER BY mdiff ASC, ID ASC, date ASC;
  DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

  /* Start clean so the procedure can be called repeatedly in one session */
  DROP TEMPORARY TABLE IF EXISTS t1;
  DROP TEMPORARY TABLE IF EXISTS t2;
  DROP TEMPORARY TABLE IF EXISTS t3;

  /* Temporary tables */
  -- t1: every `old` row with mdiff = smallest hour difference to a `new` row
  -- of the same ID. LEFT JOIN keeps `old` rows whose ID has no `new` row
  -- (mdiff is NULL for them).
  CREATE TEMPORARY TABLE t1
    SELECT old.ID   AS ID,
           old.date AS date,
           old.old  AS old,
           MIN(ABS(TIMESTAMPDIFF(HOUR, old.date, new.date))) AS mdiff
    FROM old
    LEFT JOIN new ON old.ID = new.ID
    GROUP BY old.ID, old.date, old.old;

  -- t2: all candidate (old, new) pairs sharing an ID, with their hour difference
  CREATE TEMPORARY TABLE t2
    SELECT old.ID   AS ID_1,
           old.date AS date_1,
           new.ID   AS ID_2,
           new.date AS date_2,
           old.old  AS old,
           new.new  AS new,
           ABS(TIMESTAMPDIFF(HOUR, old.date, new.date)) AS diff
    FROM old
    INNER JOIN new ON old.ID = new.ID;

  -- t3: result rows, filled by the cursor loop below
  CREATE TEMPORARY TABLE t3
    (id_1 INT, date_1 DATETIME, id_2 INT, date_2 DATETIME,
     old VARCHAR(2), new VARCHAR(2), diff BIGINT);

  /* Cursor */
  OPEN cursor1;
  getparams: LOOP
    FETCH cursor1 INTO o_ID, o_date, o_old, o_mdiff;
    IF done THEN
      LEAVE getparams;
    END IF;

    -- Is the closest `new` row for this `old` row still unclaimed?
    SELECT COUNT(*) FROM t2
    WHERE t2.ID_1 = o_ID AND t2.date_1 = o_date
      AND t2.old = o_old AND t2.diff = o_mdiff
    INTO ct;

    IF ct = 0 THEN
      -- Closest candidate already claimed (or none exists): keep the `old`
      -- row unmatched.
      INSERT INTO t3 (id_1, date_1, id_2, date_2, old, new, diff)
      VALUES (o_ID, o_date, NULL, NULL, o_old, NULL, o_mdiff);
    ELSE
      -- Claim one closest candidate; ORDER BY makes the choice deterministic
      -- when two `new` rows are equally close.
      INSERT INTO t3 (id_1, date_1, id_2, date_2, old, new, diff)
      SELECT t2.ID_1, t2.date_1, t2.ID_2, t2.date_2, t2.old, t2.new, t2.diff
      FROM t2
      WHERE t2.ID_1 = o_ID AND t2.date_1 = o_date
        AND t2.old = o_old AND t2.diff = o_mdiff
      ORDER BY t2.date_2
      LIMIT 1;
    END IF;

    -- Remove the claimed `new` row from all remaining candidates. The
    -- subquery is restricted to the current ID as well as the current date,
    -- so a same-dated `old` row of another ID cannot release the wrong row.
    DELETE FROM t2
    WHERE t2.ID_2 = o_ID
      AND t2.date_2 IN (SELECT t3.date_2
                        FROM t3
                        WHERE t3.id_1 = o_ID AND t3.date_1 = o_date);
  END LOOP getparams;
  CLOSE cursor1;

  /* Workaround: MySQL cannot reference one temporary table twice in a single
     query, so t2 is rebuilt as a copy of t3 for the output statement. */
  DROP TEMPORARY TABLE t2;
  CREATE TEMPORARY TABLE t2
    SELECT * FROM t3;

  /* Output: matched / unmatched `old` rows, plus `new` rows never claimed */
  SELECT id_1, date_1, id_2, date_2, old, new, diff FROM t2
  UNION
  SELECT NULL AS ID_1, NULL AS date_1, new.ID AS ID_2, new.date AS date_2,
         NULL AS old, new.new AS new, NULL AS diff
  FROM new
  LEFT JOIN t3 ON t3.ID_2 = new.ID AND t3.date_2 = new.date
  WHERE t3.ID_2 IS NULL;
END //
DELIMITER ;
CALL amerge();
输出是(使用上例中的数据,主键设置为 ID+日期):
id_1 date_1 id_2 date_2 old new diff
1 2016-03-07 12:20:00 1 2016-03-07 12:20:00 a u 0
1 2016-04-02 12:20:00 1 2016-04-02 12:20:00 b v 0
1 2016-03-03 12:20:00 1 2016-03-02 12:20:00 e t 24
1 2016-03-01 10:09:00 NULL NULL c NULL 26
1 2015-04-12 10:09:00 NULL NULL d NULL 7802
NULL NULL 2 2016-04-12 11:03:00 NULL x NULL
我需要按近似日期连接两个表,使 `old` 表中的每一行仅与 `new` 表中日期最接近的那一行相匹配——一对一。
`new` 表中的行不允许被重复使用——每一行只能匹配一次,以取得最小的时间差。
这里有一些例子可以尝试:
-- Sample data for the question: rows of `old` are to be paired with rows of
-- `new` that share the same ID and have the closest `date`.
CREATE TABLE `new` (
    `ID`   int(2),
    `date` datetime,
    `new`  varchar(1)
);

INSERT INTO `new` (`ID`, `date`, `new`)
VALUES (1, '2016-03-02 12:20:00', 't')
     , (1, '2016-03-07 12:20:00', 'u')
     , (1, '2016-04-02 12:20:00', 'v')
     , (2, '2016-04-12 11:03:00', 'x');

CREATE TABLE `old` (
    `ID`   int(2),
    `date` datetime,
    `old`  varchar(1)
);

INSERT INTO `old` (`ID`, `date`, `old`)
VALUES (1, '2016-03-07 12:20:00', 'a')
     , (1, '2016-04-02 12:20:00', 'b')
     , (1, '2016-03-01 10:09:00', 'c')
     , (1, '2015-04-12 10:09:00', 'd')
     , (1, '2016-03-03 12:20:00', 'e');
我期望的输出是这样的:
ID old.date old new.date new
1 2016-03-07 12:20:00 a 2016-03-07 12:20:00 u
1 2016-04-02 12:20:00 b 2016-04-02 12:20:00 v
1 2016-03-01 10:09:00 c NULL NULL
1 2015-04-12 10:09:00 d NULL NULL
1 2016-03-03 12:20:00 e 2016-03-02 12:20:00 t
2 NULL NULL 2016-04-12 11:03:00 x
我可以通过以下方式稍微接近这个:
-- Windowed attempt: pair rows that share an ID and whose dates are less than
-- two whole days apart. The LEFT JOIN / RIGHT JOIN pair combined with UNION
-- emulates a full outer join. Every row inside the window matches, not only
-- the closest one.
SELECT *
FROM old AS A
LEFT JOIN new AS B
    ON  A.ID = B.ID
    AND ABS(TIMESTAMPDIFF(DAY, A.date, B.date)) < 2
UNION
SELECT *
FROM old AS A
RIGHT JOIN new AS B
    ON  A.ID = B.ID
    AND ABS(TIMESTAMPDIFF(DAY, A.date, B.date)) < 2
ORDER BY old
但很明显,这样会在指定的时间窗口内匹配到多行,而不仅仅是最佳匹配。调整天数对我来说并不是解决办法,因为实际中我要连接的是两个非常大的表,所用的时间窗口内许多行都会有多个匹配项。
考虑以下...
-- Rebuild both sample tables from scratch so the script can be re-run.
-- Both tables are dropped first: dropping only `new` makes CREATE TABLE old
-- fail whenever `old` already exists.
DROP TABLE IF EXISTS new;
DROP TABLE IF EXISTS old;

-- (ID, date) is the primary key, so both key columns are NOT NULL.
CREATE TABLE new
(ID INT NOT NULL
,date DATETIME NOT NULL
,new CHAR(1)
,PRIMARY KEY(ID,date)
);

INSERT INTO new (ID, date, new) VALUES
(1, '2016-03-02 12:20:00', 't'),
(1, '2016-03-07 12:20:00', 'u'),
(1, '2016-04-02 12:20:00', 'v'),
(2, '2016-04-12 11:03:00', 'x')
;

CREATE TABLE old
(ID INT NOT NULL
,date DATETIME NOT NULL
,old CHAR(1)
,PRIMARY KEY(ID,date)
);

INSERT INTO old (ID, date, old) VALUES
(1, '2016-03-07 12:20:00', 'a'),
(1, '2016-04-02 12:20:00', 'b'),
(1, '2016-03-01 10:09:00', 'c'),
(1, '2015-04-12 10:09:00', 'd'),
(1, '2016-03-03 12:20:00', 'e');
-- Closest-date match between `old` and `new`, emulating a full outer join
-- with a LEFT JOIN branch and a RIGHT JOIN branch combined by UNION.
--
-- Derived table `a`: each `old` row plus `delta`, the smallest absolute
-- difference (in seconds) to any `new` row with the same id. It is built with
-- an inner join, so `old` rows whose id has no `new` row do not appear.
--
-- A `new` row can be the closest one for several `old` rows, so this result
-- is not yet one-to-one.
SELECT
    a.id   AS old_id,
    a.date AS old_date,
    a.old,
    b.id   AS new_id,
    b.date AS new_date,
    b.new
FROM (
    SELECT
        old.*,
        MIN(ABS(UNIX_TIMESTAMP(old.date) - UNIX_TIMESTAMP(new.date))) AS delta
    FROM old
    INNER JOIN new
        ON new.id = old.id
    GROUP BY
        old.id,
        old.date
) AS a
LEFT JOIN new AS b
    ON  b.id = a.id
    AND ABS(UNIX_TIMESTAMP(a.date) - UNIX_TIMESTAMP(b.date)) = a.delta
UNION
-- Second branch: keeps every `new` row, including those that are not the
-- closest match of any `old` row (their old_* columns are NULL).
SELECT
    a.id   AS old_id,
    a.date AS old_date,
    a.old,
    b.id   AS new_id,
    b.date AS new_date,
    b.new
FROM (
    SELECT
        old.*,
        MIN(ABS(UNIX_TIMESTAMP(old.date) - UNIX_TIMESTAMP(new.date))) AS delta
    FROM old
    INNER JOIN new
        ON new.id = old.id
    GROUP BY
        old.id,
        old.date
) AS a
RIGHT JOIN new AS b
    ON  b.id = a.id
    AND ABS(UNIX_TIMESTAMP(a.date) - UNIX_TIMESTAMP(b.date)) = a.delta
-- Rows without an `old` value sort last.
ORDER BY
    old IS NULL,
    old
;
+--------+---------------------+------+--------+---------------------+------+
| old_id | old_date | old | new_id | new_date | new |
+--------+---------------------+------+--------+---------------------+------+
| 1 | 2016-03-07 12:20:00 | a | 1 | 2016-03-07 12:20:00 | u |
| 1 | 2016-04-02 12:20:00 | b | 1 | 2016-04-02 12:20:00 | v |
| 1 | 2016-03-01 10:09:00 | c | 1 | 2016-03-02 12:20:00 | t |
| 1 | 2015-04-12 10:09:00 | d | 1 | 2016-03-02 12:20:00 | t |
| 1 | 2016-03-03 12:20:00 | e | 1 | 2016-03-02 12:20:00 | t |
| NULL | NULL | NULL | 2 | 2016-04-12 11:03:00 | x |
+--------+---------------------+------+--------+---------------------+------+
对于这个难题的最后一部分 - 删除重复项,我可能会在应用程序代码中处理它。
看来,要实现近似的一对一连接,唯一的办法是在存储过程中使用游标(cursor)。
感谢 @Strawberry 为我指明了正确的方向——你会看到下面重用了你的代码片段。这是最终对我有效的解决方案。它输出的记录顺序有所不同,但至少实现了真正的一对一匹配。
-- amerge(): greedy one-to-one match of `old` rows to `new` rows that share an
-- ID, by closest `date` (difference in whole hours, via TIMESTAMPDIFF).
--
-- `old` rows are processed in ascending order of their smallest difference.
-- Each row claims its closest `new` row if that row is still free; the
-- claimed `new` row is then removed from the candidate list.
--
-- Result set columns: id_1, date_1, id_2, date_2, old, new, diff
--   * one row per `old` row; id_2 / date_2 / new are NULL when its closest
--     `new` row was already claimed, or when no `new` row shares its ID
--   * one row per `new` row that was never claimed (old-side columns NULL)
--
-- Side effect: temporary tables t1, t2 and t3 remain in the session after
-- the call. They are dropped and rebuilt at the start of every call.
DROP PROCEDURE IF EXISTS amerge;
DELIMITER //
CREATE PROCEDURE amerge()
BEGIN
  /* Declarations: MySQL requires variables, then cursors, then handlers */
  DECLARE o_ID INT DEFAULT 0;
  DECLARE o_date DATETIME;         -- same type as old.date
  DECLARE o_old VARCHAR(2);
  DECLARE o_mdiff BIGINT;          -- whole hours; compared to t2.diff exactly
  DECLARE ct INT DEFAULT 0;
  DECLARE done INT DEFAULT FALSE;
  -- The greedy pass depends on this order, so it is stated explicitly here
  -- instead of relying on the physical row order of t1.
  DECLARE cursor1 CURSOR FOR
    SELECT ID, date, old, mdiff
    FROM t1
    ORDER BY mdiff ASC, ID ASC, date ASC;
  DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

  /* Start clean so the procedure can be called repeatedly in one session */
  DROP TEMPORARY TABLE IF EXISTS t1;
  DROP TEMPORARY TABLE IF EXISTS t2;
  DROP TEMPORARY TABLE IF EXISTS t3;

  /* Temporary tables */
  -- t1: every `old` row with mdiff = smallest hour difference to a `new` row
  -- of the same ID. LEFT JOIN keeps `old` rows whose ID has no `new` row
  -- (mdiff is NULL for them).
  CREATE TEMPORARY TABLE t1
    SELECT old.ID   AS ID,
           old.date AS date,
           old.old  AS old,
           MIN(ABS(TIMESTAMPDIFF(HOUR, old.date, new.date))) AS mdiff
    FROM old
    LEFT JOIN new ON old.ID = new.ID
    GROUP BY old.ID, old.date, old.old;

  -- t2: all candidate (old, new) pairs sharing an ID, with their hour difference
  CREATE TEMPORARY TABLE t2
    SELECT old.ID   AS ID_1,
           old.date AS date_1,
           new.ID   AS ID_2,
           new.date AS date_2,
           old.old  AS old,
           new.new  AS new,
           ABS(TIMESTAMPDIFF(HOUR, old.date, new.date)) AS diff
    FROM old
    INNER JOIN new ON old.ID = new.ID;

  -- t3: result rows, filled by the cursor loop below
  CREATE TEMPORARY TABLE t3
    (id_1 INT, date_1 DATETIME, id_2 INT, date_2 DATETIME,
     old VARCHAR(2), new VARCHAR(2), diff BIGINT);

  /* Cursor */
  OPEN cursor1;
  getparams: LOOP
    FETCH cursor1 INTO o_ID, o_date, o_old, o_mdiff;
    IF done THEN
      LEAVE getparams;
    END IF;

    -- Is the closest `new` row for this `old` row still unclaimed?
    SELECT COUNT(*) FROM t2
    WHERE t2.ID_1 = o_ID AND t2.date_1 = o_date
      AND t2.old = o_old AND t2.diff = o_mdiff
    INTO ct;

    IF ct = 0 THEN
      -- Closest candidate already claimed (or none exists): keep the `old`
      -- row unmatched.
      INSERT INTO t3 (id_1, date_1, id_2, date_2, old, new, diff)
      VALUES (o_ID, o_date, NULL, NULL, o_old, NULL, o_mdiff);
    ELSE
      -- Claim one closest candidate; ORDER BY makes the choice deterministic
      -- when two `new` rows are equally close.
      INSERT INTO t3 (id_1, date_1, id_2, date_2, old, new, diff)
      SELECT t2.ID_1, t2.date_1, t2.ID_2, t2.date_2, t2.old, t2.new, t2.diff
      FROM t2
      WHERE t2.ID_1 = o_ID AND t2.date_1 = o_date
        AND t2.old = o_old AND t2.diff = o_mdiff
      ORDER BY t2.date_2
      LIMIT 1;
    END IF;

    -- Remove the claimed `new` row from all remaining candidates. The
    -- subquery is restricted to the current ID as well as the current date,
    -- so a same-dated `old` row of another ID cannot release the wrong row.
    DELETE FROM t2
    WHERE t2.ID_2 = o_ID
      AND t2.date_2 IN (SELECT t3.date_2
                        FROM t3
                        WHERE t3.id_1 = o_ID AND t3.date_1 = o_date);
  END LOOP getparams;
  CLOSE cursor1;

  /* Workaround: MySQL cannot reference one temporary table twice in a single
     query, so t2 is rebuilt as a copy of t3 for the output statement. */
  DROP TEMPORARY TABLE t2;
  CREATE TEMPORARY TABLE t2
    SELECT * FROM t3;

  /* Output: matched / unmatched `old` rows, plus `new` rows never claimed */
  SELECT id_1, date_1, id_2, date_2, old, new, diff FROM t2
  UNION
  SELECT NULL AS ID_1, NULL AS date_1, new.ID AS ID_2, new.date AS date_2,
         NULL AS old, new.new AS new, NULL AS diff
  FROM new
  LEFT JOIN t3 ON t3.ID_2 = new.ID AND t3.date_2 = new.date
  WHERE t3.ID_2 IS NULL;
END //
DELIMITER ;
CALL amerge();
输出是(使用上例中的数据,主键设置为 ID+日期):
id_1 date_1 id_2 date_2 old new diff
1 2016-03-07 12:20:00 1 2016-03-07 12:20:00 a u 0
1 2016-04-02 12:20:00 1 2016-04-02 12:20:00 b v 0
1 2016-03-03 12:20:00 1 2016-03-02 12:20:00 e t 24
1 2016-03-01 10:09:00 NULL NULL c NULL 26
1 2015-04-12 10:09:00 NULL NULL d NULL 7802
NULL NULL 2 2016-04-12 11:03:00 NULL x NULL