查询集封面
Query for Set Cover
美好的一天,我想为 Set Cover Problem 实现一个 T-SQL 查询,但一直无法在 SQL 中找到有关如何执行此操作的任何提示。
在我的例子中,我的 table 只有两列(IDnumber
和 Mut
),我想找到 IDNumber
的最小数量以获得其中之一每个Mut
。我 真的 想为每个 Mut
获得三个 IDnumbers
,但我想我最好从一个开始,因为那样可能更容易。
DECLARE @myTable TABLE (
IDnumber int,
Mut varchar(1))
INSERT @myTable VALUES
(1,'C'), (1,'N'), (1,'Z'), (1,'M'), (1,'E'), (2,'E'), (3,'B'), (3,'N'), (3,'D'), (3,'K'),
(3,'W'), (4,'O'), (4,'G'), (4,'N'), (4,'B'), (4,'U'), (4,'C'), (5,'Q'), (5,'H'), (6,'K'),
(6,'Y'), (6,'M'), (6,'A'), (6,'O'), (6,'U'), (6,'J'), (7,'H'), (7,'U'), (7,'M'), (7,'L'),
(8,'B'), (8,'K'), (8,'P'), (9,'Y'), (9,'K'), (10,'Z'), (11,'R'), (12,'X'), (12,'R'),
(12,'O'), (12,'Z'), (4,'C'), (1,'Z'), (4,'S'), (6,'E'), (5,'G'), (4,'C'), (4,'S'), (4,'H'),
(6,'D'), (7,'W'), (3,'U'), (6,'N'), (7,'Y'), (6,'N'), (6,'F'), (4,'C'), (4,'I'), (7,'P'),
(10,'H'), (10,'Z'), (10,'S'), (7,'Z'), (6,'B'), (7,'Z'), (8,'X'), (8,'J'), (8,'P'), (10,'K'),
(8,'K'), (12,'P'), (8,'W'), (10,'M'), (12,'F'), (9,'T'), (9,'D'), (14,'Y'), (12,'P'),
(14,'J'), (13,'D'), (15,'H'), (12,'J'), (6,'H'), (2,'Z'), (8,'G'), (10,'Q'), (6,'D'),
(5,'X'), (9,'T'), (6,'W'), (6,'K'), (10,'W'), (7,'J'), (11,'W'), (12,'V'), (9,'F'), (7,'F'),
(4,'M'), (5,'K'), (12,'G'), (12,'T'), (15,'T'), (13,'W'), (7,'J'), (9,'T'), (10,'U'), (9,'S'),
(10,'L'), (10,'J'), (10,'H'), (11,'H'), (12,'S'), (12,'A'), (14,'L'), (13,'K'), (13,'D'),
(4,'M'), (3,'N'), (4,'F'), (7,'M'), (7,'V'), (5,'R'), (4,'K'), (5,'F'), (7,'G'), (8,'M'),
(4,'X'), (7,'F'), (9,'S'), (7,'N'), (6,'W'), (6,'W'), (5,'S'), (9,'Z'), (10,'I'), (11,'Y'),
(11,'D'), (9,'X'), (7,'G'), (9,'S'), (9,'H'), (9,'T'), (8,'J'), (10,'U'), (9,'F'), (9,'S'),
(7,'D'), (14,'R'), (10,'F'), (7,'E'), (15,'M'), (12,'F'), (5,'C'), (8,'E'), (16,'G'), (11,'V'),
(10,'I'), (12,'I'), (11,'Y'), (12,'I'), (14,'J'), (15,'D'), (19,'J'), (16,'B'), (12,'G'),
(9,'J'), (18,'J'), (18,'C'), (16,'Q'), (18,'P'), (13,'F'), (19,'T'), (15,'J'), (15,'R'),
(15,'Q'), (15,'O'), (11,'A'), (24,'B'), (19,'S'), (22,'I'), (15,'X'), (20,'T'), (15,'E'),
(9,'V'), (8,'H'), (16,'N'), (17,'H')
-- Since the above list was generated by a bunch of random numbers/letters I need to
-- delete the duplicates
;WITH cte AS (
SELECT IDnumber, mut,
row_number() OVER(PARTITION BY IDNumber, mut ORDER BY IDNumber) AS [rn]
FROM @myTable
)
DELETE cte WHERE [rn] > 1
SELECT *
FROM ( SELECT IDnumber, Mut FROM @myTable) AS S
PIVOT
(COUNT(Mut) FOR mut IN ([A],[B],[C],[D],[E],[F],[G],[H],[I],[J],[K],[L],[M],[N],[O],[P],
[Q],[R],[S],[T],[U],[V],[W],[X],[Y],[Z])) AS pvt
所以你可以从枢轴 table 看出最小 IDnumbers
将是 3,5,7 和 12。
如何实现该算法?在我看来,我可以找到所有将成为集合的组合 (2^6),然后确定哪些集合具有所有 Muts。 IDnumbers 最少的集合就是最小集合。
那种蛮力可能会奏效,但效率会非常低。我的真实案例并不大,我有 43 个独特的 Muts
(不是示例中的九个)和 ~2000 IDnumbers
,但我认为这需要一些时间才能 运行 因为2^2000 相当大...
谢谢!
我想要一个更大的数据集来测试,但这至少与给定数据集的结果相匹配...
DECLARE @myTable TABLE (
IDnumber INT,
Mut VARCHAR(1))
DECLARE @results TABLE (
IDnumber INT)
INSERT @myTable VALUES
(1,'A'),(1,'B'),(1,'C'),(1,'D'),
(2,'A'),(2,'C'),(2,'D'),
(3,'A'),(3,'B'),(3,'F'),(3,'Z'),
(4,'A'),(4,'B'),(4,'E'),(4,'F'),
(5,'Y'),
(6,'X'),(6,'Z')
DECLARE @IDnumber INT
WHILE EXISTS (SELECT 1 FROM @myTable)
BEGIN
WITH MutRank AS
(
-- Find how many IDNumbers are associated with each mut
SELECT Mut,
COUNT(DISTINCT IDnumber) AS IDnumberCnt
FROM @myTable
GROUP BY Mut
), MutRarity AS
(
-- Translate the IDNumberCnt into a weighted rarity so dupes at
-- a IDNumberCnt level reduce their rarity compared to the next level
SELECT Mut,
RANK() OVER (ORDER BY IDnumberCnt DESC) AS MutRarity
FROM MutRank
)
-- Grab the IDNumber that is associated to the most "rare" muts
SELECT @IDnumber = n.IDnumber
FROM (SELECT TOP 1 m.IDnumber,
SUM(MutRarity) AS IDNumberValue
FROM @myTable m
JOIN MutRarity mr
ON m.Mut = mr.Mut
GROUP BY m.IDnumber
ORDER BY IDNumberValue DESC, IDnumber) n
-- Save the number in our results
INSERT @results (IDnumber)
SELECT @IDnumber
-- Purge all records for the "covered" muts and the selected IDNumber
DELETE m2
FROM @myTable m1
JOIN @myTable m2
ON m1.Mut = m2.Mut
AND m1.IDnumber = @IDnumber
END
SELECT *
FROM @results
ORDER BY IDnumber
这是一种为每个 IDNumber
计算 Mut
值的位掩码的方法,然后使用递归 CTE 生成完成集合的所有可能组合。代码输出给出最终结果的所有可能的IdNumber
组合,不包括一个或多个IdNumber
在组合中冗余的组合(例如示例数据集中的1,2,3,4,5,6
)。
要将其转换为与您的真实数据一起使用,主要区别在于您几乎肯定需要找到另一种机制来将位值分配给 Mut
值。 (我可以在这里作弊,通过操纵每个 Mut
字母 - POWER(2,ASCII(Mut) - 64)
的十进制 ASCII 码来计算位值)。
DECLARE @myTable TABLE (
IDnumber int,
Mut varchar(1))
INSERT @myTable VALUES
(1,'A'),(1,'B'),(1,'C'),(1,'D'),
(2,'A'),(2,'C'),(2,'D'),
(3,'A'),(3,'B'),(3,'F'),(3,'Z'),
(4,'A'),(4,'B'),(4,'E'),(4,'F'),
(5,'Y'),
(6,'X'),(6,'Z')
-- calculate the target bitmask
DECLARE @target bigint = ( SELECT SUM(POWER(2,ASCII(Mut) - 64))
FROM (SELECT DISTINCT Mut FROM @myTable) AS x
)
;WITH baseCTE
AS
(
--calculate a starting bitmask for each ID
SELECT IDnumber, sum(mutbit) AS bitmask
FROM (SELECT DISTINCT IDnumber, CAST(POWER(2,ASCII(Mut) - 64) AS bigint) AS mutbit FROM @myTable) AS x
GROUP BY IDnumber
)
,recCTE
AS
(
SELECT IDnumber, bitmask, CAST(IdNumber AS varchar(max)) AS trail, 1 AS lev
FROM baseCTE
UNION ALL
SELECT b.IDnumber, b.bitmask | r.bitmask, CAST(CONCAT(r.trail,',',b.IDnumber) AS varchar(max)), r.lev + 1
FROM recCTE AS r
JOIN baseCTE AS b
ON b.IDnumber > r.IDnumber
WHERE b.bitmask | r.bitmask <> r.bitmask --ignore combinations which don't add any new Mut values
)
SELECT trail, lev
FROM recCTE
WHERE bitmask = @target
ORDER BY lev
注意。 SQL 服务器按位运算符仅适用于一个或另一个值是整数类型的情况,因此此解决方案仅限于 64 个不同的 Mut
值(其中掩码是 bigint
)- 以获取要超出此范围,您必须使用自定义的按位比较器(可能使用 CLR)。但是,由于问题提到您有 43 个 Mut
值,它现在可能对您有用。
美好的一天,我想为 Set Cover Problem 实现一个 T-SQL 查询,但一直无法在 SQL 中找到有关如何执行此操作的任何提示。
在我的例子中,我的 table 只有两列(IDnumber
和 Mut
),我想找到 IDNumber
的最小数量以获得其中之一每个Mut
。我 真的 想为每个 Mut
获得三个 IDnumbers
,但我想我最好从一个开始,因为那样可能更容易。
DECLARE @myTable TABLE (
IDnumber int,
Mut varchar(1))
INSERT @myTable VALUES
(1,'C'), (1,'N'), (1,'Z'), (1,'M'), (1,'E'), (2,'E'), (3,'B'), (3,'N'), (3,'D'), (3,'K'),
(3,'W'), (4,'O'), (4,'G'), (4,'N'), (4,'B'), (4,'U'), (4,'C'), (5,'Q'), (5,'H'), (6,'K'),
(6,'Y'), (6,'M'), (6,'A'), (6,'O'), (6,'U'), (6,'J'), (7,'H'), (7,'U'), (7,'M'), (7,'L'),
(8,'B'), (8,'K'), (8,'P'), (9,'Y'), (9,'K'), (10,'Z'), (11,'R'), (12,'X'), (12,'R'),
(12,'O'), (12,'Z'), (4,'C'), (1,'Z'), (4,'S'), (6,'E'), (5,'G'), (4,'C'), (4,'S'), (4,'H'),
(6,'D'), (7,'W'), (3,'U'), (6,'N'), (7,'Y'), (6,'N'), (6,'F'), (4,'C'), (4,'I'), (7,'P'),
(10,'H'), (10,'Z'), (10,'S'), (7,'Z'), (6,'B'), (7,'Z'), (8,'X'), (8,'J'), (8,'P'), (10,'K'),
(8,'K'), (12,'P'), (8,'W'), (10,'M'), (12,'F'), (9,'T'), (9,'D'), (14,'Y'), (12,'P'),
(14,'J'), (13,'D'), (15,'H'), (12,'J'), (6,'H'), (2,'Z'), (8,'G'), (10,'Q'), (6,'D'),
(5,'X'), (9,'T'), (6,'W'), (6,'K'), (10,'W'), (7,'J'), (11,'W'), (12,'V'), (9,'F'), (7,'F'),
(4,'M'), (5,'K'), (12,'G'), (12,'T'), (15,'T'), (13,'W'), (7,'J'), (9,'T'), (10,'U'), (9,'S'),
(10,'L'), (10,'J'), (10,'H'), (11,'H'), (12,'S'), (12,'A'), (14,'L'), (13,'K'), (13,'D'),
(4,'M'), (3,'N'), (4,'F'), (7,'M'), (7,'V'), (5,'R'), (4,'K'), (5,'F'), (7,'G'), (8,'M'),
(4,'X'), (7,'F'), (9,'S'), (7,'N'), (6,'W'), (6,'W'), (5,'S'), (9,'Z'), (10,'I'), (11,'Y'),
(11,'D'), (9,'X'), (7,'G'), (9,'S'), (9,'H'), (9,'T'), (8,'J'), (10,'U'), (9,'F'), (9,'S'),
(7,'D'), (14,'R'), (10,'F'), (7,'E'), (15,'M'), (12,'F'), (5,'C'), (8,'E'), (16,'G'), (11,'V'),
(10,'I'), (12,'I'), (11,'Y'), (12,'I'), (14,'J'), (15,'D'), (19,'J'), (16,'B'), (12,'G'),
(9,'J'), (18,'J'), (18,'C'), (16,'Q'), (18,'P'), (13,'F'), (19,'T'), (15,'J'), (15,'R'),
(15,'Q'), (15,'O'), (11,'A'), (24,'B'), (19,'S'), (22,'I'), (15,'X'), (20,'T'), (15,'E'),
(9,'V'), (8,'H'), (16,'N'), (17,'H')
-- Since the above list was generated by a bunch of random numbers/letters I need to
-- delete the duplicates
;WITH cte AS (
SELECT IDnumber, mut,
row_number() OVER(PARTITION BY IDNumber, mut ORDER BY IDNumber) AS [rn]
FROM @myTable
)
DELETE cte WHERE [rn] > 1
SELECT *
FROM ( SELECT IDnumber, Mut FROM @myTable) AS S
PIVOT
(COUNT(Mut) FOR mut IN ([A],[B],[C],[D],[E],[F],[G],[H],[I],[J],[K],[L],[M],[N],[O],[P],
[Q],[R],[S],[T],[U],[V],[W],[X],[Y],[Z])) AS pvt
所以你可以从枢轴 table 看出最小 IDnumbers
将是 3,5,7 和 12。
如何实现该算法?在我看来,我可以找到所有将成为集合的组合 (2^6),然后确定哪些集合具有所有 Muts。 IDnumbers 最少的集合就是最小集合。
那种蛮力可能会奏效,但效率会非常低。我的真实案例并不大,我有 43 个独特的 Muts
(不是示例中的九个)和 ~2000 IDnumbers
,但我认为这需要一些时间才能 运行 因为2^2000 相当大...
谢谢!
我想要一个更大的数据集来测试,但这至少与给定数据集的结果相匹配...
DECLARE @myTable TABLE (
IDnumber INT,
Mut VARCHAR(1))
DECLARE @results TABLE (
IDnumber INT)
INSERT @myTable VALUES
(1,'A'),(1,'B'),(1,'C'),(1,'D'),
(2,'A'),(2,'C'),(2,'D'),
(3,'A'),(3,'B'),(3,'F'),(3,'Z'),
(4,'A'),(4,'B'),(4,'E'),(4,'F'),
(5,'Y'),
(6,'X'),(6,'Z')
DECLARE @IDnumber INT
WHILE EXISTS (SELECT 1 FROM @myTable)
BEGIN
WITH MutRank AS
(
-- Find how many IDNumbers are associated with each mut
SELECT Mut,
COUNT(DISTINCT IDnumber) AS IDnumberCnt
FROM @myTable
GROUP BY Mut
), MutRarity AS
(
-- Translate the IDNumberCnt into a weighted rarity so dupes at
-- a IDNumberCnt level reduce their rarity compared to the next level
SELECT Mut,
RANK() OVER (ORDER BY IDnumberCnt DESC) AS MutRarity
FROM MutRank
)
-- Grab the IDNumber that is associated to the most "rare" muts
SELECT @IDnumber = n.IDnumber
FROM (SELECT TOP 1 m.IDnumber,
SUM(MutRarity) AS IDNumberValue
FROM @myTable m
JOIN MutRarity mr
ON m.Mut = mr.Mut
GROUP BY m.IDnumber
ORDER BY IDNumberValue DESC, IDnumber) n
-- Save the number in our results
INSERT @results (IDnumber)
SELECT @IDnumber
-- Purge all records for the "covered" muts and the selected IDNumber
DELETE m2
FROM @myTable m1
JOIN @myTable m2
ON m1.Mut = m2.Mut
AND m1.IDnumber = @IDnumber
END
SELECT *
FROM @results
ORDER BY IDnumber
这是一种为每个 IDNumber
计算 Mut
值的位掩码的方法,然后使用递归 CTE 生成完成集合的所有可能组合。代码输出给出最终结果的所有可能的IdNumber
组合,不包括一个或多个IdNumber
在组合中冗余的组合(例如示例数据集中的1,2,3,4,5,6
)。
要将其转换为与您的真实数据一起使用,主要区别在于您几乎肯定需要找到另一种机制来将位值分配给 Mut
值。 (我可以在这里作弊,通过操纵每个 Mut
字母 - POWER(2,ASCII(Mut) - 64)
的十进制 ASCII 码来计算位值)。
DECLARE @myTable TABLE (
IDnumber int,
Mut varchar(1))
INSERT @myTable VALUES
(1,'A'),(1,'B'),(1,'C'),(1,'D'),
(2,'A'),(2,'C'),(2,'D'),
(3,'A'),(3,'B'),(3,'F'),(3,'Z'),
(4,'A'),(4,'B'),(4,'E'),(4,'F'),
(5,'Y'),
(6,'X'),(6,'Z')
-- calculate the target bitmask
DECLARE @target bigint = ( SELECT SUM(POWER(2,ASCII(Mut) - 64))
FROM (SELECT DISTINCT Mut FROM @myTable) AS x
)
;WITH baseCTE
AS
(
--calculate a starting bitmask for each ID
SELECT IDnumber, sum(mutbit) AS bitmask
FROM (SELECT DISTINCT IDnumber, CAST(POWER(2,ASCII(Mut) - 64) AS bigint) AS mutbit FROM @myTable) AS x
GROUP BY IDnumber
)
,recCTE
AS
(
SELECT IDnumber, bitmask, CAST(IdNumber AS varchar(max)) AS trail, 1 AS lev
FROM baseCTE
UNION ALL
SELECT b.IDnumber, b.bitmask | r.bitmask, CAST(CONCAT(r.trail,',',b.IDnumber) AS varchar(max)), r.lev + 1
FROM recCTE AS r
JOIN baseCTE AS b
ON b.IDnumber > r.IDnumber
WHERE b.bitmask | r.bitmask <> r.bitmask --ignore combinations which don't add any new Mut values
)
SELECT trail, lev
FROM recCTE
WHERE bitmask = @target
ORDER BY lev
注意。 SQL 服务器按位运算符仅适用于一个或另一个值是整数类型的情况,因此此解决方案仅限于 64 个不同的 Mut
值(其中掩码是 bigint
)- 以获取要超出此范围,您必须使用自定义的按位比较器(可能使用 CLR)。但是,由于问题提到您有 43 个 Mut
值,它现在可能对您有用。