连续序列的分区函数
Partitioning function for continuous sequences
有一个table结构如下:
-- One row per step of an entity's sequence.
CREATE TABLE history (
    pk         SERIAL  NOT NULL,  -- auto-incrementing key, also defines row order
    "from"     INTEGER NOT NULL,  -- quoted because FROM is a reserved word
    "to"       INTEGER NOT NULL,  -- quoted because TO is a reserved word
    entity_key TEXT    NOT NULL,  -- identifies the entity the row belongs to
    data       TEXT    NOT NULL,
    CONSTRAINT history_pkey PRIMARY KEY (pk)
);
pk 是主键;from 和 to 定义了一行在序列中的位置;entity_key 标识该序列所属的实体。例如,如果第一行为 from = 1; to = 2,而第二行为 from = 2; to = 3,则该实体拥有一个由 2 行组成的序列。这里的关键在于:前一行的 to 必须与下一行的 from 相等。
确定 "next"/"previous" 行的顺序由单调增长的 pk
定义(因为它是 SERIAL
)。
序列不必从 1 开始,to - from 也不一定总是 1,例如可以是 from = 1; to = 10。重要的是,序列中 "next" 行的 from 必须与当前行的 to 完全匹配。
示例数据集:
pk | from | to | entity_key | data
----+--------+------+--------------+-------
1 | 1 | 2 | 42 | foo
2 | 2 | 3 | 42 | bar
3 | 3 | 4 | 42 | baz
4 | 10 | 11 | 42 | another foo
5 | 11 | 12 | 42 | another baz
6 | 1 | 2 | 111 | one one one
7 | 2 | 3 | 111 | one one one two
8 | 3 | 4 | 111 | one one one three
我想不出如何在这里按 "sequence" 进行分区,以便将 window 函数应用到代表单个 "sequence" 的那一组行上。
假设我想使用 row_number()
函数并希望获得以下结果:
pk | row_number | entity_key
----+-------------+------------
1 | 1 | 42
2 | 2 | 42
3 | 3 | 42
4 | 1 | 42
5 | 2 | 42
6 | 1 | 111
7 | 2 | 111
8 | 3 | 111
为方便起见,我用初始种子创建了一个 SQLFiddle:http://sqlfiddle.com/#!15/e7c1c
PS:这不是"give me the codez"的问题,我自己研究了一下,只是不知道如何分区。
很明显我需要 LEFT JOIN
和 next.from = curr.to
,但是仍然不清楚如何在 next.from IS NULL
上重置分区。
PS:提供请求结果的最优雅的查询将获得 100 点赏金
PPS:所需的解决方案应该是 SQL 查询而不是 pgsql,因为一些其他限制超出了这个问题的范围。
我不知道这算不算“优雅”,但我想这会做你想要的:
-- Number the rows of every contiguous run ("sequence") inside each entity.
-- Every window is partitioned by entity_key so that a run can never continue
-- across two entities, even when one entity's last "to" happens to equal the
-- next entity's first "from", or when rows of different entities interleave
-- in pk order.
with Lagged as (
    -- starts = 1 when the row does not continue the entity's previous row.
    -- IS DISTINCT FROM also flags the entity's first row, whose lag is NULL.
    select
        pk,
        entity_key,
        case
            when lag("to") over (partition by entity_key order by pk)
                 is distinct from "from" then 1
            else 0
        end as starts
    from history
), LaggedGroups as (
    -- Running count of run starts = id of the run within the entity.
    select
        pk,
        entity_key,
        sum(starts) over (partition by entity_key order by pk) as run_id
    from Lagged
)
select
    pk,
    row_number() over (
        partition by entity_key, run_id
        order by pk
    ) as "row_number",
    entity_key
from LaggedGroups
order by pk
只是为了好玩和完整:一个递归解决方案来重建(双向)记录链表。 [这不是最快的解决方案]
注意:我注释掉了升序 pk 条件,因为连接逻辑不需要它们。
-- Rebuild each entity's chain of rows by following "to" -> "from" links.
-- Result columns: pk, next (the row's "to"), ek (entity_key), rnk (1-based
-- position of the row in its chain).
-- NOTE(review): with the pk predicates disabled, a row whose "from" equals its
-- own "to" would re-join itself on every step once reached -- confirm such rows
-- cannot occur.
with recursive zzz as (
    -- Anchor: rows whose "from" is not the "to" of any row of the same entity.
    select
        head.pk,
        head."to" as next,
        head.entity_key as ek,
        cast(1 as integer) as rnk
    from history as head
    where not exists (
        select 1
        from history as pred
        where pred.entity_key = head.entity_key
          and pred."to" = head."from"
          -- and pred.pk > head.pk
    )

    union all

    -- Step: append the row of the same entity whose "from" equals the
    -- "to" of the row reached so far.
    select
        succ.pk,
        succ."to" as next,
        succ.entity_key as ek,
        zzz.rnk + 1 as rnk
    from zzz
    inner join history as succ
        on succ.entity_key = zzz.ek
       and succ."from" = zzz.next
       -- and succ.pk > zzz.pk
)
select
    zzz.pk,
    zzz.next,
    zzz.ek,
    zzz.rnk
from zzz
order by zzz.ek, zzz.pk
;
您可以使用generate_series()
生成两个值之间的所有行。然后你可以使用行号的差异:
-- Row numbers per contiguous run, using the "difference of row numbers" trick.
with expanded as (
    -- One row per integer position covered by [from, to - 1].
    select
        h.pk,
        h."from",
        h."to",
        h.entity_key,
        generate_series(h."from", h."to" - 1) as ind
    from history h
), tagged as (
    -- Consecutive positions of an entity share the same (row number - position) value.
    select
        e.pk,
        e."from",
        e."to",
        e.entity_key,
        row_number() over (partition by e.entity_key order by e.ind) - e.ind as grp
    from expanded e
)
select
    t.pk,
    t."from",
    t."to",
    -- Collapse back to one row per original row, then number within each run.
    row_number() over (
        partition by t.entity_key, min(t.grp)
        order by t.pk
    ) as row_number
from tagged t
group by t.pk, t."from", t."to", t.entity_key
由于您说明的差值(to - from)只在 1 到 10 之间,这种做法的性能实际上可能并不会太差。
很遗憾,您的 SQL Fiddle 目前无法正常工作,因此我无法对其进行测试。
嗯,这并不完全是单条 SQL 查询,而是:
-- SQL Server (T-SQL) batch: label each run of chained rows, then number the rows per run.
-- Step 1: copy every row into #tmp together with BPK, the pk of the row that directly
-- continues it (pk + 1 and "from" equal to this row's "to"); BPK is NULL when there is none.
-- NOTE(review): the join condition does not compare entity_key.
select a.pk as PK, a.entity_key as ENTITY_KEY, b.pk as BPK, 0 as Seq into #tmp
from history a left join history b on a."to" = b."from" and a.pk = b.pk-1
-- Step 2: counter used to label the runs.
declare @seq int
select @seq = 1
-- Step 3: stamp Seq on every row and bump @seq whenever a row has no continuation (BPK is null).
-- NOTE(review): presumably depends on the order in which rows are visited and in which the two
-- assignments are evaluated; neither is stated here -- verify on the target server.
update #tmp set Seq = case when (BPK is null) then @seq-1 else @seq end,
@seq = case when (BPK is null) then @seq+1 else @seq end
-- Step 4: number the rows inside each (entity_key, seq) group by ascending pk.
select pk, entity_key, ROW_NUMBER() over (PARTITION by entity_key, seq order by pk asc)
from #tmp order by pk
这是在 SQL Server 2008
有一个table结构如下:
-- One row per step of an entity's sequence.
CREATE TABLE history (
    pk         SERIAL  NOT NULL,  -- auto-incrementing key, also defines row order
    "from"     INTEGER NOT NULL,  -- quoted because FROM is a reserved word
    "to"       INTEGER NOT NULL,  -- quoted because TO is a reserved word
    entity_key TEXT    NOT NULL,  -- identifies the entity the row belongs to
    data       TEXT    NOT NULL,
    CONSTRAINT history_pkey PRIMARY KEY (pk)
);
pk 是主键;from 和 to 定义了一行在序列中的位置;entity_key 标识该序列所属的实体。例如,如果第一行为 from = 1; to = 2,而第二行为 from = 2; to = 3,则该实体拥有一个由 2 行组成的序列。这里的关键在于:前一行的 to 必须与下一行的 from 相等。
确定 "next"/"previous" 行的顺序由单调增长的 pk
定义(因为它是 SERIAL
)。
序列不必从 1 开始,to - from 也不一定总是 1,例如可以是 from = 1; to = 10。重要的是,序列中 "next" 行的 from 必须与当前行的 to 完全匹配。
示例数据集:
pk | from | to | entity_key | data
----+--------+------+--------------+-------
1 | 1 | 2 | 42 | foo
2 | 2 | 3 | 42 | bar
3 | 3 | 4 | 42 | baz
4 | 10 | 11 | 42 | another foo
5 | 11 | 12 | 42 | another baz
6 | 1 | 2 | 111 | one one one
7 | 2 | 3 | 111 | one one one two
8 | 3 | 4 | 111 | one one one three
我想不出如何在这里按 "sequence" 进行分区,以便将 window 函数应用到代表单个 "sequence" 的那一组行上。
假设我想使用 row_number() 函数,并希望获得以下结果:
pk | row_number | entity_key
----+-------------+------------
1 | 1 | 42
2 | 2 | 42
3 | 3 | 42
4 | 1 | 42
5 | 2 | 42
6 | 1 | 111
7 | 2 | 111
8 | 3 | 111
为方便起见,我用初始种子创建了一个 SQLFiddle:http://sqlfiddle.com/#!15/e7c1c
PS:这不是"give me the codez"的问题,我自己研究了一下,只是不知道如何分区。
很明显我需要 LEFT JOIN
和 next.from = curr.to
,但是仍然不清楚如何在 next.from IS NULL
上重置分区。
PS:提供请求结果的最优雅的查询将获得 100 点赏金
PPS:所需的解决方案应该是 SQL 查询而不是 pgsql,因为一些其他限制超出了这个问题的范围。
我不知道这算不算“优雅”,但我想这会做你想要的:
-- Number the rows of every contiguous run ("sequence") inside each entity.
-- Every window is partitioned by entity_key so that a run can never continue
-- across two entities, even when one entity's last "to" happens to equal the
-- next entity's first "from", or when rows of different entities interleave
-- in pk order.
with Lagged as (
    -- starts = 1 when the row does not continue the entity's previous row.
    -- IS DISTINCT FROM also flags the entity's first row, whose lag is NULL.
    select
        pk,
        entity_key,
        case
            when lag("to") over (partition by entity_key order by pk)
                 is distinct from "from" then 1
            else 0
        end as starts
    from history
), LaggedGroups as (
    -- Running count of run starts = id of the run within the entity.
    select
        pk,
        entity_key,
        sum(starts) over (partition by entity_key order by pk) as run_id
    from Lagged
)
select
    pk,
    row_number() over (
        partition by entity_key, run_id
        order by pk
    ) as "row_number",
    entity_key
from LaggedGroups
order by pk
只是为了好玩和完整:一个递归解决方案来重建(双向)记录链表。 [这不是最快的解决方案]
注意:我注释掉了升序 pk 条件,因为连接逻辑不需要它们。
-- Rebuild each entity's chain of rows by following "to" -> "from" links.
-- Result columns: pk, next (the row's "to"), ek (entity_key), rnk (1-based
-- position of the row in its chain).
-- NOTE(review): with the pk predicates disabled, a row whose "from" equals its
-- own "to" would re-join itself on every step once reached -- confirm such rows
-- cannot occur.
with recursive zzz as (
    -- Anchor: rows whose "from" is not the "to" of any row of the same entity.
    select
        head.pk,
        head."to" as next,
        head.entity_key as ek,
        cast(1 as integer) as rnk
    from history as head
    where not exists (
        select 1
        from history as pred
        where pred.entity_key = head.entity_key
          and pred."to" = head."from"
          -- and pred.pk > head.pk
    )

    union all

    -- Step: append the row of the same entity whose "from" equals the
    -- "to" of the row reached so far.
    select
        succ.pk,
        succ."to" as next,
        succ.entity_key as ek,
        zzz.rnk + 1 as rnk
    from zzz
    inner join history as succ
        on succ.entity_key = zzz.ek
       and succ."from" = zzz.next
       -- and succ.pk > zzz.pk
)
select
    zzz.pk,
    zzz.next,
    zzz.ek,
    zzz.rnk
from zzz
order by zzz.ek, zzz.pk
;
您可以使用generate_series()
生成两个值之间的所有行。然后你可以使用行号的差异:
-- Row numbers per contiguous run, using the "difference of row numbers" trick.
with expanded as (
    -- One row per integer position covered by [from, to - 1].
    select
        h.pk,
        h."from",
        h."to",
        h.entity_key,
        generate_series(h."from", h."to" - 1) as ind
    from history h
), tagged as (
    -- Consecutive positions of an entity share the same (row number - position) value.
    select
        e.pk,
        e."from",
        e."to",
        e.entity_key,
        row_number() over (partition by e.entity_key order by e.ind) - e.ind as grp
    from expanded e
)
select
    t.pk,
    t."from",
    t."to",
    -- Collapse back to one row per original row, then number within each run.
    row_number() over (
        partition by t.entity_key, min(t.grp)
        order by t.pk
    ) as row_number
from tagged t
group by t.pk, t."from", t."to", t.entity_key
由于您说明的差值(to - from)只在 1 到 10 之间,这种做法的性能实际上可能并不会太差。
很遗憾,您的 SQL Fiddle 目前无法正常工作,因此我无法对其进行测试。
嗯,这并不完全是单条 SQL 查询,而是:
-- SQL Server (T-SQL) batch: label each run of chained rows, then number the rows per run.
-- Step 1: copy every row into #tmp together with BPK, the pk of the row that directly
-- continues it (pk + 1 and "from" equal to this row's "to"); BPK is NULL when there is none.
-- NOTE(review): the join condition does not compare entity_key.
select a.pk as PK, a.entity_key as ENTITY_KEY, b.pk as BPK, 0 as Seq into #tmp
from history a left join history b on a."to" = b."from" and a.pk = b.pk-1
-- Step 2: counter used to label the runs.
declare @seq int
select @seq = 1
-- Step 3: stamp Seq on every row and bump @seq whenever a row has no continuation (BPK is null).
-- NOTE(review): presumably depends on the order in which rows are visited and in which the two
-- assignments are evaluated; neither is stated here -- verify on the target server.
update #tmp set Seq = case when (BPK is null) then @seq-1 else @seq end,
@seq = case when (BPK is null) then @seq+1 else @seq end
-- Step 4: number the rows inside each (entity_key, seq) group by ascending pk.
select pk, entity_key, ROW_NUMBER() over (PARTITION by entity_key, seq order by pk asc)
from #tmp order by pk
这是在 SQL Server 2008