确定重叠时间范围内的差距
Determining gaps in overlapping time ranges
我有一个数据集,记录了一组用户的开始和结束日期时间 - 假设在每个日期范围内,用户处于 'on' 状态。
对于给定用户,这些日期范围可能相互重叠。我希望能够找到这些范围之间的间隙,即用户不处于 'on' 状态的日期范围。
这是我正在使用的来源示例:
/* Sample input: one row per "on" interval of a user.                     */
/* Intervals that belong to the same user may overlap each other.         */
data blocks;
  infile datalines dsd missover;
  /* uniq is read as a character id. The $1. informat matches the         */
  /* single-letter ids in the sample rows below; widen it if real ids     */
  /* are longer. The colon modifier makes each field stop at the comma.   */
  input uniq :$1. start_dt :datetime. end_dt :datetime.;
  format start_dt end_dt datetime.;
datalines;
A,01JAN2021:08:00:00,01JAN2021:10:00:00
B,01JAN2021:18:00:00,01JAN2021:20:00:00
B,01JAN2021:09:00:00,01JAN2021:11:00:00
B,02JAN2021:11:00:00,02JAN2021:15:00:00
A,02JAN2021:10:00:00,02JAN2021:12:00:00
B,02JAN2021:10:00:00,02JAN2021:15:00:00
B,03JAN2021:09:00:00,03JAN2021:10:00:00
;;;;
run;
我希望输出是这样的:
A 01JAN21:10:00:00 02JAN21:10:00:00
B 01JAN21:11:00:00 01JAN21:18:00:00
B 01JAN21:20:00:00 02JAN21:10:00:00
B 02JAN21:15:00:00 03JAN21:09:00:00
我之前找到过一个使用 SQL 的解决方案(恐怕我已经找不到原链接了),但它需要将数据集的五个实例与其自身连接起来,这对我来说是行不通的,因为观测值可能达到数千万。
就是这个方法,有效,但对我的实际使用数据不实用:
/* Build gaps_sql: one row per user and gap, where a gap runs from the    */
/* end of a block (last_end_dt) to the earliest later start of a block of */
/* the same user (next_start_dt), provided no other block of that user is */
/* still running at last_end_dt.                                          */
proc sql;
create table gaps_sql as
select distinct a.uniq
     , a.end_dt as last_end_dt
     , b.start_dt as next_start_dt
from blocks a
inner join blocks b
   on a.uniq = b.uniq
  and a.end_dt < b.start_dt
  /* b must be the earliest block of the user that starts after a ends */
  and b.start_dt = (
        select min(c.start_dt)
        from blocks c
        where c.start_dt > a.end_dt
          and c.uniq = b.uniq
      )
/* a.end_dt opens a gap only when no other block of the same user covers  */
/* that instant. "<=" (not "<") also rejects a block that starts exactly  */
/* at a.end_dt; with "<" such a touching block was ignored and a spurious */
/* gap from a.end_dt to the next later start was reported.                */
where not exists (
        select * from blocks d
        where d.uniq = a.uniq
          and d.start_dt <= a.end_dt
          and d.end_dt > a.end_dt
      )
order by a.uniq, a.end_dt
;
quit;
有没有一种更高效的方法,能够在合理的时间内处理大量观测值?也许可以用 DATA 步或类似的方式?
使用有序哈希对象(ordered hash)来存储日期时间范围并按键排序。
然后遍历哈希条目,判断并输出间隙。
示例:
/* Sample input for the hash-based solution: one row per "on" interval.   */
/* Intervals that belong to the same id may overlap each other.           */
data have;
  infile datalines dsd missover;
  /* id is read as a character value. The $1. informat matches the        */
  /* single-letter ids in the sample rows below; widen it if real ids     */
  /* are longer. The colon modifier makes each field stop at the comma.   */
  input id :$1. start_dt :datetime. end_dt :datetime.;
  format start_dt end_dt datetime.;
datalines;
A,01JAN2021:08:00:00,01JAN2021:10:00:00
B,01JAN2021:18:00:00,01JAN2021:20:00:00
B,01JAN2021:09:00:00,01JAN2021:11:00:00
B,02JAN2021:11:00:00,02JAN2021:15:00:00
A,02JAN2021:10:00:00,02JAN2021:12:00:00
B,02JAN2021:10:00:00,02JAN2021:15:00:00
B,03JAN2021:09:00:00,03JAN2021:10:00:00
;
/* Write one row (id, gap_start, gap_end) for every gap between the       */
/* merged "on" ranges of each id in HAVE. All ranges are loaded into an   */
/* ordered hash and walked once in key order after the last input row.    */
data want(keep=id gap_start gap_end);
  /* Never executed: only copies the type and length of id, start_dt and  */
  /* end_dt from HAVE, so id is not forced to numeric and no character    */
  /* length has to be guessed here.                                       */
  if 0 then set have;
  if _n_ = 1 then do;
    /* Ascending key order (id, start_dt, end_dt) is what sorts the data. */
    declare hash ranges(ordered:'a');
    ranges.defineKey('id', 'start_dt', 'end_dt');
    ranges.defineDone();
  end;
  set have end=done;
  /* The return code is captured so that an exact duplicate range is      */
  /* skipped quietly; a duplicate cannot change the gaps.                 */
  rc = ranges.add();
  if done then do;
    declare hiter i1('ranges');
    n_seen = 0;
    /* Each next() is relied on to load the entry's key values back into  */
    /* id, start_dt and end_dt.                                           */
    do while (i1.next() = 0);
      n_seen = n_seen + 1;
      /* lag() runs exactly once per entry, so it returns the id of the   */
      /* previous entry.                                                  */
      prev_id = lag(id);
      if n_seen = 1 or id ne prev_id then do;
        /* First range of an id: start tracking its right edge. The       */
        /* n_seen test covers a first id that is blank, which lag()       */
        /* alone could not tell apart from "no previous entry".           */
        right = end_dt;
      end;
      else if start_dt > right then do;
        /* The range starts after everything seen so far ended: a gap.    */
        gap_start = right;
        gap_end = start_dt;
        output;
        right = end_dt;
      end;
      else do;
        /* Overlapping or touching range: possibly extend the right edge. */
        right = max(right, end_dt);
      end;
    end;
  end;
  format gap: datetime.;
run;
我有一个数据集,记录了一组用户的开始和结束日期时间 - 假设在每个日期范围内,用户处于 'on' 状态。
对于给定用户,这些日期范围可能相互重叠。我希望能够找到这些范围之间的间隙,即用户不处于 'on' 状态的日期范围。
这是我正在使用的源数据示例:
/* Sample input: one row per "on" interval of a user.                     */
/* Intervals that belong to the same user may overlap each other.         */
data blocks;
  infile datalines dsd missover;
  /* uniq is read as a character id. The $1. informat matches the         */
  /* single-letter ids in the sample rows below; widen it if real ids     */
  /* are longer. The colon modifier makes each field stop at the comma.   */
  input uniq :$1. start_dt :datetime. end_dt :datetime.;
  format start_dt end_dt datetime.;
datalines;
A,01JAN2021:08:00:00,01JAN2021:10:00:00
B,01JAN2021:18:00:00,01JAN2021:20:00:00
B,01JAN2021:09:00:00,01JAN2021:11:00:00
B,02JAN2021:11:00:00,02JAN2021:15:00:00
A,02JAN2021:10:00:00,02JAN2021:12:00:00
B,02JAN2021:10:00:00,02JAN2021:15:00:00
B,03JAN2021:09:00:00,03JAN2021:10:00:00
;;;;
run;
我希望输出是这样的:
A 01JAN21:10:00:00 02JAN21:10:00:00
B 01JAN21:11:00:00 01JAN21:18:00:00
B 01JAN21:20:00:00 02JAN21:10:00:00
B 02JAN21:15:00:00 03JAN21:09:00:00
我之前找到过一个使用 SQL 的解决方案(恐怕我已经找不到原链接了),但它需要将数据集的五个实例与其自身连接起来,这对我来说是行不通的,因为观测值可能达到数千万。
就是这个方法,有效,但对我的实际使用数据不实用:
/* Build gaps_sql: one row per user and gap, where a gap runs from the    */
/* end of a block (last_end_dt) to the earliest later start of a block of */
/* the same user (next_start_dt), provided no other block of that user is */
/* still running at last_end_dt.                                          */
proc sql;
create table gaps_sql as
select distinct a.uniq
     , a.end_dt as last_end_dt
     , b.start_dt as next_start_dt
from blocks a
inner join blocks b
   on a.uniq = b.uniq
  and a.end_dt < b.start_dt
  /* b must be the earliest block of the user that starts after a ends */
  and b.start_dt = (
        select min(c.start_dt)
        from blocks c
        where c.start_dt > a.end_dt
          and c.uniq = b.uniq
      )
/* a.end_dt opens a gap only when no other block of the same user covers  */
/* that instant. "<=" (not "<") also rejects a block that starts exactly  */
/* at a.end_dt; with "<" such a touching block was ignored and a spurious */
/* gap from a.end_dt to the next later start was reported.                */
where not exists (
        select * from blocks d
        where d.uniq = a.uniq
          and d.start_dt <= a.end_dt
          and d.end_dt > a.end_dt
      )
order by a.uniq, a.end_dt
;
quit;
有没有一种更高效的方法,能够在合理的时间内处理大量观测值?也许可以用 DATA 步或类似的方式?
使用有序哈希对象(ordered hash)来存储日期时间范围并按键排序。然后遍历哈希条目,判断并输出间隙。
示例:
/* Sample input for the hash-based solution: one row per "on" interval.   */
/* Intervals that belong to the same id may overlap each other.           */
data have;
  infile datalines dsd missover;
  /* id is read as a character value. The $1. informat matches the        */
  /* single-letter ids in the sample rows below; widen it if real ids     */
  /* are longer. The colon modifier makes each field stop at the comma.   */
  input id :$1. start_dt :datetime. end_dt :datetime.;
  format start_dt end_dt datetime.;
datalines;
A,01JAN2021:08:00:00,01JAN2021:10:00:00
B,01JAN2021:18:00:00,01JAN2021:20:00:00
B,01JAN2021:09:00:00,01JAN2021:11:00:00
B,02JAN2021:11:00:00,02JAN2021:15:00:00
A,02JAN2021:10:00:00,02JAN2021:12:00:00
B,02JAN2021:10:00:00,02JAN2021:15:00:00
B,03JAN2021:09:00:00,03JAN2021:10:00:00
;
/* Write one row (id, gap_start, gap_end) for every gap between the       */
/* merged "on" ranges of each id in HAVE. All ranges are loaded into an   */
/* ordered hash and walked once in key order after the last input row.    */
data want(keep=id gap_start gap_end);
  /* Never executed: only copies the type and length of id, start_dt and  */
  /* end_dt from HAVE, so id is not forced to numeric and no character    */
  /* length has to be guessed here.                                       */
  if 0 then set have;
  if _n_ = 1 then do;
    /* Ascending key order (id, start_dt, end_dt) is what sorts the data. */
    declare hash ranges(ordered:'a');
    ranges.defineKey('id', 'start_dt', 'end_dt');
    ranges.defineDone();
  end;
  set have end=done;
  /* The return code is captured so that an exact duplicate range is      */
  /* skipped quietly; a duplicate cannot change the gaps.                 */
  rc = ranges.add();
  if done then do;
    declare hiter i1('ranges');
    n_seen = 0;
    /* Each next() is relied on to load the entry's key values back into  */
    /* id, start_dt and end_dt.                                           */
    do while (i1.next() = 0);
      n_seen = n_seen + 1;
      /* lag() runs exactly once per entry, so it returns the id of the   */
      /* previous entry.                                                  */
      prev_id = lag(id);
      if n_seen = 1 or id ne prev_id then do;
        /* First range of an id: start tracking its right edge. The       */
        /* n_seen test covers a first id that is blank, which lag()       */
        /* alone could not tell apart from "no previous entry".           */
        right = end_dt;
      end;
      else if start_dt > right then do;
        /* The range starts after everything seen so far ended: a gap.    */
        gap_start = right;
        gap_end = start_dt;
        output;
        right = end_dt;
      end;
      else do;
        /* Overlapping or touching range: possibly extend the right edge. */
        right = max(right, end_dt);
      end;
    end;
  end;
  format gap: datetime.;
run;