半高效操作数据的 Sas 宏
Sas Macro to semi-efficiently manipulate data
目标:从 have 表和 help 表得到 want 表。当前的实现(如下)很慢。我相信这是一个不该这样使用 SAS 宏的典型反例,但我很好奇:
1. 宏方法能否被挽救,即变得足够快以至于可行
(例如,proc append 应该可以加快堆叠数据集的速度,但我没有看到任何性能提升。)
2. 所有的备选方案会是什么样子。
我已经写了一个非宏的解决方案,为了便于比较,我会把它贴在下面。
Data:
* Sample input with one row per name and term attended and term read as character;
data have;
  input name $ term $;
  datalines;
Joe 2000
Joe 2002
Joe 2008
Sally 2001
Sally 2003
;
run;

* List the data set that was just created;
proc print data=have;
run;
* Lookup table holding every possible term as a character value;
data help;
  input terms $;
  datalines;
2000
2001
2002
2003
2004
2005
2006
2007
2008
;
run;

* List the data set that was just created;
proc print data=help;
run;
* Expected result with one row per name and term between the first and last term of that name;
data want;
  input name $ term $ status $;
  datalines;
Joe 2000 here
Joe 2001 gone
Joe 2002 here
Joe 2003 gone
Joe 2004 gone
Joe 2005 gone
Joe 2006 gone
Joe 2007 gone
Joe 2008 here
Sally 2001 here
Sally 2002 gone
Sally 2003 here
;
run;

/* NOTE(review): this lists HAVE, not the WANT data set created just above - confirm intent. */
proc print data=have;
run;
我可以写一个小宏,对每个人分别调用,从而得到想要的结果:
%MACRO RET(NAME);
  /* Builds HEREGONE_<NAME>: one row per HELP term lying between the first and
     last term that HAVE records for the given name. A row is flagged Here when
     the term is present in HAVE for that name and Gone when it is not. */
  proc sql;
    /* Distinct terms recorded for this name. */
    create table studtermlist as
      select distinct term
      from have
      where name = "&NAME";

    /* Last and first term for this name, stored in macro variables. */
    select max(term) into :MAXTERM
      from have
      where name = "&NAME";

    select min(term) into :MINTERM
      from have
      where name = "&NAME";

    /* HELP terms inside that range. */
    create table termlist as
      select terms
      from help
      where terms between "&MINTERM." and "&MAXTERM."
      order by terms;

    /* Rows of the left join without a match carry a missing TERM. */
    create table HEREGONE_&Name as
      select
        t.terms,
        "&Name" as Name,
        case
          when t.terms = s.term then 'Here'
          when s.term is missing then 'Gone'
        end as status
      from termlist as t
      left join studtermlist as s
        on t.terms = s.term;
  quit;
%MEND RET;
/* Run the macro for both sample names, then list each result table. */
%ret(Joe);
%ret(Sally);

proc print data=HEREGONE_Joe;
run;

proc print data=HEREGONE_Sally;
run;
但还不完整。如果我循环遍历(大概有很多名字)...
/* Collect every distinct name in HAVE into one blank separated macro variable. */
proc sql noprint;
  select distinct name
    into :namelist separated by ' '
    from have;
quit;

/* SQLOBS is set by the SELECT above; it is used below as the number of names. */
%let n = &sqlobs;
%MACRO RETYA ;
  /* Runs RET once per name held in the macro variable namelist and stacks each
     HEREGONE_<name> table into BASE. Expects namelist (blank separated names)
     and n (number of names) to be set before the call.
     Fix: the garbled token that stood in for the currentvalue macro variable
     reference (an HTML entity decoding artifact) is restored. */
  %local i currentvalue;
  OPTIONS NONOTES;
  %do i = 1 %to &n;
    /* namelist is blank separated, so split on blanks only */
    %let currentvalue = %scan(&namelist, &i, %str( ));
    %put &currentvalue;
    %put &i;
    %RET(&currentvalue);
    %if &i = 1 %then %do;
      /* the first name seeds BASE */
      data base; set HEREGONE_&currentvalue; run;
    %end;
    %if &i gt 1 %then %do;
      /* later names are unioned onto BASE, then their own table is dropped */
      proc sql;
        create table base as
        select * from base
        union
        select * from HEREGONE_&currentvalue
        ;
        drop table HEREGONE_&currentvalue;
      quit;
    %end;
  %end;
  OPTIONS NOTES;
%MEND;
/* Build BASE for all names, then order it by name and term and list it. */
%retya;

proc sort data=base;
  by name terms;
run;

proc print data=base;
run;
所以现在我得到了想要的结果,但是当有 6,000 个名字时,需要 20 多分钟。
让我们尝试另一种解决方案。对于每个名称,先用 proc SQL 找到最小/最大的 term,然后用数据步创建包含各个时间段的表,并将其与原始表合并。
* Sample data with term read as a numeric variable;
data have;
  input name $ term;
  datalines;
Joe 2000
Joe 2002
Joe 2008
Sally 2001
Sally 2003
;
run;
* First and last term observed for each name;
proc sql;
  create table terms as
    select
      name,
      min(term) as term_min,
      max(term) as term_max
    from have
    group by name
    order by name;
quit;
* Expand each name into one row per term from term_min through term_max;
data empty (keep=name term);
  set terms;
  do term = term_min to term_max;
    output;
  end;
run;
* Flag each generated name and term pair by whether it exists in the original data;
proc sql;
  create table want as
    select
      e.name,
      e.term,
      case
        when h.term is missing then 'Gone'
        else 'Here'
      end as status
    from empty as e
    left join have as h
      on  e.name = h.name
      and e.term = h.term
    order by e.name, e.term;
quit;
编辑:现在看看你的宏解决方案,部分问题在于你扫描表的次数太多了。
- 第一个表 studtermlist 不需要,可以改为在最后一个 join 中进行过滤。
- 两个宏变量(最小/最大 term)可以一次计算出来。
- 不要使用较小的临时 term 列表,而是使用 where 子句来过滤结果。
- 使用 Call Execute 调用您的宏,而不是另一个宏循环。
- 不要循环追加数据,而是利用命名约定,用单个数据步一次性追加所有输出。
%MACRO RET(NAME);
  /* Builds _HG_<NAME>: one row per HELP term between the first and last term
     that HAVE records for the given name, with status Here when HAVE holds
     that term for the name and Gone otherwise.
     Changes: the stray second semicolon after the WHERE clause is removed and
     the range macro variables are declared local to this call. */
  %local minterm maxterm;
  proc sql noprint;
    /* A single pass over HAVE yields both ends of the range. */
    select min(term), max(term)
      into :MINTERM, :MAXTERM
      from have
      where name = "&NAME";

    create table _HG_&Name as
      select
        a.terms,
        "&Name" as Name,
        case
          when a.terms eq b.term then 'Here'
          when b.term is null then 'Gone'
        end as status
      from help a
      left join have b
        on a.terms eq b.term
        and b.name = "&name"
      /* the range filter is on the left table, so unmatched rows are kept */
      where a.terms between "&minterm" and "&maxterm";
  quit;
%MEND RET;
/* Call the RET macro once for each distinct name in HAVE. */
proc sort data=have;
  by name term;
run;

data _null_;
  set have;
  by name;
  if first.name then do;
    /* The call is wrapped in %NRSTR so that CALL EXECUTE only queues the macro
       call. Without it the macro would execute while this step is still
       running, and its minterm and maxterm references would be resolved before
       the queued SELECT ... INTO that sets them has run. */
    str = catt('%nrstr(%ret(', name, '));');
    call execute(str);
  end;
run;
/* Stack every data set whose name starts with _hg into ALL. */
data all;
  set _hg: ;
run;
您实际上可以在单个嵌套 SQL 查询中执行此操作。那样会很乱而且难以阅读。
我要把它分成三个部分。
首先,获取不同的名称;
/* Distinct names present in HAVE. */
proc sql noprint;
  create table names as
    select distinct name
    from have;
quit;
二、对名称和 term 做笛卡尔积,得到所有组合。
/* Pair every name with every HELP term (full Cartesian product, no range filter). */
proc sql noprint;
  create table temp as
    select
      n.name,
      h.terms as term
    from names as n
    cross join help as h;
quit;
三、左连接查找匹配项
/* Mark each name and term combination by whether HAVE contains it. */
proc sql noprint;
  create table want as
    select
      t.name,
      t.term,
      case
        when h.term is missing then "gone"
        else "here"
      end as Status
    from temp as t
    left join have as h
      on  t.name = h.name
      and t.term = h.term;
quit;
最后,删除临时表以节省空间;
/* Remove the intermediate TEMP table from the WORK library. */
proc datasets library=work nolist;
  delete temp;
  run;
quit;
如 Reeza 所示,还有其他方法可以做到这一点。正如我上面所说,您可以将所有这些合并到一个 SQL 连接中并获得您想要的结果。根据计算机内存和数据大小,它应该没问题(并且可能更快,因为所有内容都在内存中)。
我将给出类似的答案,以便稍后进行比较。
proc sql;
  /* Distinct term and name pairs from HAVE. */
  create table studtermlist as
    select distinct term, name
    from have;

  /* Last and first term per name. */
  create table MAXMINTERM as
    select
      max(term) as MAXTERM,
      min(term) as MINTERM,
      name
    from have
    group by name;

  /* HELP terms that fall between the first and last term of each name. */
  create table TERMLIST as
    select h.terms, r.name
    from help as h
    inner join MAXMINTERM as r
      on h.terms between r.MINTERM and r.MAXTERM
    order by r.name, h.terms;

  /* Rows of the left join without a match carry a missing TERM. */
  create table HEREGONE as
    select
      t.terms,
      t.Name,
      case
        when t.terms = s.term then 'Here'
        when s.term is missing then 'Gone'
      end as status
    from termlist as t
    left join studtermlist as s
      on  t.terms = s.term
      and t.name = s.name
    order by name, terms;
quit;
/* Single query version: every distinct name is paired with every HELP term,
   then left joined back to HAVE. A missing a.term means the name has no row
   for that term. Fix: the step is now closed with QUIT. */
proc sql;
  create table want as
  select c.name, c.terms, a.term,
         ( case when missing(a.term) then "Gone"
                else "Here" end ) as status
  from (select distinct a.name, b.terms
        from have a, help b) c
  left join have a
    on c.terms = a.term and c.name = a.name
  order by c.name, c.terms, a.term
  ;
quit;
目标:从 have 表和 help 表得到 want 表。当前的实现(如下)很慢。我相信这是一个不该这样使用 SAS 宏的典型反例,但我很好奇:
1. 宏方法能否被挽救,即变得足够快以至于可行(例如,proc append 应该可以加快堆叠数据集的速度,但我没有看到任何性能提升。)
2. 所有的备选方案会是什么样子。
我已经写了一个非宏的解决方案,为了便于比较,我会把它贴在下面。
Data:
* Sample input with one row per name and term attended and term read as character;
data have;
  input name $ term $;
  datalines;
Joe 2000
Joe 2002
Joe 2008
Sally 2001
Sally 2003
;
run;

* List the data set that was just created;
proc print data=have;
run;
* Lookup table holding every possible term as a character value;
data help;
  input terms $;
  datalines;
2000
2001
2002
2003
2004
2005
2006
2007
2008
;
run;

* List the data set that was just created;
proc print data=help;
run;
* Expected result with one row per name and term between the first and last term of that name;
data want;
  input name $ term $ status $;
  datalines;
Joe 2000 here
Joe 2001 gone
Joe 2002 here
Joe 2003 gone
Joe 2004 gone
Joe 2005 gone
Joe 2006 gone
Joe 2007 gone
Joe 2008 here
Sally 2001 here
Sally 2002 gone
Sally 2003 here
;
run;

/* NOTE(review): this lists HAVE, not the WANT data set created just above - confirm intent. */
proc print data=have;
run;
我可以写一个小宏,对每个人分别调用,从而得到想要的结果:
%MACRO RET(NAME);
  /* Builds HEREGONE_<NAME>: one row per HELP term lying between the first and
     last term that HAVE records for the given name. A row is flagged Here when
     the term is present in HAVE for that name and Gone when it is not. */
  proc sql;
    /* Distinct terms recorded for this name. */
    create table studtermlist as
      select distinct term
      from have
      where name = "&NAME";

    /* Last and first term for this name, stored in macro variables. */
    select max(term) into :MAXTERM
      from have
      where name = "&NAME";

    select min(term) into :MINTERM
      from have
      where name = "&NAME";

    /* HELP terms inside that range. */
    create table termlist as
      select terms
      from help
      where terms between "&MINTERM." and "&MAXTERM."
      order by terms;

    /* Rows of the left join without a match carry a missing TERM. */
    create table HEREGONE_&Name as
      select
        t.terms,
        "&Name" as Name,
        case
          when t.terms = s.term then 'Here'
          when s.term is missing then 'Gone'
        end as status
      from termlist as t
      left join studtermlist as s
        on t.terms = s.term;
  quit;
%MEND RET;
/* Run the macro for both sample names, then list each result table. */
%ret(Joe);
%ret(Sally);

proc print data=HEREGONE_Joe;
run;

proc print data=HEREGONE_Sally;
run;
但还不完整。如果我循环遍历(大概有很多名字)...
/* Collect every distinct name in HAVE into one blank separated macro variable. */
proc sql noprint;
  select distinct name
    into :namelist separated by ' '
    from have;
quit;

/* SQLOBS is set by the SELECT above; it is used below as the number of names. */
%let n = &sqlobs;
%MACRO RETYA ;
  /* Runs RET once per name held in the macro variable namelist and stacks each
     HEREGONE_<name> table into BASE. Expects namelist (blank separated names)
     and n (number of names) to be set before the call.
     Fix: the garbled token that stood in for the currentvalue macro variable
     reference (an HTML entity decoding artifact) is restored. */
  %local i currentvalue;
  OPTIONS NONOTES;
  %do i = 1 %to &n;
    /* namelist is blank separated, so split on blanks only */
    %let currentvalue = %scan(&namelist, &i, %str( ));
    %put &currentvalue;
    %put &i;
    %RET(&currentvalue);
    %if &i = 1 %then %do;
      /* the first name seeds BASE */
      data base; set HEREGONE_&currentvalue; run;
    %end;
    %if &i gt 1 %then %do;
      /* later names are unioned onto BASE, then their own table is dropped */
      proc sql;
        create table base as
        select * from base
        union
        select * from HEREGONE_&currentvalue
        ;
        drop table HEREGONE_&currentvalue;
      quit;
    %end;
  %end;
  OPTIONS NOTES;
%MEND;
/* Build BASE for all names, then order it by name and term and list it. */
%retya;

proc sort data=base;
  by name terms;
run;

proc print data=base;
run;
所以现在我得到了想要的结果,但是当有 6,000 个名字时,需要 20 多分钟。
让我们尝试另一种解决方案。对于每个名称,先用 proc SQL 找到最小/最大的 term,然后用数据步创建包含各个时间段的表,并将其与原始表合并。
* Sample data with term read as a numeric variable;
data have;
  input name $ term;
  datalines;
Joe 2000
Joe 2002
Joe 2008
Sally 2001
Sally 2003
;
run;
* First and last term observed for each name;
proc sql;
  create table terms as
    select
      name,
      min(term) as term_min,
      max(term) as term_max
    from have
    group by name
    order by name;
quit;
* Expand each name into one row per term from term_min through term_max;
data empty (keep=name term);
  set terms;
  do term = term_min to term_max;
    output;
  end;
run;
* Flag each generated name and term pair by whether it exists in the original data;
proc sql;
  create table want as
    select
      e.name,
      e.term,
      case
        when h.term is missing then 'Gone'
        else 'Here'
      end as status
    from empty as e
    left join have as h
      on  e.name = h.name
      and e.term = h.term
    order by e.name, e.term;
quit;
编辑:现在看看你的宏解决方案,部分问题在于你扫描表的次数太多了。
- 第一个表 studtermlist 不需要,可以改为在最后一个 join 中进行过滤。
- 两个宏变量(最小/最大 term)可以一次计算出来。
- 不要使用较小的临时 term 列表,而是使用 where 子句来过滤结果。
- 使用 Call Execute 调用您的宏,而不是另一个宏循环。
- 不要循环追加数据,而是利用命名约定,用单个数据步一次性追加所有输出。
%MACRO RET(NAME);
  /* Builds _HG_<NAME>: one row per HELP term between the first and last term
     that HAVE records for the given name, with status Here when HAVE holds
     that term for the name and Gone otherwise.
     Changes: the stray second semicolon after the WHERE clause is removed and
     the range macro variables are declared local to this call. */
  %local minterm maxterm;
  proc sql noprint;
    /* A single pass over HAVE yields both ends of the range. */
    select min(term), max(term)
      into :MINTERM, :MAXTERM
      from have
      where name = "&NAME";

    create table _HG_&Name as
      select
        a.terms,
        "&Name" as Name,
        case
          when a.terms eq b.term then 'Here'
          when b.term is null then 'Gone'
        end as status
      from help a
      left join have b
        on a.terms eq b.term
        and b.name = "&name"
      /* the range filter is on the left table, so unmatched rows are kept */
      where a.terms between "&minterm" and "&maxterm";
  quit;
%MEND RET;

/* Call the RET macro once for each distinct name in HAVE. */
proc sort data=have;
  by name term;
run;

data _null_;
  set have;
  by name;
  if first.name then do;
    /* The call is wrapped in %NRSTR so that CALL EXECUTE only queues the macro
       call. Without it the macro would execute while this step is still
       running, and its minterm and maxterm references would be resolved before
       the queued SELECT ... INTO that sets them has run. */
    str = catt('%nrstr(%ret(', name, '));');
    call execute(str);
  end;
run;

/* Stack every data set whose name starts with _hg into ALL. */
data all;
  set _hg: ;
run;
您实际上可以在单个嵌套 SQL 查询中执行此操作。那样会很乱而且难以阅读。
我要把它分成三个部分。
首先,获取不同的名称;
/* Distinct names present in HAVE. */
proc sql noprint;
  create table names as
    select distinct name
    from have;
quit;
二、对名称和 term 做笛卡尔积,得到所有组合。
/* Pair every name with every HELP term (full Cartesian product, no range filter). */
proc sql noprint;
  create table temp as
    select
      n.name,
      h.terms as term
    from names as n
    cross join help as h;
quit;
三、左连接查找匹配项
/* Mark each name and term combination by whether HAVE contains it. */
proc sql noprint;
  create table want as
    select
      t.name,
      t.term,
      case
        when h.term is missing then "gone"
        else "here"
      end as Status
    from temp as t
    left join have as h
      on  t.name = h.name
      and t.term = h.term;
quit;
最后,删除临时表以节省空间;
/* Remove the intermediate TEMP table from the WORK library. */
proc datasets library=work nolist;
  delete temp;
  run;
quit;
如 Reeza 所示,还有其他方法可以做到这一点。正如我上面所说,您可以将所有这些合并到一个 SQL 连接中并获得您想要的结果。根据计算机内存和数据大小,它应该没问题(并且可能更快,因为所有内容都在内存中)。
我将给出类似的答案,以便稍后进行比较。
proc sql;
  /* Distinct term and name pairs from HAVE. */
  create table studtermlist as
    select distinct term, name
    from have;

  /* Last and first term per name. */
  create table MAXMINTERM as
    select
      max(term) as MAXTERM,
      min(term) as MINTERM,
      name
    from have
    group by name;

  /* HELP terms that fall between the first and last term of each name. */
  create table TERMLIST as
    select h.terms, r.name
    from help as h
    inner join MAXMINTERM as r
      on h.terms between r.MINTERM and r.MAXTERM
    order by r.name, h.terms;

  /* Rows of the left join without a match carry a missing TERM. */
  create table HEREGONE as
    select
      t.terms,
      t.Name,
      case
        when t.terms = s.term then 'Here'
        when s.term is missing then 'Gone'
      end as status
    from termlist as t
    left join studtermlist as s
      on  t.terms = s.term
      and t.name = s.name
    order by name, terms;
quit;
/* Single query version: every distinct name is paired with every HELP term,
   then left joined back to HAVE. A missing a.term means the name has no row
   for that term. Fix: the step is now closed with QUIT. */
proc sql;
  create table want as
  select c.name, c.terms, a.term,
         ( case when missing(a.term) then "Gone"
                else "Here" end ) as status
  from (select distinct a.name, b.terms
        from have a, help b) c
  left join have a
    on c.terms = a.term and c.name = a.name
  order by c.name, c.terms, a.term
  ;
quit;