半高效操作数据的 Sas 宏

Sas Macro to semi-efficiently manipulate data

目标：从 have 表 + help 表得到 want 表。当前的实现（如下）很慢。我相信这是一个“不应该怎样使用 SAS 宏”的好例子，但我很好奇：1. 能否挽救宏方法，使其足够快而变得可行（例如，proc append 应该可以加速数据集的堆叠操作，但我没有看到任何性能提升）；2. 其他备选方案会是什么样子。

我已经写了一个不使用宏的解决方案，为了便于比较，我会把它贴在下面。

Data: 
* Sample input with one row per name and term in which that person appears;
data have;
  input name $ term $;
  datalines;
Joe   2000
Joe   2002
Joe   2008
Sally 2001
Sally 2003
;
run;

* List the sample input that was just created;
proc print data=have;
run;

* Lookup table listing the candidate terms with one row per term;
data help;
  input terms $;
  datalines;
2000
2001
2002
2003
2004
2005
2006
2007
2008
;
run;

* List the lookup table that was just created;
proc print data=help;
run;

* Target output with a status of here or gone for each name and term;
data want;
  input name $ term $ status $;
  datalines;
Joe   2000  here
Joe   2001  gone
Joe   2002  here
Joe   2003  gone
Joe   2004  gone
Joe   2005  gone
Joe   2006  gone
Joe   2007  gone
Joe   2008  here
Sally 2001  here
Sally 2002  gone
Sally 2003  here
;
run;

/* NOTE(review): this lists HAVE, not the WANT table created just above -
   confirm whether WANT was intended. */
proc print data=have;
run;

我可以写一个小宏，针对每个人分别得到所需的结果：

/*
 * RET(NAME): build WORK.HEREGONE_<NAME> for a single person.
 *
 * Parameter
 *   NAME  value of HAVE.NAME to process; it is also used as the suffix of
 *         the output data set name.
 *
 * Reads   HAVE (name, term) and HELP (terms).
 * Writes  STUDTERMLIST  distinct terms found for NAME in HAVE
 *         TERMLIST      HELP terms between the first and last of those terms
 *         HEREGONE_<NAME> (terms, Name, status), status being 'Here' when the
 *         term is in STUDTERMLIST and 'Gone' when it is not.
 *
 * Changes from the previous version: HAVE is scanned once instead of three
 * times (MIN/MAX are taken from the already-filtered STUDTERMLIST in a single
 * query), MINTERM/MAXTERM are local to the macro, and NOPRINT stops the
 * SELECT ... INTO statements from producing listing output on every call.
 */
%MACRO RET(NAME);
/* Keep the range bounds out of the global symbol table. */
%local MINTERM MAXTERM ;
proc sql noprint ;
/* The only pass over HAVE: the distinct terms recorded for this name. */
create table studtermlist as
select distinct term
from have
where NAME = "&NAME"
;
/* First and last term, both from the small filtered list in one query. */
select min(term), max(term) into :MINTERM, :MAXTERM
from studtermlist
;
/* Every candidate term inside the first..last range for this name. */
CREATE TABLE TERMLIST AS
SELECT TERMS
FROM HELP
WHERE TERMS BETWEEN "&MINTERM." and "&MAXTERM."
ORDER BY TERMS
;
/* Left join so that terms in the range with no match remain as 'Gone' rows. */
CREATE TABLE HEREGONE_&Name AS
SELECT
a.terms ,
"&Name" as Name,
CASE
WHEN a.terms EQ b.term THEN 'Here'
when b.term is null THEN 'Gone'
end as status
from termlist a left join studtermlist b
 on a.terms eq b.term
;
quit;
%MEND RET ;


/* Build the per-person tables for the two sample names. */
%RET(Joe);
%RET(Sally);

/* List each per-person result. */
proc print data=HEREGONE_Joe;
run;
proc print data=HEREGONE_Sally;
run;

但这还不完整。如果我循环遍历所有名字（估计会有很多）……

/* A procedure is needed for all names: collect every distinct name in HAVE
   into the macro variable NAMELIST, separated by blanks. */
proc sql noprint;
  select distinct name
    into :namelist separated by ' '
    from have;
quit;

/* SQLOBS is read right after the query above and stored as the loop bound N. */
%let n=&sqlobs ;


/*
 * RETYA: run %RET once for every name in NAMELIST and stack the per-name
 * results into WORK.BASE.
 *
 * Reads the macro variables N (number of names) and NAMELIST (the names,
 * split with %SCAN default delimiters) from the enclosing scope, and calls
 * the macro RET defined elsewhere in this program.
 *
 * Changes from the previous version:
 *   - UNION ALL replaces UNION. UNION de-duplicates the whole accumulated
 *     table on every iteration; UNION ALL only concatenates. The rows are
 *     the same as long as each HEREGONE_<name> table has no duplicate rows,
 *     which holds when HELP.TERMS is unique - TODO confirm for real data.
 *   - every per-name table is dropped after it is stacked, including the
 *     first one, which used to be left behind in WORK.
 *   - I and CURRENTVALUE are local, so the loop cannot clobber variables of
 *     the same name in an outer scope.
 *   - the NOTES/NONOTES setting found on entry is restored on exit instead
 *     of NOTES being switched on unconditionally.
 */
%MACRO RETYA ;
%local i currentvalue saved_notes ;
/* GETOPTION returns NOTES or NONOTES, which is replayed in the final OPTIONS statement. */
%let saved_notes = %sysfunc(getoption(notes)) ;
OPTIONS NONOTES ;
%do i = 1 %to &n ;
 %let currentvalue = %scan(&namelist,&i) ;
 %put &currentvalue ;
 %put &i ;
 %RET(&currentvalue);
 %if &i = 1 %then %do ;
  /* The first name starts (and overwrites) BASE. */
  data base; set HEREGONE_&currentvalue; run;
 %end ;
 %else %do ;
  /* Later names are concatenated; the caller sorts BASE afterwards. */
  proc sql ;
  create table base as
  select * from base
  union all
  select * from HEREGONE_&currentvalue
  ;
  quit;
 %end ;
 /* The per-name table has been stacked, so it is no longer needed. */
 proc sql ;
 drop table HEREGONE_&currentvalue;
 quit;
%end ;
OPTIONS &saved_notes ;
%MEND RETYA ;

/* Build BASE for every name, then put it in name and terms order and list it. */
%RETYA ;

proc sort data=base;
  by name terms;
run;

proc print data=base;
run;

这样我就得到了 want 表，但是当有 6,000 个名字时，需要 20 多分钟。

让我们尝试另一种解决方案。先用 proc sql 找出每个名字的最小/最大 term；然后用 data 步为每个名字生成完整的 term 区间表，再将其与原始表连接。

* Sample data where term is read as a numeric variable;
data have;
  input name $ term;
  datalines;
Joe   2000
Joe   2002
Joe   2008
Sally 2001
Sally 2003
;
run;

* One row per name holding the smallest and largest term for that name;
proc sql;
  create table terms as
    select name,
           min(term) as term_min,
           max(term) as term_max
    from have
    group by name
    order by name;
quit;

* Expand each name into one row per term from term_min through term_max;
data empty(drop=term_min term_max);
  set terms;
  do term = term_min to term_max;
    output;
  end;
run;

* Final table where each generated name and term row is marked Here when it also occurs in HAVE and Gone otherwise;
proc sql;
  create table want as
    select grid.name,
           grid.term,
           case
             when obs.term is missing then 'Gone'
             else 'Here'
           end as status
    from empty as grid
    left join have as obs
      on  grid.name = obs.name
      and grid.term = obs.term
    order by grid.name, grid.term;
quit;

编辑：现在来看你的宏解决方案，部分问题在于你扫描表的次数太多了。

  • 第一个表 studtermlist 并不需要，可以改为在最后的 join 中直接过滤。
  • 两个宏变量（min/max term）可以在一次查询中算出。
  • 不必先生成较小的临时 term 列表，直接用 where 子句过滤结果即可。
  • 使用 Call Execute 调用你的宏，而不是再写一个宏循环。
  • 不要在循环中逐个追加数据，而是利用命名约定，用单个 data 步一次性拼接所有输出。

    /*
     * RET(NAME): build WORK._HG_<NAME> for a single person.
     *
     * Parameter
     *   NAME  value of HAVE.NAME to process; it is also used as the suffix
     *         of the output data set name.
     *
     * Reads   HAVE (name, term) and HELP (terms).
     * Writes  _HG_<NAME> (terms, Name, status): one row per HELP term between
     *         the first and last term found for NAME, with status 'Here'
     *         when HAVE has that term for NAME and 'Gone' when it does not.
     *
     * The generated SELECT ... INTO must run before MINTERM and MAXTERM are
     * resolved, so when this macro is started through CALL EXECUTE the call
     * has to be wrapped in %NRSTR.
     *
     * Changes from the previous version: MINTERM/MAXTERM are local to the
     * macro, the stray second statement terminator after the WHERE clause is
     * gone, and the columns in the CASE are qualified.
     */
    %MACRO RET(NAME);
    %local MINTERM MAXTERM ;
    proc sql noprint;

    /* Both range bounds from a single pass over HAVE. */
    SELECT MIN(TERM), Max(TERM) INTO :MINTERM,  :MAXTERM
    FROM HAVE
    WHERE NAME = "&NAME"
    ;

    /* The name filter stays in the ON clause so that unmatched HELP terms
       are kept by the left join and come out as 'Gone'. */
    CREATE TABLE _HG_&Name AS
    SELECT
    a.terms ,
    "&Name" as Name,
    CASE
    WHEN a.terms EQ b.term THEN 'Here'
    when b.term is null THEN 'Gone'
    end as status
    from help a
    left join have b
     on a.terms eq b.term
     and b.name="&name"
     where a.terms between "&minterm" and "&maxterm"
    ;
    quit;
    %MEND RET ;
    
    
    /* Call the macro once per distinct name.
       HAVE is sorted first so that BY-group processing (first.name) works. */
    proc sort data=have;
    by name term;
    run;

    /* CALL EXECUTE runs macro code immediately, while the SAS statements the
       macro generates are only queued until this DATA step has finished. The
       macro reads &minterm/&maxterm, which are set by a SELECT ... INTO inside
       the macro itself, so an unmasked call would resolve them before that
       query has run. %NRSTR defers the whole macro call until the queued text
       is executed after the step. */
    data _null_;
        set have;
        by name;
        if first.name then do;
        str=catt('%nrstr(%ret(', name, '));');
        call execute(str);
        end;
    run;
    
    
    /* Append every per-name result table (all WORK data sets whose name
       starts with _hg) into ALL.
       Each per-name table stores Name as a quoted literal, so its length is
       that of the one name. Without a prior definition the DATA step would
       take the length from the first table read and truncate longer names.
       The never-executed SET below copies the attributes of NAME from HAVE
       at compile time, which is wide enough for every name.
       NOTE(review): the _hg: prefix also picks up tables left over from
       earlier runs - clear them first if names can change between runs. */
    data all;
        if 0 then set have(keep=name);
        set _hg:;
    run;
    

您实际上可以在单个嵌套 SQL 查询中完成这一切，但那样会很乱，而且难以阅读。

我要把它分成三个部分。

首先,获取不同的名称;

* One row per distinct name found in HAVE;
proc sql noprint;
  create table names as
    select distinct name
    from have;
quit;

二、对名称和 term 做笛卡尔积，得到所有组合。

/* Pair every name with every term in HELP.
   NOTE(review): this is not limited to the range between the first and last
   term of each name, so it yields more rows per name than the WANT sample at
   the top of this file shows - confirm which is intended. */
proc sql noprint;
  create table temp as
    select n.name,
           h.terms as term
    from names as n
    cross join help as h;
quit;

三、左连接查找匹配项

* Mark each name and term pair in TEMP as here when HAVE contains it and gone otherwise;
proc sql noprint;
  create table want as
    select grid.name,
           grid.term,
           case
             when obs.term is missing then "gone"
             else "here"
           end as Status
    from temp as grid
    left join have as obs
      on  grid.name = obs.name
      and grid.term = obs.term;
quit;

最后，删除临时表以节省空间：

* Remove the intermediate TEMP table from WORK without printing the directory listing;
proc datasets library=work nolist;
  delete temp;
  run;
quit;

如 Reeza 所示,还有其他方法可以做到这一点。正如我上面所说,您可以将所有这些合并到一个 SQL 连接中并获得您想要的结果。根据计算机内存和数据大小,它应该没问题(并且可能更快,因为所有内容都在内存中)。

我将给出类似的答案,以便稍后进行比较。

proc sql;
  /* Distinct term and name pairs present in HAVE. */
  create table studtermlist as
    select distinct term, name
    from have;

  /* Last and first term per name. */
  create table MAXMINTERM as
    select max(TERM) as MAXTERM,
           min(TERM) as MINTERM,
           name
    from HAVE
    group by name;

  /* Every HELP term lying between a name's first and last term. */
  create table TERMLIST as
    select h.TERMS,
           r.name
    from HELP as h
    inner join MAXMINTERM as r
      on h.TERMS between r.MINTERM and r.MAXTERM
    order by r.name, h.TERMS;

  /* Flag each candidate row: matched in STUDTERMLIST is 'Here', unmatched is 'Gone'. */
  create table HEREGONE as
    select t.terms,
           t.Name,
           case
             when t.terms eq s.term then 'Here'
             when s.term is null then 'Gone'
           end as status
    from termlist as t
    left join studtermlist as s
      on  t.terms eq s.term
      and t.name eq s.name
    order by t.name, t.terms;
quit;
/* Single-query variant: the inline view C holds every distinct combination of
   a name from HAVE and a term from HELP; it is left-joined back to HAVE, and
   status is "Gone" when no matching HAVE row exists, "Here" otherwise.
   The output keeps both c.terms and a.term (a.term is missing on "Gone" rows).
   NOTE(review): no QUIT statement is visible after this query in this view. */
proc sql;
create table want as
select c.name, c.terms, a.term, 
       ( case when missing(a.term) then "Gone"
           else "Here" end ) as status
from (select distinct a.name, b.terms
      from have a, help b) c
left join have a
on c.terms = a.term and c.name = a.name
order by c.name, c.terms, a.term
;