各组最大值的指示变量

Indicator variable for maximum value by groups

对于以下任务,是否有比下面介绍的方法更优雅的方法:

在由多个观测(下文的 "key2")组成的每个分组(下文的 "key1")中创建指示变量(下文的 "MAX_X1" 和 "MAX_X2"):如果某个观测的取值等于该组内对应变量的最大值,则指示变量取 1,否则取 0。

/* Demo data: 10 groups (key1) of 5 observations (key2) each.        */
/* x1 is a uniform draw and x2 a normal draw; the seed is fixed so   */
/* the data set is reproducible.                                     */
data have;
  call streaminit(4321);
  do key1 = 1 to 10;
    do key2 = 1 to 5;
      x1 = rand("uniform");
      x2 = rand("Normal");
      output;
    end;
  end;
run;

/* One row per key1 holding the group maxima of x1 and x2.          */
/* AUTONAME appends the statistic to each variable name (x1_Max,    */
/* x2_Max). BY processing expects HAVE to be ordered by key1.       */
proc means data=have noprint;
  by key1;
  var x1 x2;
  output out=max max(x1 x2)= / autoname;
run;

/* Attach each group's maxima to every row of that group.            */
/* Variables whose names start with an underscore (the automatic     */
/* PROC MEANS columns) are left out of the result.                   */
data want (drop=_:);
  merge have max;
  by key1;
run;

/* Store the names of the group-maximum columns of WORK.WANT (those  */
/* ending in "_Max") in macro variable MAXvars, sorted by name and   */
/* separated by blanks.                                              */
proc sql;
    title "MAX";
    select name into :MAXvars separated by ' '
        from dictionary.columns
        /* Single quotes keep the macro processor from treating %_Max as a */
        /* macro call; ESCAPE makes the underscore literal rather than     */
        /* the LIKE single-character wildcard.                             */
        where libname = "WORK"
          and memname = "WANT"
          and name like '%^_Max' escape '^'
        order by name;
quit;
title;

/* Add 0/1 indicators MAX_X1 / MAX_X2: 1 when the row's x1 / x2      */
/* equals the group maximum stored in the columns listed in          */
/* &MAXvars, 0 otherwise. The three arrays are positionally aligned, */
/* so &MAXvars must list the maxima in the same order as x1 x2.      */
data want;
    set want;
    /* Named grp_max rather than MAX so the array does not shadow the */
    /* MAX function (which makes SAS write a warning to the log).     */
    array grp_max    (*) &MAXvars;
    array XVars      (*) x1 x2;
    array Indicators (*) MAX_X1 MAX_X2;
    do i = 1 to dim(grp_max);
        /* A comparison evaluates to 1 (true) or 0 (false). */
        Indicators[i] = (XVars[i] = grp_max[i]);
    end;
    drop i;
run;

感谢任何优化建议

PROC SQL 可以配合 GROUP BY 子句使用,这样汇总函数(如 max)就会在分组变量的每个取值(即每个组)内分别计算。

    /* Reproducible sample: key1 = 1..10, key2 = 1..5 within each key1, */
    /* with one uniform value (x1) and one normal value (x2) per row.   */
    data have;
        call streaminit(4321);
        do key1 = 1 to 10;
            do key2 = 1 to 5;
                x1 = rand("uniform");
                x2 = rand("Normal");
                output;
            end;
        end;
    run;

    /* Flag the rows that carry the per-key1 maximum of x1 and of x2.   */
    /* max() is evaluated within each GROUP BY group and compared with  */
    /* every detail row; each comparison yields 1 (true) or 0 (false).  */
    proc sql;
        create table want as
            select key1,
                   key2,
                   x1,
                   x2,
                   (x1 = max(x1)) as max_x1,
                   (x2 = max(x2)) as max_x2
                from have
                group by key1
                order by key1, key2;
    quit;

也可以在单个数据步骤中执行此操作,前提是您读取输入数据集两次 - 这是双 DOW 循环的示例。

/* Sample input: 10 key1 groups x 5 key2 rows, seeded random x1/x2. */
data have;
  call streaminit(4321);
  do key1 = 1 to 10;
    do key2 = 1 to 5;
      x1 = rand("uniform");
      x2 = rand("Normal");
      output;
    end;
  end;
run;

/* Order HAVE by key1 in place; needed for BY-group processing unless */
/* the data is already sorted (or indexed) on key1.                   */
proc sort data=have out=have;
  by key1;
run;

/* Flag, within each key1 group, the rows that hold the group maximum  */
/* of x1 and of x2. Each iteration of the DATA step processes one BY   */
/* group by reading it twice (a "double DOW loop").                    */
data want;
    /* Never executed; at compile time it places HAVE's variables first in the output. */
    if 0 then set have;
    /* Row 1 = input values, row 2 = 0/1 flags, row 3 = temporary group maxima. */
    array xvars[3,2] x1 x2 x1_max_flag x2_max_flag t_x1_max t_x2_max;
    /* 1st DOW-loop: read one whole key1 group and track the running maxima. */
    /* When the loop ends, _n_ holds the number of rows in the group.        */
    do _n_ = 1 by 1 until(last.key1);
        set have;
        by  key1;
        do i = 1 to 2;
            /* t_* variables start each DATA step iteration as missing and   */
            /* max() ignores missing arguments, so the first row seeds them. */
            xvars[3,i] = max(xvars[1,i],xvars[3,i]);
        end;
    end;
    /* 2nd DOW-loop: this SET statement has its own read position, so it   */
    /* re-reads the same group. The upper bound _n_ is evaluated once, on  */
    /* entry, i.e. the row count found by the first loop.                  */
    do _n_ = 1 to _n_;
        set have;
        do i = 1 to 2;
            /* Comparison yields 1 when the value equals the group maximum, else 0. */
            xvars[2,i] = (xvars[1,i] = xvars[3,i]);
        end;
        output;
    end;
    /* Drop the loop index and the temporary maxima from the output. */
    drop i t_:;
run;

这可能理解起来有点复杂,所以这里粗略地解释一下它是如何流动的:

  • 使用第一个 DOW 循环逐组读取,在读入每一行时更新滚动最大变量。暂时不要输出任何内容。
  • 接着使用第二个 DOW 循环把同一个 BY 组再读一遍,检查每一行是否等于该组的最大值,并输出每一行。
  • 返回第一个 DOW 循环,读取下一个分组并重复。