各组最大值的指示变量
Indicator variable for maximum value by groups
对于以下任务,是否有比下面介绍的方法更优雅的方法:
在多个观察值("key2"以下)的每个组("key1"以下)中创建指标变量("MAX_X1"和"MAX_X2"以下),如果这个观察对应于每个组中变量的最大值,否则为 0
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
proc means data=have noprint;
by key1;
var x1 x2;
output out=max
max= / autoname;
run;
data want;
merge have max;
by key1;
drop _:;
run;
proc sql;
title "MAX";
select name into :MAXvars separated by ' '
from dictionary.columns
WHERE LIBNAME="WORK" AND MEMNAME="WANT" AND NAME like "%_Max"
order by name;
quit;
title;
data want; set want;
array MAX (*) &MAXvars;
array XVars (*) x1 x2;
array Indicators (*) MAX_X1 MAX_X2;
do i=1 to dim(MAX);
if XVars[i]=MAX[i] then Indicators[i]=1; else Indicators[i]=0;
end;
drop i;
run;
感谢任何优化建议
Proc sql 可以与 group by 语句一起使用,以允许跨变量值的汇总函数。
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
proc sql;
create table want
as select
key1,
key2,
x1,
x2,
case
when x1 = max(x1) then 1
else 0 end as max_x1,
case
when x2 = max(x2) then 1
else 0 end as max_x2
from have
group by key1
order by key1, key2;
quit;
也可以在单个数据步骤中执行此操作,前提是您读取输入数据集两次 - 这是双 DOW 循环的示例。
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
/*Sort by key1 (or generate index) if not already sorted*/
proc sort data = have;
by key1;
run;
data want;
if 0 then set have;
array xvars[3,2] x1 x2 x1_max_flag x2_max_flag t_x1_max t_x2_max;
/*1st DOW-loop*/
do _n_ = 1 by 1 until(last.key1);
set have;
by key1;
do i = 1 to 2;
xvars[3,i] = max(xvars[1,i],xvars[3,i]);
end;
end;
/*2nd DOW-loop*/
do _n_ = 1 to _n_;
set have;
do i = 1 to 2;
xvars[2,i] = (xvars[1,i] = xvars[3,i]);
end;
output;
end;
drop i t_:;
run;
这可能理解起来有点复杂,所以这里粗略地解释一下它是如何流动的:
- 使用第一个 DOW 循环逐组读取,在读入每一行时更新滚动最大变量。暂时不要输出任何内容。
- 现在使用第二个 DOW 循环再次读取相同的按组,检查每一行是否等于滚动最大值并输出每一行。
- 返回第一个 DOW 循环,读取下一个分组并重复。
对于以下任务,是否有比下面介绍的方法更优雅的方法:
在多个观察值("key2"以下)的每个组("key1"以下)中创建指标变量("MAX_X1"和"MAX_X2"以下),如果这个观察对应于每个组中变量的最大值,否则为 0
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
proc means data=have noprint;
by key1;
var x1 x2;
output out=max
max= / autoname;
run;
data want;
merge have max;
by key1;
drop _:;
run;
proc sql;
title "MAX";
select name into :MAXvars separated by ' '
from dictionary.columns
WHERE LIBNAME="WORK" AND MEMNAME="WANT" AND NAME like "%_Max"
order by name;
quit;
title;
data want; set want;
array MAX (*) &MAXvars;
array XVars (*) x1 x2;
array Indicators (*) MAX_X1 MAX_X2;
do i=1 to dim(MAX);
if XVars[i]=MAX[i] then Indicators[i]=1; else Indicators[i]=0;
end;
drop i;
run;
感谢任何优化建议
Proc sql 可以与 group by 语句一起使用,以允许跨变量值的汇总函数。
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
proc sql;
create table want
as select
key1,
key2,
x1,
x2,
case
when x1 = max(x1) then 1
else 0 end as max_x1,
case
when x2 = max(x2) then 1
else 0 end as max_x2
from have
group by key1
order by key1, key2;
quit;
也可以在单个数据步骤中执行此操作,前提是您读取输入数据集两次 - 这是双 DOW 循环的示例。
data have;
call streaminit(4321);
do key1=1 to 10;
do key2=1 to 5;
do x1=rand("uniform");
x2=rand("Normal");
output;
end;
end;
end;
run;
/*Sort by key1 (or generate index) if not already sorted*/
proc sort data = have;
by key1;
run;
data want;
if 0 then set have;
array xvars[3,2] x1 x2 x1_max_flag x2_max_flag t_x1_max t_x2_max;
/*1st DOW-loop*/
do _n_ = 1 by 1 until(last.key1);
set have;
by key1;
do i = 1 to 2;
xvars[3,i] = max(xvars[1,i],xvars[3,i]);
end;
end;
/*2nd DOW-loop*/
do _n_ = 1 to _n_;
set have;
do i = 1 to 2;
xvars[2,i] = (xvars[1,i] = xvars[3,i]);
end;
output;
end;
drop i t_:;
run;
这可能理解起来有点复杂,所以这里粗略地解释一下它是如何流动的:
- 使用第一个 DOW 循环逐组读取,在读入每一行时更新滚动最大变量。暂时不要输出任何内容。
- 现在使用第二个 DOW 循环再次读取相同的按组,检查每一行是否等于滚动最大值并输出每一行。
- 返回第一个 DOW 循环,读取下一个分组并重复。