SAS:数据分箱
SAS: binning data
/* Score lookup table: each row gives the score for one variable when its
   value lies between low and high. */
data scores;
    infile datalines;
    input variables :$16. low high score;
    datalines;
Debt -10000 1 55
Debt 1 10000 23
MAX_NA -1 1 500
MAX_NA 1 100 -240
;
run;
/* Observations to be scored: one row per ID with its Debt and MAX_NA values. */
data main_data;
    infile datalines;
    input ID Debt MAX_NA;
    datalines;
222554 7584 12
212552 20 0
883123 500 7
913464 -200 -78
;
run;
/* Expected output as stated by the question's author: main_data plus a
   total score per ID. Values are reproduced exactly as given. */
data end_result;
    infile datalines;
    input ID Debt MAX_NA score;
    datalines;
222554 7584 12 -217
212552 20 0 523
883123 500 7 -185
913464 -200 -78 555
;
run;
上面有三个数据集。
scores 数据集给出了每个变量的得分:当变量的取值落在 low 列与 high 列之间的区间内时,取该行的 score。
第二个数据集 main_data 给出了 Debt 和 MAX_NA 的实际取值。
end_result 表是我想要得到的结果。
我应该使用哪些步骤和语句来计算分数并得到 end_result 表?
我不明白为什么 ID 222554 和 883123 的期望分数不同:两者的 Debt 都落在 1 到 10000 的区间(23 分),MAX_NA 都落在 1 到 100 的区间(-240 分),因此都应该得到 -217。
无论如何,下面这个方法可以作为模板使用。
/* Score every row of main_data by summing the scores of all bins in the
   scores data set that its Debt and MAX_NA values fall into.

   Bins are treated as half-open intervals (low, high]. Adjacent bins in
   scores share a boundary value, so a closed test on both ends would add
   the scores of two bins for a value sitting exactly on that boundary.
   Note that a value equal to the lowest low of a variable matches no bin. */
data end_result;
    /* Load the lookup table once. The score column is renamed to s so it
       does not collide with the accumulated score variable below. */
    if _N_ = 1 then do;
        dcl hash h(dataset : "scores(rename=score=s)", multidata : "Y");
        h.definekey("variables");
        h.definedata(all : "Y");
        h.definedone();
        dcl hiter hi("h");
    end;
    set main_data;
    /* Never executed: only makes the compiler define the scores variables
       (variables, low, high, s) that the hash iterator fills in. */
    if 0 then set scores(rename=score=s);
    score = 0;
    /* Walk over every bin; hi.next() returns 0 while an item is available. */
    do while (hi.next() = 0);
        if variables = "Debt" and low < Debt <= high then score + s;
        else if variables = "MAX_NA" and low < MAX_NA <= high then score + s;
    end;
    keep id Debt max_na score;
run;
结果:
ID Debt MAX_NA score
222554 7584 12 -217
212552 20 0 523
883123 500 7 -217
913464 -200 -78 55
另一种方法是使用两次左连接,如下所示:
/* Score lookup table: each row gives the score for one variable when its
   value lies between low and high. */
data scores;
    infile datalines;
    input variables :$16. low high score;
    datalines;
Debt -10000 1 55
Debt 1 10000 23
MAX_NA -1 1 500
MAX_NA 1 100 -240
;
run;
/* Observations to be scored. sortseq records the input row number so the
   original order can be restored after a join. */
data main_data;
    infile datalines;
    input ID Debt MAX_NA;
    sortseq = _n_;
    datalines;
222554 7584 12
212552 20 0
883123 500 7
913464 -200 -78
;
run;
/* Build end_result with two range left joins against scores: one for the
   Debt bins and one for the MAX_NA bins. A value with no matching bin
   contributes 0 through coalesce. Rows come back in the original
   main_data order via sortseq. */
proc sql;
    create table end_result as
    select
        m.ID,
        m.Debt,
        m.MAX_NA,
        coalesce(d.score, 0) + coalesce(n.score, 0) as score
    from main_data as m
    left join scores as d
        on d.variables = "Debt"
       and d.low < m.Debt
       and m.Debt <= d.high
    left join scores as n
        on n.variables = "MAX_NA"
       and n.low < m.MAX_NA
       and m.MAX_NA <= n.high
    order by m.sortseq;
quit;
请注意,我在 main_data 中加入了一个 sortseq 变量,用来保持原有的行顺序。
与 draycut 一样,我为 ID 222554 和 883123 得到了相同的分数。对于 ID 913464,MAX_NA 的值超出了 scores 数据集的范围,因此我用 coalesce 函数将其计为零。于是我得到了如下结果:
ID Debt MAX_NA score
222554 7584 12 -217
212552 20 0 523
883123 500 7 -217
913464 -200 -78 55
更简单的写法:
/* Score every row of main_data by reading all rows of scores with direct
   access (point=) and summing the scores of the bins that match.

   Bins are treated as half-open intervals (low, high]. Adjacent bins in
   scores share a boundary value, so a closed test on both ends would add
   the scores of two bins for a value sitting exactly on that boundary.
   Note that a value equal to the lowest low of a variable matches no bin. */
data end_result(keep=ID Debt MAX_NA score);
    set main_data;
    score = 0;
    /* n is the row count of scores, supplied by nobs= at compile time. */
    do i = 1 to n;
        /* score is renamed to s so it does not overwrite the accumulator. */
        set scores(rename=score=s) point=i nobs=n;
        if variables = "Debt" and low < Debt <= high then score + s;
        else if variables = "MAX_NA" and low < MAX_NA <= high then score + s;
    end;
run;
/* Score lookup table: each row gives the score for one variable when its
   value lies between low and high. */
data scores;
    infile datalines;
    input variables :$16. low high score;
    datalines;
Debt -10000 1 55
Debt 1 10000 23
MAX_NA -1 1 500
MAX_NA 1 100 -240
;
run;
/* Observations to be scored: one row per ID with its Debt and MAX_NA values. */
data main_data;
    infile datalines;
    input ID Debt MAX_NA;
    datalines;
222554 7584 12
212552 20 0
883123 500 7
913464 -200 -78
;
run;
/* Expected output as stated by the question's author: main_data plus a
   total score per ID. Values are reproduced exactly as given. */
data end_result;
    infile datalines;
    input ID Debt MAX_NA score;
    datalines;
222554 7584 12 -217
212552 20 0 523
883123 500 7 -185
913464 -200 -78 555
;
run;
上面有三个数据集。
scores 数据集给出了每个变量的得分:当变量的取值落在 low 列与 high 列之间的区间内时,取该行的 score。
第二个数据集 main_data 给出了 Debt 和 MAX_NA 的实际取值。
end_result 表是我想要得到的结果。
我应该使用哪些步骤和语句来计算分数并得到 end_result 表?
我不明白为什么 ID 222554 和 883123 的期望分数不同:两者的 Debt 都落在 1 到 10000 的区间(23 分),MAX_NA 都落在 1 到 100 的区间(-240 分),因此都应该得到 -217。
无论如何,下面这个方法可以作为模板使用。
/* Score every row of main_data by summing the scores of all bins in the
   scores data set that its Debt and MAX_NA values fall into.

   Bins are treated as half-open intervals (low, high]. Adjacent bins in
   scores share a boundary value, so a closed test on both ends would add
   the scores of two bins for a value sitting exactly on that boundary.
   Note that a value equal to the lowest low of a variable matches no bin. */
data end_result;
    /* Load the lookup table once. The score column is renamed to s so it
       does not collide with the accumulated score variable below. */
    if _N_ = 1 then do;
        dcl hash h(dataset : "scores(rename=score=s)", multidata : "Y");
        h.definekey("variables");
        h.definedata(all : "Y");
        h.definedone();
        dcl hiter hi("h");
    end;
    set main_data;
    /* Never executed: only makes the compiler define the scores variables
       (variables, low, high, s) that the hash iterator fills in. */
    if 0 then set scores(rename=score=s);
    score = 0;
    /* Walk over every bin; hi.next() returns 0 while an item is available. */
    do while (hi.next() = 0);
        if variables = "Debt" and low < Debt <= high then score + s;
        else if variables = "MAX_NA" and low < MAX_NA <= high then score + s;
    end;
    keep id Debt max_na score;
run;
结果:
ID Debt MAX_NA score
222554 7584 12 -217
212552 20 0 523
883123 500 7 -217
913464 -200 -78 55
另一种方法是使用两次左连接,如下所示:
/* Score lookup table: each row gives the score for one variable when its
   value lies between low and high. */
data scores;
    infile datalines;
    input variables :$16. low high score;
    datalines;
Debt -10000 1 55
Debt 1 10000 23
MAX_NA -1 1 500
MAX_NA 1 100 -240
;
run;
/* Observations to be scored. sortseq records the input row number so the
   original order can be restored after a join. */
data main_data;
    infile datalines;
    input ID Debt MAX_NA;
    sortseq = _n_;
    datalines;
222554 7584 12
212552 20 0
883123 500 7
913464 -200 -78
;
run;
/* Build end_result with two range left joins against scores: one for the
   Debt bins and one for the MAX_NA bins. A value with no matching bin
   contributes 0 through coalesce. Rows come back in the original
   main_data order via sortseq. */
proc sql;
    create table end_result as
    select
        m.ID,
        m.Debt,
        m.MAX_NA,
        coalesce(d.score, 0) + coalesce(n.score, 0) as score
    from main_data as m
    left join scores as d
        on d.variables = "Debt"
       and d.low < m.Debt
       and m.Debt <= d.high
    left join scores as n
        on n.variables = "MAX_NA"
       and n.low < m.MAX_NA
       and m.MAX_NA <= n.high
    order by m.sortseq;
quit;
请注意,我在 main_data 中加入了一个 sortseq 变量,用来保持原有的行顺序。 与 draycut 一样,我为 ID 222554 和 883123 得到了相同的分数。对于 ID 913464,MAX_NA 的值超出了 scores 数据集的范围,因此我用 coalesce 函数将其计为零。于是我得到了如下结果:
ID Debt MAX_NA score
222554 7584 12 -217
212552 20 0 523
883123 500 7 -217
913464 -200 -78 55
更简单的写法:
/* Score every row of main_data by reading all rows of scores with direct
   access (point=) and summing the scores of the bins that match.

   Bins are treated as half-open intervals (low, high]. Adjacent bins in
   scores share a boundary value, so a closed test on both ends would add
   the scores of two bins for a value sitting exactly on that boundary.
   Note that a value equal to the lowest low of a variable matches no bin. */
data end_result(keep=ID Debt MAX_NA score);
    set main_data;
    score = 0;
    /* n is the row count of scores, supplied by nobs= at compile time. */
    do i = 1 to n;
        /* score is renamed to s so it does not overwrite the accumulator. */
        set scores(rename=score=s) point=i nobs=n;
        if variables = "Debt" and low < Debt <= high then score + s;
        else if variables = "MAX_NA" and low < MAX_NA <= high then score + s;
    end;
run;