SAS:数据分箱

SAS: binning data

/* Score lookup table: each row assigns a score to one value range   */
/* (low .. high) of the variable named in the "variables" column.    */
data scores;
    /* The colon modifier with $16. reads one blank-delimited token  */
    /* and gives "variables" a character length of 16.               */
    input variables :$16.
          low
          high
          score;
    datalines;
Debt -10000 1 55
Debt  1 10000 23
MAX_NA -1 1 500
MAX_NA 1 100 -240
;

/* Observations to be scored: one row per ID with its Debt and MAX_NA values. */
data main_data;
    infile datalines;
    input ID
          Debt
          MAX_NA;
    datalines;
    222554 7584 12 
    212552 20 0 
    883123 500 7 
    913464 -200 -78  
;


/* Desired output as stated in the question: main_data plus a total score. */
/* NOTE(review): applying the scores table to these rows gives -217 for    */
/* ID 883123 and 55 for ID 913464, not the -185 and 555 listed here; the   */
/* values below are kept exactly as the question states them.              */
data end_result;
    infile datalines;
    input ID
          Debt
          MAX_NA
          score;
    datalines;
    222554 7584 12 -217
    212552 20 0 523
    883123 500 7 -185 
    913464 -200 -78 555
;

上面你会发现三个数据集。

  1. scores 数据集给出了每个变量的得分:当变量的取值落在 low 列和 high 列之间的区间内时,就得到该行对应的 score。

  2. 第二个数据集main_data显示了Debt和MAX_NA的确切值。

  3. end_result 数据集是我想要得到的结果。

我应该使用什么步骤和语句来计算分数并得到 end_result table?

我不明白为什么 ID 222554 和 883123 没有得到相同的分数:按照 scores 数据集,两者的 Debt 都落在 1 到 10000 的区间(23 分),MAX_NA 都落在 1 到 100 的区间(-240 分),总分都应该是 -217。

无论如何,这里有一个方法可以用作模板。

/* Score every row of main_data: walk all rows of the scores table and   */
/* add the score of each bin that the row's Debt / MAX_NA value falls in. */
/* Bins are half-open, low < value <= high, so a value sitting on the     */
/* boundary shared by two adjacent bins (e.g. Debt = 1) is counted once.  */
/* This matches the interval convention of the PROC SQL solution.         */
data end_result;

   /* One-time setup on the first iteration: load scores into a hash.     */
   if _N_ = 1 then do;
      /* multidata:"Y" keeps every row that shares a "variables" key.     */
      /* score is renamed to s so it does not clash with the output total. */
      dcl hash h(dataset : "scores(rename=(score=s))", multidata : "Y");
      h.definekey("variables");
      h.definedata(all : "Y");
      h.definedone();
      dcl hiter hi("h");
   end;

   set main_data;
   /* Never executed: only makes the compiler create variables, low, high */
   /* and s in the PDV so the hash iterator has host variables to fill.   */
   if 0 then set scores(rename=(score=s));
   score = 0;

   /* Start each traversal explicitly at the first item instead of        */
   /* relying on next() restarting after the previous row's traversal.    */
   rc = hi.first();
   do while (rc = 0);
      if      variables = "Debt"   and low < Debt   <= high then score + s;
      else if variables = "MAX_NA" and low < MAX_NA <= high then score + s;
      rc = hi.next();
   end;

   /* Only the input columns and the total are written; rc, variables,    */
   /* low, high and s are dropped.                                        */
   keep id Debt max_na score;

run;

结果:

ID     Debt  MAX_NA score 
222554 7584  12     -217 
212552 20    0       523 
883123 500   7      -217 
913464 -200 -78       55 

另一种方法是像下面这样使用两次左连接:

/* Score lookup table: each row assigns a score to one value range   */
/* (low .. high) of the variable named in the "variables" column.    */
data scores;
    /* The colon modifier with $16. reads one blank-delimited token  */
    /* and gives "variables" a character length of 16.               */
    input variables :$16.
          low
          high
          score;
    datalines;
Debt -10000 1 55
Debt  1 10000 23
MAX_NA -1 1 500
MAX_NA 1 100 -240
;

/* Observations to be scored, plus sortseq = the row's position in the */
/* input, which the PROC SQL step uses in its ORDER BY clause.         */
data main_data;
    infile datalines;
    input ID
          Debt
          MAX_NA;
    /* _n_ counts DATA step iterations, i.e. the input line number here. */
    sortseq = _n_;
    datalines;
    222554 7584 12 
    212552 20 0 
    883123 500 7 
    913464 -200 -78  
;


/* Score main_data with two independent left joins against scores:      */
/* one picks the Debt bin, the other the MAX_NA bin. A row with no      */
/* matching bin gets a missing score from that join, which COALESCE     */
/* turns into 0 before the two parts are added.                         */
proc sql;
   create table end_result as
      select obs.ID,
             obs.Debt,
             obs.MAX_NA,
             coalesce(debt_bin.score, 0) + coalesce(na_bin.score, 0) as score
      from main_data as obs
      /* Debt bin: low < Debt <= high */
      left join scores(where=(variables="Debt")) as debt_bin
         on  debt_bin.low < obs.Debt
         and obs.Debt <= debt_bin.high
      /* MAX_NA bin: low < MAX_NA <= high */
      left join scores(where=(variables="MAX_NA")) as na_bin
         on  na_bin.low < obs.MAX_NA
         and obs.MAX_NA <= na_bin.high
      /* Order the result rows by the sortseq column of main_data. */
      order by obs.sortseq
   ;
quit;

请注意,我在 main_data 中添加了一个 sortseq 变量,用来保持原始的排列顺序。与 draycut 一样,我得到的 ID 222554 和 883123 的分数相同。对于 ID 913464,MAX_NA 的值超出了 scores 数据集中所有区间的范围,因此我使用 COALESCE 函数将这一部分计为零。最终得到如下结果:

ID     Debt  MAX_NA score 
222554 7584  12     -217 
212552 20    0       523 
883123 500   7      -217 
913464 -200 -78      55 

更简单:

/* Score every row of main_data by reading the whole scores table with  */
/* direct access (POINT=) and adding the score of each matching bin.    */
/* Bins are half-open, low < value <= high, so a value on the boundary  */
/* shared by two adjacent bins (e.g. Debt = 1) is counted once. This    */
/* matches the interval convention of the PROC SQL solution.            */
data end_result(keep=ID Debt MAX_NA score);
   set main_data;
   score = 0;
   /* NOBS= is assigned at compile time, so n already holds the number  */
   /* of rows in scores when the loop bound is evaluated.               */
   do i = 1 to n;
      /* score is renamed to s so it does not clash with the total.     */
      set scores(rename=(score=s)) point=i nobs=n;
      if      variables = "Debt"   and low < Debt   <= high then score + s;
      else if variables = "MAX_NA" and low < MAX_NA <= high then score + s;
   end;
   /* No STOP needed: the step ends when SET main_data hits end of file. */
run;