SAS层次结构总和

Question

我有一个带有分层编码表变量的数据集。层级逻辑由 LEVEL 变量以及字符变量 CODE 的前缀结构决定。共有 6 个聚合（aggregate）级别（代码长度为 1 到 6 个字符）和一个终端级别（代码长度为 10 个字符）。

我需要更新 NODES 变量（即终端节点的个数——聚合级别本身不计入更高（"higher"）级别的聚合，只统计终端节点）。例如，所有 5 级节点的计数之和与所有 6 级节点的计数之和相同。我还需要把权重（WEIGHT）汇总（求和）到更高级别的节点上。

注意：我在期望输出表中按级别对 NODES 和 WEIGHT 两列的值做了缩进，以便您更好地理解我的意思（把同一缩进位置上的数字相加，得到的总和都相同）。

EDIT1：同一个 code 可以对应多条观测（记录）。一条观测由 code + var1 + var2 这 3 个变量的组合唯一确定。

输入table:

ID   level code         var1  var2  nodes  weight  myIndex
1    1     1            .     .     999    999     999
2    2     11           .     .     999    999     999
3    3     111          .     .     999    999     999
4    4     1111         .     .     999    999     999
5    5     11111        .     .     999    999     999
6    6     111111       .     .     999    999     999
7   10     1111119999   01    1     1      0.1     105,5
8   10     1111119999   01    2     1      0.1     109,1
9    6     111112       .     .     999    999     999
10  10     1111120000   01    1     1      0.5      95,0
11   5     11119        .     .     999    999     999
12   6     111190       .     .     999    999     999
13  10     1111901000   01    1     1      0.1      80,7
14  10     1111901000   02    1     1      0.2     105,5

期望的输出table：

ID   level code         var1  var2  nodes    weight              myIndex
1    1     1            .     .     5        1.0                  98,1
2    2     11           .     .     5        1.0                  98,1
3    3     111          .     .     5        1.0                  98,1
4    4     1111         .     .     5        1.0                  98,1
5    5     11111        .     .       3          0.7              98,5
6    6     111111       .     .         2            0.2         107,3
7   10     1111119999   01    1           1               0.1    105,5  
8   10     1111119999   01    2           1               0.1    109,1
9    6     111112       .     .         1            0.5          95,0
10  10     1111120000   01    1           1               0.5     95,0
11   5     11119        .     .       2          0.3              97,2
12   6     111190       .     .         2            0.3          97,2
13  10     1111901000   01    1           1               0.1     80,7
14  10     1111901000   02    1           1               0.2    105,5

这是我写出的代码。它的结果正是我想要的，但实在太慢了。我需要一种更快的方法，因为这是某个 Web 服务的一部分，必须在收到请求时"即时"（instantly）运行。欢迎提出任何加速这段代码的建议或其他解决方案。

/*
 * %doit
 *
 * Fills NODES and WEIGHT on every aggregate row (LEVEL 1-6) of HAVE with the
 * sum of NODES / WEIGHT over the terminal rows (LEVEL = 10) whose CODE starts
 * with the aggregate row's CODE.
 *
 * Reads : HAVE
 * Writes: HAVE (updated in place) and TEMPORARY (a snapshot of HAVE).
 *
 * Changes compared with the first version:
 *   - the level loop now runs down to level 1 (it used to stop at level 2);
 *   - every level is summed directly from the terminal rows. TEMPORARY is
 *     never refreshed, so its aggregate rows still hold the original
 *     placeholder values and must not be used as a source;
 *   - both columns are assigned in a single SET clause.
 */
%macro doit;
    %local i j cc idd;

    /* Snapshot of HAVE: the sub-queries below read this copy, so the UPDATE
       never reads the table it is writing to. Terminal rows are never
       modified, so the snapshot stays valid for the whole run. */
    data temporary;
        set have;
    run;

    %do i = 6 %to 1 %by -1;

        /* Collect the codes of the current aggregate level into &id1..&id&cc */
        proc sql noprint;
            select count(code)
            into :cc trimmed
            from have
            where level = &i;

            select code
            into :id1 - :id&cc
            from have
            where level = &i;
        quit;

        %do j = 1 %to &cc.;

            %let idd = &&id&j;

            /* One UPDATE per aggregate code: sum over its terminal descendants */
            proc sql;
            update have t1
                set nodes = (
                           select sum(nodes)
                           from temporary t2
                           where t2.level = 10 and t2.code like ("&idd" || "%")),
                    weight = (
                           select sum(weight)
                           from temporary t2
                           where t2.level = 10 and t2.code like ("&idd" || "%"))
                where (t1.level = &i and t1.code like "&idd");
            quit;
        %end;
    %end;
%mend doit;

基于@Quentin 解决方案的当前代码：

/* Sample data: aggregate rows (level 1-6) carry missing NODES/WEIGHT/MYINDEX,
   terminal rows (level 10) carry the real values.
   CODE is read as character with the $10. informat; the colon modifier keeps
   list-input behaviour, so shorter codes stop at the first blank. */
data have;
input ID level code : $10. nodes weight myIndex;
cards;
1    1  1            .   .    .
2    2  11           .   .    .
3    3  111          .   .    .
4    4  1111         .   .    .
5    5  11111        .   .    .
6    6  111111       .   .    .
7   10  1111110000   1   0.1  105.5
8   10  1111119999   1   0.1  109.1
9    6  111112       .   .    .
10  10  1111129999   1   0.5  95.0
11   5  11119        .   .    .
12   6  111190       .   .    .
13  10  1111900000   1   0.1  80.7
14  10  1111901000   1   0.2  105.5
;

/* Builds WANT from HAVE: for every non-terminal row, NODES, WEIGHT and the
   weight-averaged MYINDEX are recomputed from all terminal rows (level 10)
   whose code starts with the row's code. Terminal rows are copied unchanged.
   All helper variables start with "_" and are dropped from the output. */
data want (drop=_:);

    *hash table of terminal nodes, keyed by ID so duplicate codes are kept;
    if (_n_ = 1) then do;
        if (0) then set have (rename=(code=_code weight=_weight));
        declare hash h(dataset:'have(where=(level=10) rename=(code=_code weight=_weight myIndex=_myIndex))');
        declare hiter iter('h');
        h.definekey('ID');
        h.definedata('_code','_weight','_myIndex');
        h.definedone();
    end;

    set have;

    *for each non-terminal node, iterate through;
    *hash table of all terminal nodes, looking for children;
    if level ne 10 then do;
        call missing(weight, nodes, myIndex);

        *_rc holds the iterator return code: 0 while an item was fetched;
        _rc = iter.first();
        do while (_rc = 0);
            *=: compares only as many characters as the shorter operand has;
            if trim(code) =: _code then do;
                weight=sum(weight,_weight);
                nodes=sum(nodes,1);
                myIndex=sum(myIndex,_myIndex*_weight);
            end;
            _rc = iter.next();
        end;

        *divide only when a usable total weight exists;
        if weight not in (., 0) then myIndex=round(myIndex/weight,.1);
        else call missing(myIndex);
    end;
    output;
run;

Answer 1

一种方法（我认为）是制作笛卡尔积，并找到与每个节点 "match" 相关的所有终端节点，然后对权重求和。

类似于：

/* Sample data for the SQL approach. CODE is read as character with the $10.
   informat; the colon modifier keeps list-input behaviour, so shorter codes
   stop at the first blank. */
data have;
  input ID level code : $10. nodes weight ;
  cards;
1    1  1            .   .
2    2  11           .   .
3    3  111          .   .
4    4  1111         .   .
5    5  11111        .   .
6    6  111111       .   .
7   10  1111110000   1   0.1
8   10  1111119999   1   0.1
9    6  111112       .   .
10  10  1111129999   1   0.5
11   5  11119        .   .
12   6  111190       .   .
13  10  1111900000   1   0.1
14  10  1111901000   1   0.2
;


/* Report: one line per distinct code, with the number and the total weight of
   the terminal rows (level 10) whose code starts with that code. */
proc sql;
  select min(agg.id)        as id
       , min(agg.level)     as level
       , agg.code
       , count(term.weight) as nodes    /* number of matching terminal rows */
       , sum(term.weight)   as weight   /* their total weight */
    from have as agg
         inner join
         ( select code, weight
             from have
            where level = 10            /* terminal rows only */
         ) as term
         on agg.code eqt term.code      /* EQT: truncated comparison, like =: */
   group by agg.code
  ;
quit;

我不确定这是否正确，但它给出了示例数据所需的结果。

Answer 2

下面是一种蛮力哈希方法，它执行的笛卡尔积与 SQL 方案类似。先把终端节点加载到哈希表中；然后读取节点数据集，对每个非终端节点遍历该哈希表，找出它的所有终端子节点。

我认为@joop 描述的方法可能更有效，因为这种方法没有利用树结构。所以有很多重新计算。对于 5000 条记录和 3000 个终端节点，这将进行 2000*3000 次比较。但可能不会那么慢，因为哈希 table 在内存中，所以你不会有过多的 I/O ....

/* Builds WANT from HAVE: every non-terminal row gets NODES and WEIGHT
   recomputed from the terminal rows (level 10) whose code starts with the
   row's own code. Terminal rows pass through unchanged. Helper variables start
   with "_" and are dropped. */
data want (drop=_:);

   *first iteration only: load the terminal rows into a hash keyed by ID;
   if (_n_ = 1) then do;
      *never executed, it only defines _code and _weight at compile time;
      if (0) then set have (rename=(code=_code weight=_weight));
      declare hash h(dataset:'have(where=(level=10) rename=(code=_code weight=_weight))');
      declare hiter iter('h');
      h.definekey('ID');
      h.definedata('_code','_weight');
      h.definedone();
   end;

   set have;

   if level ne 10 then do;
      *start from missing so SUM() accumulates only the matching terminals;
      call missing(weight, nodes);

      *walk the whole hash, _rc is 0 as long as an item was fetched;
      _rc = iter.first();
      do while (_rc = 0);
         *prefix test, =: compares only the length of the shorter operand;
         if trim(code) =: _code then do;
            nodes  = sum(nodes, 1);
            weight = sum(weight, _weight);
         end;
         _rc = iter.next();
      end;
   end;

   output;
run;

Answer 3

这是为每条记录找出其父（parent）记录所需的 SQL。它只使用字符串函数（POSITION 和 LENGTH），因此应该可以移植到任何 SQL 方言，甚至可能包括 SAS。（CTE 可能需要改写为子查询或视图。）思路是：

向数据集添加一个 parent_id 字段
在 code 是当前记录 code 的前缀的所有记录中，找出 code 最长的那一条
并使用它的 id 作为我们 parent_id
（在那之后）从直接 children 的 sum(nodes),sum(weight) 更新记录（那些 child.parent_id = this.id )

顺便说一句：我本可以使用 LEVEL 而不是 LENGTH(code) ；这方面的数据有点冗余。

-- Step 1: give every row the id of its closest ancestor.
-- A candidate parent is a row whose code is a strictly shorter prefix of the
-- row's code. The NOT EXISTS clause rejects a candidate when a longer prefix
-- that is still shorter than the row's own code also exists.
WITH candidates AS (
    SELECT id, code, length(code) AS len
    FROM tree
)
UPDATE tree AS t
SET parent_id = par.id
FROM candidates AS par
WHERE POSITION(par.code IN t.code) = 1
  AND par.len < length(t.code)
  AND NOT EXISTS (
        SELECT 1
        FROM candidates AS mid
        WHERE POSITION(mid.code IN t.code) = 1
          AND mid.len > par.len
          AND mid.len < length(t.code)
      );

-- Inspect the result: rows grouped by parent, rows without a parent last.
SELECT *
FROM tree
ORDER BY parent_id DESC NULLS LAST, id;

找到父记录之后，应该用表自身的数据来更新整张表（需要重复执行），例如：

-- Step 2: copy the totals of each row's direct children (rows whose
-- parent_id points at it) into the row. Each run uses the values currently
-- stored in the children.
WITH child_totals AS (
    SELECT parent_id,
           SUM(nodes)  AS nodes,
           SUM(weight) AS weight
    FROM tree
    GROUP BY parent_id
)
UPDATE tree AS t
SET nodes  = ct.nodes,
    weight = ct.weight
FROM child_totals AS ct
WHERE t.id = ct.parent_id;

在 SAS 中，这可能通过对 {0-parent_id, id} 进行排序并执行一些保留+求和魔术来完成。（我的SAS在这方面有点生疏）

更新：如果只有叶节点的 {nodes, weight} 为非 NULL（非缺失）值，那么无需先计算 parent_id，只要对整棵树扫描一次即可完成聚合：

-- One-pass alternative: every row receives the totals of all rows below it,
-- i.e. rows with a higher lev whose code starts with this row's code.
WITH subtree_totals AS (
    SELECT anc.id,
           SUM(des.nodes)  AS nodes,
           SUM(des.weight) AS weight
    FROM tree AS anc
    INNER JOIN tree AS des
            ON POSITION(anc.code IN des.code) = 1
           AND des.lev > anc.lev
    GROUP BY anc.id
)
UPDATE tree AS t
SET nodes  = st.nodes,
    weight = st.weight
FROM subtree_totals AS st
WHERE t.id = st.id;

{lev,code} 上的索引可能会加快速度。（假设 id 上有一个索引）

Answer 4

看起来很简单。只需把表与它自身做连接（self-join），然后计算 count/sum。

/* Creates WANT: one row per row of HAVE with
     nodes  = number of terminal rows (level 10) that belong to it
     weight = total weight of those terminal rows.
   An aggregate row owns every terminal row whose code starts with its code.
   A terminal row owns only itself (matched on ID), so several terminal rows
   that share one code (distinguished by var1/var2) are not counted into each
   other. */
proc sql ;
create table want as
 select a.id, a.level, a.code , a.var1, a.var2
      , count(b.id) as nodes
      , sum(b.weight) as weight
 from have a
 left join have b
 on b.level=10
 and (   (a.level ne 10 and a.code eqt b.code)   /* aggregate row: prefix match */
      or a.id = b.id                             /* terminal row: itself only */
     )
 group by 1,2,3,4,5
 order by 1
;
quit;

如果您不想使用 EQT 运算符，则可以改用 SUBSTR() 函数。

 /* Alternative join condition: compare a.code with the first a.level
    characters of the terminal row's code.
    NOTE(review): relies on LEVEL being equal to the length of CODE, as in the
    sample data; confirm for real data. */
 on a.code = substr(b.code,1,a.level)
 and b.level=10

Answer 5

既然您使用的是 SAS，那么使用 proc summary 来完成这里的繁重工作怎么样？不需要笛卡尔连接！

与其他一些选项相比，此选项的一个优点是，如果您想为多个变量计算大量更复杂的统计数据，则它更容易概括。

/* Sample data for the PROC SUMMARY approach. CODE is read as character with
   the $10. informat; the colon modifier keeps list-input behaviour, so shorter
   codes stop at the first blank. MYINDEX is displayed with one decimal. */
data have;
input ID level code : $10. nodes weight myIndex;
format myIndex 5.1;
cards;
1    1  1            .   .    .
2    2  11           .   .    .
3    3  111          .   .    .
4    4  1111         .   .    .
5    5  11111        .   .    .
6    6  111111       .   .    .
7   10  1111110000   1   0.1  105.5
8   10  1111119999   1   0.1  109.1
9    6  111112       .   .    .
10  10  1111129999   1   0.5  95.0
11   5  11119        .   .    .
12   6  111190       .   .    .
13  10  1111900000   1   0.1  80.7
14  10  1111901000   1   0.2  105.5
;
run;


/* View over the terminal rows (level 10) of HAVE that adds lvl1-lvl6: the
   1- to 6-character prefixes of CODE, i.e. the codes of the six aggregate
   levels the row belongs to.
   The array must be character ($6): a numeric array would convert the
   prefixes to numbers, and missing numeric values would later show up as "."
   when the prefixes are concatenated with CATS(). */
data v_have /view = v_have;
  set have(where = (level = 10));
  array lvl[6] $6;
  do i = 1 to 6;
    lvl[i]=substr(code,1,i);
  end;
  drop i;
run;

/* Aggregates the terminal rows once per prefix variable (one-way
   classifications only): sums of NODES and WEIGHT, and the WEIGHT-weighted
   mean of MYINDEX. The automatic underscore variables are dropped from the
   output. */
proc means data=v_have noprint;
  class lvl1 lvl2 lvl3 lvl4 lvl5 lvl6;
  ways 1;                          /* single-variable groupings only */
  var nodes weight;
  var myIndex / weight=weight;     /* the weight applies to myIndex only */
  output out=summary(drop=_:)
         sum(nodes)=
         sum(weight)=
         mean(myIndex)=;
run;

/* View over SUMMARY that rebuilds the CODE column. Each summary row has
   exactly one non-blank lvl variable (one-way classification), so CATS()
   simply returns that prefix. CODE is declared as character with the same
   length as CODE in HAVE so it can serve as the BY variable of the update. */
data v_summary /view = v_summary;
  set summary;
  length code $10;
  code = cats(of lvl:);
  drop lvl:;
run;

/* Applies the summarised values to HAVE in place. For each row of V_SUMMARY
   the master row with the same CODE is located and rewritten.
   _IORC_ is checked explicitly, so a summary code without a matching row in
   HAVE is reported and skipped instead of raising an error condition. */
data have;
  modify have v_summary;
  by code;
  select (_iorc_);
    when (%sysrc(_sok)) replace;              /* match found: rewrite the row */
    when (%sysrc(_dsenmr), %sysrc(_dsemtr)) do;
      /* no master row for this code: report it and carry on */
      put 'NOTE: no row in HAVE for summarised code ' code;
      _error_ = 0;
    end;
    otherwise do;
      put 'ERROR: unexpected I/O condition in MODIFY, _IORC_=' _iorc_;
      stop;
    end;
  end;
run;

理论上，哈希嵌套哈希（hash of hashes）也可能是一种合适的数据结构，但它会极其复杂，而收益相对较小。不过我也许还是会把它当作学习练习试一试……

Answer 6

这是另一种哈希方法。

这不是使用散列对象进行笛卡尔连接，而是将节点和权重从每个级别 10 节点添加到 6 个适用的父节点中的每一个。这可能比 Quentin 的方法稍微快一点，因为没有冗余的哈希查找。

构造哈希对象时，它比 Quentin 的方法花费的时间稍长，占用的内存也更多，因为每个终端节点会以 6 个不同的键各添加一次，而且经常需要更新已有条目；但在此之后，每个父节点只需查找属于它自己的统计数据，而不必循环遍历所有终端节点，这节省了大量时间。

加权统计也是可能的，但你必须更新两个循环，而不仅仅是第二个循环。

/* Builds WANT from HAVE in two passes inside one DATA step.
   Pass 1 reads the terminal rows (level 10) and accumulates, in a hash keyed
   by code prefix, the totals for each of the six prefixes of every terminal
   code: nodes, weight and weight*myIndex.
   Pass 2 re-reads HAVE; every non-terminal row picks up its totals from the
   hash and converts the weighted sum into a weighted mean.
   Helper variables start with "t_" and are dropped. */
data want;
if 0 then set have;
dcl hash h();
h.definekey('code');
h.definedata('nodes','weight','myIndex');
h.definedone();
/* same length as CODE, so the prefixes are valid values for the hash key */
length t_code $10;
do until(eof);
  set have(where = (level = 10)) end = eof;
  t_nodes = nodes;
  t_weight = weight;
  t_myindex = weight * myIndex;
  do _n_ = 1 to 6;
    t_code = substr(code,1,_n_);
    /* first terminal row for this prefix: create the entry */
    if h.find(key:t_code) ne 0 then h.add(key:t_code,data:t_nodes,data:t_weight,data:t_myIndex);
    else do;
      /* find() loaded the running totals into nodes/weight/myIndex */
      nodes + t_nodes;
      weight + t_weight;
      myIndex + t_myIndex;
      h.replace(key:t_code,data:nodes,data:weight,data:MyIndex);
    end;
  end;
end;
do until(eof2);
  set have end = eof2;
  if level ne 10 then do;
    /* capture the return code: an aggregate row without terminal rows has
       no hash entry */
    t_rc = h.find();
    if t_rc ne 0 then call missing(nodes, weight, myIndex);
    else if weight not in (., 0) then myIndex = round(myIndex / weight, 0.1);
    else call missing(myIndex);
  end;
  output;
end;
/* both passes are complete, no further DATA step iteration is needed */
stop;
drop t_:;
run;

SAS层次结构总和

SAS hierarchical structure sum

sql

sum

sas

hierarchy

recursive-query