使用 first.、retain 等方法,根据后续是否再次出现来标记值

Flagging values based on subsequent occurrences using first., retain, etc.

先感谢能帮助我的人。我有一个如下的数据集:

/* Sample purchases: one row per product line on a receipt.                 */
/* Every field is read as character; trx_date stays text such as 11Aug2016. */
data smp;
infile datalines dlm=',';
/* Explicit character widths: without them list input would cap each value  */
/* at the default length of 8 and truncate the ids and product names.       */
informat identifier $10. trx_date $9. transaction_id $20. product_description $50. ;
input identifier $ trx_date transaction_id $ product_description $ ;
datalines;
Cust1,11Aug2016,20-0030417313,ONKEN BIOPOT F/FREE STRAWBERRY
Cust1,11Aug2016,20-0030417313,ONKEN BIOPOT F/FREE STRAWBERRY
Cust1,11Aug2016,20-0030417313,ONKEN BIOPOT FULL STRAWB/GRAIN
Cust1,11Aug2016,20-0030417313,RACHELS YOG GREEK NAT F/F/ORG
Cust1,03Nov2016,23-0040737060,RACHELS YOG GREEK NAT F/F/ORG
Cust3,13Feb2016,39-0070595440,COLLECT YOG LEMON
Cust3,21Jun2016,34-0050769524,AF YOG FARMHOUSE STRAWB/REDCUR
Cust3,21Jun2016,34-0050769524,Y/VALLEY GREEK HONEY ORGANIC
Cust3,21Jun2016,34-0050769524,Y/VALLEY THICK LEMON CURD ORG
Cust3,21Jun2016,34-0050769524,Y/VALLEY THICK YOG FRUITY FAVS
Cust3,21Jun2016,34-0050769524,Y/VALLEY THICK YOG STRAWB ORG
Cust3,26Jun2016,39-0430106897,TOTAL GREEK YOGURT 0%
Cust3,14Aug2016,54-0040266755,M/BUNCH SQUASHUMS STRAW/RASP
Cust3,14Aug2016,54-0040266755,MULLER CORNER STRAWBERRY
Cust3,14Aug2016,54-0040266755,TOTAL GREEK YOGURT 0%
Cust3,22Aug2016,54-0050447336,M/BUNCH SQUASHUMS STRAW/RASP
;

对于每位客户(以及他们基于 transaction_id 的每次购买),我想滚动标记将在他们下次访问(仅下次访问)期间重新购买的每个产品。所以在上面的数据集中,正确的标志将在第 4、12 和 13 行,因为这些产品是在下一次客户访问时购买的(我们只查看下一次访问)。

我正在尝试使用以下程序来完成:

/* Write a sorted copy of SMP to TD: customer in descending order, then */
/* transaction id and product name ascending within each customer.      */
proc sort data=smp out=td;
  by descending identifier
     transaction_id
     product_description;
run;

/* Asker's attempt at a repeat-purchase flag, kept logically as posted.     */
/* Reads TD (already sorted) and writes TD2 with a one-character            */
/* repeat_flag; the helper tmp_product is dropped from the output.          */
DATA TD2(DROP=tmp_product);
SET td;
/* The BY list has to mirror the sort order of TD, including DESCENDING on  */
/* identifier, otherwise the step stops with a "not sorted" error.          */
BY descending identifier transaction_id product_description;
RETAIN tmp_product;
/* Remember the product found on the first row of each transaction. */
IF FIRST.product_description and first.transaction_id THEN DO;
  tmp_product = product_description;
END;
/* One-character Y/N flag. */
ATTRIB repeat_flag FORMAT=$1.;
/* Only a repeated row of the same product inside one transaction reaches   */
/* this branch, so the flag never compares one visit with the next one.     */
IF NOT FIRST.product_description THEN DO;
IF tmp_product EQ product_description THEN repeat_flag ='Y';
ELSE repeat_flag = 'N';
END;
RUN;

/* Re-sort TD2 in place: customer descending, then transaction and product. */
proc sort data=td2;
  by descending identifier
     transaction_id
     product_description;
run;

但是它不起作用?如果有人可以提供帮助,那就太好了。 祝福

下面的方法在排序后向前查看("look ahead")下一行(与 LAG 的方向相反),这样就可以在同一行上完成比较,逻辑更简单:

/* Add TRX_DATE_NUM, a numeric SAS date parsed from the character TRX_DATE, */
/* displayed as mm/dd/yyyy.                                                 */
data smp1;
    set smp;
    format TRX_DATE_NUM mmddyy10.;
    TRX_DATE_NUM = input(TRX_DATE, anydtdte10.);
run;

/* Order SMP1 in place so that repeat purchases of one product by one */
/* customer sit on adjacent rows, earliest date first.                */
proc sort data=smp1;
    by IDENTIFIER
       PRODUCT_DESCRIPTION
       TRX_DATE_NUM;
run;

/* Pair every row of SMP1 with the row that follows it (one-row look-ahead). */
data look_ahead;
    set smp1;
    by IDENTIFIER;
    /* Second read of the same data starting at row 2. The extra one-row,    */
    /* zero-variable read appended at the end keeps this SET from running    */
    /* out one row early, which would otherwise drop the final observation.  */
    set smp1 (firstobs = 2
              keep     = IDENTIFIER PRODUCT_DESCRIPTION TRX_DATE_NUM
              rename   = (IDENTIFIER          = NEXT_ID
                          PRODUCT_DESCRIPTION = NEXT_PROD
                          TRX_DATE_NUM        = NEXT_DT))
        smp1 (obs = 1 drop = _ALL_);
    /* The last row of a customer has no following row for that customer. */
    if last.IDENTIFIER then call missing(NEXT_ID, NEXT_PROD, NEXT_DT);
run;

/* FLAG = 1 when the following row is the same customer buying the same */
/* product on a different date; otherwise FLAG = 0.                     */
data look_ahead_final;
    set look_ahead;
    /* A SAS comparison evaluates to 1 (true) or 0 (false). */
    FLAG = (IDENTIFIER = NEXT_ID
            and NEXT_PROD = PRODUCT_DESCRIPTION
            and TRX_DATE_NUM ne NEXT_DT);
run;

有几种方法可以做到这一点;我认为最简单的理解,同时仍然具有合理的性能水平,是将数据按日期降序排序,然后使用数组存储最后一个 trx_date 的 product_descriptions。

这里我使用了一个二维数组,其中第一维只取 1 或 2 两个值;处理每个 trx_date 时,会向数组的其中一行加载数据,同时检查另一行(由 _array_switch 决定哪一行用于加载、哪一行用于检查)。

您也可以用哈希表(hash table)做同样的事情,速度会明显更快,在某些方面可能还更简单一些;如果您熟悉哈希表并希望看到这种解决方案,请留言,我或其他人可以提供。

您也可以使用 SQL 来完成,我猜这是最常见的解决方案;但我没能让它完全运行起来,因为按我的思路需要在子查询中再嵌套子查询,有一定的复杂性,而我显然还不够擅长这些。

这是数组解决方案。请将 prods 的第二个维度设置为适合您数据的合理最大值——即使设为几千也可以;这是一个临时数组,不会占用太多内存,因此设置为 32000 之类的值也没什么大不了的。

/* Sort SMP in place: customer ascending, latest trx_date first within each. */
/* NOTE(review): assumes trx_date is a numeric SAS date so that DESCENDING   */
/* means reverse-chronological - TODO confirm against the step building SMP. */
proc sort data=smp;
  by identifier
     descending trx_date;
run;

/* Flag each purchase whose product also appears on the previously read     */
/* trx_date of the same customer. SMP is read in descending date order, so  */
/* the previously read date is the chronologically later one.               */
/* Output: all rows of SMP plus next_trans_flag ('Y'/'N').                  */
/* NOTE(review): assumes trx_date sorts chronologically (numeric SAS date)  */
/* - TODO confirm.                                                          */
data want;
  /* Character array: row _array_switch holds the products of the date read */
  /* before this one, row 3-_array_switch is filled for the current date.   */
  /* The second dimension (20) caps the number of rows per date.            */
  array prods[2,20] $255 _temporary_;
  retain _array_switch 2;
  /* One pass of this loop consumes one trx_date group; _n_ counts the rows */
  /* within the group and doubles as the slot index when storing a product. */
  do _n_ = 1 by 1 until (last.trx_date);
    set smp;
    by identifier descending trx_date;
    /* for first row for an identifier, clear out the whole thing */
    if first.identifier then do;
      call missing(of prods[*]);
    end;

    /* for first row of a trx_date, clear out the array-row we were looking at last time, and switch _array_switch to the other value */
    if first.trx_date then do;
      do _i = 1 to dim(prods,2);
        if missing(prods[_array_switch,_i]) then leave;
        call missing(prods[_array_switch,_i]);
      end;
      _array_switch = 3-_array_switch;
    end;

    /* now check the array to see if we should set next_trans_flag */

    next_trans_flag='N';
    do _i = 1 to dim(prods,2);
      /* entries are filled from slot 1 upward, so the first blank ends the scan */
      if missing(prods[_array_switch,_i]) then leave;
      if prods[_array_switch,_i] = product_description then next_trans_flag='Y';
    end;
    /* store this product so the next (earlier) trx_date can look it up */
    prods[3-_array_switch,_n_] = product_description;
    output;
  end;
  /* drop the helper variables whose names start with an underscore */
  drop _:;
run;

我认为要真正回答这个问题,您需要生成一个包含所有"访问 × 产品"组合的列表,以及每次访问实际购买的不同产品的列表。

/* BOUGHT     : the distinct customer / product / date / transaction rows of SMP. */
/* ALL_VISITS : for each customer, every product they ever bought paired with     */
/*              every one of their visits, whether or not it was bought then.     */
proc sql noprint;
  create table bought as
    select distinct
           identifier,
           product_description,
           trx_date,
           transaction_id
      from smp
     order by 1, 2, 3, 4;

  create table all_visits as
    select a.identifier,
           product_description,
           trx_date,
           transaction_id
      from (select distinct identifier, product_description
              from bought) as a
           inner join
           (select distinct identifier, transaction_id, trx_date
              from bought) as b
           /* identifier is the only column the two subqueries share */
           on a.identifier = b.identifier
     order by 1, 2, 3, 4;
quit;

然后您可以将它们组合起来并标记该产品是否是在那次访问中购买的。

/* Mark each customer / product / visit row of ALL_VISITS with bought = 1 */
/* when the same key exists in BOUGHT, and bought = 0 otherwise.          */
data check;
  merge all_visits
        bought (in=in1);
  by identifier product_description trx_date transaction_id;
  /* in1 is 1 when BOUGHT contributed to the current BY group */
  bought = in1;
run;

您现在可以使用 lead 技术来确定他们是否在下次访问时也购买了该产品。

/* Add bought_next: the BOUGHT value from the following row of CHECK. */
data flag;
  set check;
  by identifier product_description trx_date transaction_id;
  /* Second read of CHECK starting at row 2; the appended one-row,      */
  /* zero-variable read stops this SET from ending the step a row early. */
  set check (firstobs=2 keep=bought rename=(bought=bought_next))
      check (obs=1 drop=_all_);
  /* The last row of a product has no following row for that product. */
  if last.product_description then bought_next = 0;
run;

然后您可以结合实际购买并消除多余的虚拟记录。

/* Sort SMP in place by the same key the following merge uses. */
proc sort data=smp;
  by identifier
     product_description
     trx_date
     transaction_id;
run;

/* Attach the FLAG variables to the real purchase rows of SMP and drop the */
/* combinations that exist only in FLAG.                                   */
data want;
  merge flag
        smp (in=in1);
  by identifier product_description trx_date transaction_id;
  /* keep only BY groups that SMP contributed to */
  if not in1 then delete;
run;

让我们把记录放回原来的顺序,这样我们就可以检查结果了。

/* Sort and then print the most recently created data set: neither procedure */
/* names one with DATA=, so both fall back to the last data set created.     */
/* NOTE(review): no step visible here creates a variable named ROW; the sort */
/* needs the input data to carry one (e.g. the original row number) -        */
/* TODO confirm.                                                             */
proc sort; by row; run;
proc print; run;

另一种方法是在原始数据集和一个临时数据集中各生成一个虚拟的组号。在原始数据集中,组号按每位客户的访问时间依次编号;在临时数据集中,组号从每位客户的第二次访问开始编号。这样,临时数据集中某个组号对应的访问,正好是原始数据集中同一组号那次访问的下一次访问。借助这个虚拟组号,就可以很容易地通过哈希表找出他们在下次访问时重新购买的相同产品。

/* Sort SMP in place by customer, then by trx_date ascending.               */
/* NOTE(review): assumes trx_date sorts chronologically (numeric SAS date)  */
/* - TODO confirm against the step that builds SMP.                         */
proc sort data=smp;
  by identifier
     trx_date;
run;

/* Number each customer's visits (distinct trx_date values, in sort order). */
/*   HAVE : every row of SMP, group = 1 for the first visit, 2 for the      */
/*          second, and so on.                                              */
/*   TEMP : rows from the second visit onward, numbered from 1, so a TEMP   */
/*          group number equals the HAVE group number of the visit before.  */
data have(drop=_group) temp(drop=group rename=(_group=group));
   set smp;
   by identifier trx_date;
   if first.identifier then do;
     group=1; _group=0;
   end;
   /* A new trx_date inside the same customer starts the next visit. Using  */
   /* the BY flag works for any variable type and never compares against    */
   /* the last row of the previous customer.                                */
   else if first.trx_date then do;
      group+1; _group+1;
   end;
   /* _group is 0 only on first-visit rows, which are kept out of TEMP */
   if _group^=0 then output temp;
   output have;
 run;

/* For every row of HAVE, set flag = 1 when TEMP holds the same customer, */
/* group number and product, and flag = 0 otherwise.                      */
data want;
    /* Never executed; it only adds TEMP's variables to the program data  */
    /* vector at compile time so the hash object can reference them.      */
    if 0 then set temp;
    if _n_ = 1 then do;
        /* load TEMP into memory once, on the first iteration */
        declare hash nextvisit(dataset: 'temp');
        nextvisit.definekey('identifier', 'group', 'product_description');
        nextvisit.definedata('product_description');
        nextvisit.definedone();
    end;
    set have;
    /* a return code of 0 means the key was found */
    flag = (nextvisit.check() = 0);
    drop group;
run;