SAS:对加权变量计算 CDF,即取值 <= 任意给定值的观测所占的(加权)百分比

SAS calculate CDF as the percentage of values <= any defined values in a weighted variable

想法是,对于数据集中的加权变量 "leadb",从值 B 开始,然后找到 CDF 作为值 <= B 的百分比,以及该百分比的置信区间.

我可以使用 PROC SURVEYMEANS 对指定的任何单个值执行此操作,但我不知道如何让 SAS 同时给出多个百分比。如果我想计算 0 到 max+1 之间、间隔为 (max+1)/100 的各个值对应的百分比,我应该如何修改我的代码?

谢谢!

/* Derive the analysis flags used by the survey procedure:
     lead5 - 1 when leadb <= 5, 0 when leadb > 5, missing when leadb is missing
     wocba - 1 when gender = 2 and age is between 16 and 49, otherwise 0 */
data Test;
  set data2012;
  /* SAS orders a missing numeric value below every number, so "leadb <= 5"
     is true for a missing leadb. Test for missing first so those rows stay
     missing instead of being counted as <= 5. */
  if missing(leadb) then lead5 = .;
  else if leadb <= 5 then lead5 = 1;
  else lead5 = 0;
  if (gender = 2 and age >= 16 and age <= 49) then wocba = 1;
  else wocba = 0;
run;

/* Design-based mean of the 0/1 flag lead5, estimated for each level of wocba.
   The Domain ODS table is captured in the data set mystats. */
proc surveymeans data=Test;
  ods output domain=mystats;
  weight weight2;   /* sampling weight */
  strata stratum;   /* design strata */
  cluster psu;      /* primary sampling units */
  domain wocba;     /* subpopulation variable */
  var lead5;
run;

/* Keep the wocba = 1 domain row and clamp the confidence limits of the
   proportion into the valid range [0, 1]. */
data mystats;
  set mystats;
  where wocba = 1;
  lower = max(lowerclmean,0); /* the computed lower limit can fall below 0, but a proportion is >= 0 */
  upper = min(upperclmean,1); /* the computed upper limit can exceed 1, but a proportion is <= 1 */
run;

/* Print the domain estimate. lead5 flags leadb <= 5, so the title reads "<= 5". */
proc print data=mystats;
  title "Proportion of blood lead values <= 5 for women of child-bearing age (16-49)";
  title2 "Weighted by rates of giving birth by age and race";
  title3 "With a 95% confidence interval";
run;

要计算考虑加权调查设计的累积分布函数,您需要执行几个步骤。

  1. 将数据按您想要的间隔分类。
  2. 将分箱后的数据交给 PROC SURVEYFREQ 处理,以获得每个区间的加权百分比
  3. 使用加权百分比计算累计加权百分比
  4. 计算 95% 置信区间

(注意:我不确定用各区间百分比的标准误 (StdErr) 来估计累积百分比的置信区间是否合理。您必须自己决定。但如果是我,我会使用它。)

查看下面的代码。希望对您有所帮助!

*** BUILD A 200-ROW SYNTHETIC SURVEY DATA SET ***;
data have;
    do i = 1 to 200;

        *** ANALYSIS VARIABLE - EXPONENTIAL DRAW SCALED BY 5 ***;
        leadb = 5 * ranexp(123321);

        *** DESIGN VARIABLES - STRATUM 1 TO 12, PSU 1 OR 2, WEIGHT BELOW 1000 ***;
        stratum = 1 + mod(i, 12);
        psu = ifn(ranuni(456654) > 0.5, 1, 2);
        weight2 = 1000 * ranuni(1991);

        *** DOMAIN INDICATOR - THE COMPARISON ITSELF EVALUATES TO 1 OR 0 ***;
        wocba = (ranuni(789987) > 0.7);

        output;
    end;
run;


*** SMALLEST AND LARGEST OBSERVED leadb, WRITTEN TO THE ONE-ROW DATA SET stats ***;
proc means data=have noprint;
    var leadb;
    output out=stats min(leadb)=min max(leadb)=max;
run;


*** USE MIN/MAX TO CREATE INTERVALS TO BIN THE DATA BY APPLYING A FORMAT ***;
*** CREATE A CONTROL DATASET THAT WILL BE CONVERTED INTO A FORMAT ***;
*** EACH OUTPUT ROW DEFINES ONE BIN FROM start UP TO BUT NOT INCLUDING end, LABELLED WITH ITS LOWER BOUND ***;
*** ONLY THE FORMAT-DEFINITION VARIABLES ARE KEPT - THE HELPER NAMES min AND max ARE ALSO CNTLIN ***;
*** VARIABLE NAMES (FORMAT WIDTH LIMITS), SO LEAVING THEM IN COULD BE MISREAD BY PROC FORMAT ***;
data control_dset (keep=fmtname type eexcl start end label);
    set stats (drop=_type_ _freq_);
    min=floor(min);
    max=ceil(max);
    *** CALCULATE INTERVAL BASED ON MIN AND MAX OF DATA ***;
    interval = round( (max - min + 1)/100 , 0.1);
    *** A RANGE NARROWER THAN 5 ROUNDS TO AN INTERVAL OF 0, WHICH IS NOT A VALID DO-LOOP INCREMENT ***;
    *** FALL BACK TO THE UNROUNDED WIDTH IN THAT CASE ***;
    if interval <= 0 then interval = (max - min + 1)/100;

    fmtname = 'leadfmt';
    type = 'n';
    eexcl = 'Y';    *** END VALUE IS EXCLUDED FROM RANGE ***;
    do i = min to max by interval;
        start = i;
        end = i + interval;

        label = start;
        output;
    end;
run;

*** CONVERT CONTROL DATASET TO A FORMAT ***;
proc format cntlin=control_dset;
run;

*** APPLY FORMAT TO BIN THE DATA INTO INTERVALS ***;
data start;
    set have;
    *** PUT() RETURNS THE BIN LABEL AS TEXT AND INPUT() TURNS IT BACK INTO A NUMBER EXPLICITLY, ***;
    *** INSTEAD OF RELYING ON THE IMPLICIT CHARACTER-TO-NUMERIC CONVERSION TRIGGERED BY ADDING 0 ***;
    lead_interval = input(put(leadb, leadfmt.), best32.);
run;


*** WRITE THE NAMES OF THE ODS TABLES PRODUCED BY THE NEXT STEP TO THE LISTING ***;
ods trace on / listing;

*** DESIGN-BASED CROSSTAB OF THE DOMAIN VARIABLE BY THE BINNED LEAD VALUE (PROC SURVEYFREQ) ***;
*** SURVEYFREQ HAS NO DOMAIN STATEMENT, SO THE DOMAIN VARIABLE IS PLACED IN THE TABLES REQUEST ***;
*** AND THE ROW PERCENTS FOR THE DOMAIN OF INTEREST ARE PICKED OUT OF THE OUTPUT AFTERWARDS ***;
proc surveyfreq data=start;
    weight weight2;
    strata stratum;
    cluster psu;
    *** DOMAIN VARIABLE FIRST, BINNED VARIABLE SECOND, ROW PERCENTS REQUESTED ***;
    tables wocba * lead_interval / row;
    *** SAVE THE SUMMARY AND CROSSTABS ODS TABLES AS DATA SETS ***;
    ods output summary=summary crosstabs=crosstabs;
run;

ods trace off;


*** RUNNING TOTAL OF THE ROW PERCENTS WITHIN THE DOMAIN OF INTEREST ***;
data really_close;
    *** READ ONLY wocba = 1 ROWS THAT ARE NOT THE 'Total' LINE AND HAVE A POSITIVE COUNT ***;
    set crosstabs (where=(wocba = 1 and strip(F_lead_interval) ne 'Total' and frequency > 0));

    *** SUM STATEMENT - STARTS AT 0, CARRIES OVER FROM ROW TO ROW, TREATS A MISSING RowPercent AS 0 ***;
    CumRowPercent + RowPercent;

    drop Percent StdErr StdDev;
run;


*** I AM NOT SURE HOW LEGITIMATE IT IS TO USE THE RowStdErr WITH THE CUMULATIVE ROW PERCENTS ***;
*** CONSULT YOUR FAVORITE STATISTICIAN FOR A FIRM OPINION!!! ***;

*** GET T-STATISTIC TO CALCULATE 95% CONFIDENCE INTERVAL ***;
*** OUTPUT IS ONE ROW HOLDING nclus, nstrat, df AND tstat ***;
data tstat;
    *** SUMMARY STATISTICS FROM PROC SURVEYFREQ ***;
    set summary end=lastrec;
    retain nclus nstrat;

    *** PICK THE STRATA AND CLUSTER COUNTS OUT OF THE LABELLED SUMMARY ROWS ***;
    if index( upcase(Label1), 'STRATA') then nstrat = nvalue1;
    else if index( upcase(Label1), 'CLUSTER') then nclus = nvalue1;

    *** COMPUTE ONLY ON THE LAST ROW, ONCE EVERY SUMMARY ROW HAS BEEN READ, SO NO ARITHMETIC ***;
    *** IS DONE ON COUNTS THAT ARE STILL MISSING ***;
    if lastrec then do;
        *** DEGREES OF FREEDOM = NUMBER OF CLUSTERS - NUMBER OF STRATA ***;
        df = nclus - nstrat;

        *** GET T-STATISTIC FOR 95% CONFIDENCE INTERVAL - NEEDS POSITIVE DEGREES OF FREEDOM ***;
        if df > 0 then tstat = abs( quantile('T', 0.05/2, df) );
        else put 'WARNING: degrees of freedom are missing or not positive, tstat is set to missing. ' nclus= nstrat=;

        output;
    end;
    drop label1 cvalue1 nvalue1;
run;


*** ATTACH THE T-STATISTIC TO EVERY BIN AND BUILD THE LIMITS AROUND THE CUMULATIVE PERCENT ***;
data want;
    set really_close ;
    *** tstat HAS ONE ROW - READ IT ONCE AND ITS VALUES ARE RETAINED FOR ALL LATER ROWS ***;
    if _N_ = 1 then set tstat;

    half_width = tstat * RowStdErr;

    *** LOWER LIMIT FLOORED AT 0 (A MISSING LIMIT ALSO BECOMES 0, AS MISSING COMPARES BELOW 0) ***;
    CumRowPct_Lower = max(0, CumRowPercent - half_width);

    *** UPPER LIMIT CAPPED AT 100 (A MISSING LIMIT STAYS MISSING) ***;
    CumRowPct_Upper = CumRowPercent + half_width;
    CumRowPct_Upper = ifn(CumRowPct_Upper > 100, 100, CumRowPct_Upper);

    keep lead_interval CumRowPercent CumRowPct_Lower CumRowPct_Upper;
run;