SAS:将 CDF 计算为加权变量中小于等于(<=)任意给定值的观测所占的百分比
SAS calculate CDF as the percentage of values <= any defined values in a weighted variable
想法是,对于数据集中的加权变量 "leadb",从值 B 开始,然后找到 CDF 作为值 <= B 的百分比,以及该百分比的置信区间.
我可以使用 PROC SURVEYMEANS 对指定的任何单个值执行此操作,但我不知道如何让 SAS 同时给我多个百分比。如果我想计算 0 到 max+1 之间各个值的百分比,间隔为 (max+1)/100,我应该如何修改我的代码?
谢谢!
/* Derive the two analysis flags from data2012:
   lead5 - 1 when leadb <= 5, 0 when leadb > 5, missing when leadb is missing
   wocba - 1 for gender = 2 with age 16-49, otherwise 0 */
data Test;
    set data2012;
    /* SAS orders a missing value below every number, so "leadb <= 5" is true
       for a missing leadb. Test for missing first so those rows are not
       counted as lead5 = 1. */
    if missing(leadb) then lead5 = .;
    else if leadb <= 5 then lead5 = 1;
    else lead5 = 0;
    if (gender = 2 and age >= 16 and age <= 49) then wocba = 1;
    else wocba = 0;
run;
/* Estimate the mean of the lead5 indicator with the survey design statements
   (weight, strata, cluster), separately for each wocba domain. The Domain ODS
   table is captured in the mystats dataset. */
proc surveymeans data=Test;
    ods output domain=mystats;
    weight weight2;
    strata stratum;
    cluster psu;
    domain wocba;
    var lead5;
run;
/* Keep only the wocba = 1 domain rows and clamp the confidence limits of the
   lead5 mean (a proportion on the 0-1 scale) to the valid range. */
data mystats;
    set mystats;
    where wocba = 1;
    lower = max(lowerclmean, 0); /* lower limit can fall below 0, but a proportion is >= 0 */
    upper = min(upperclmean, 1); /* upper limit can exceed 1, but a proportion is <= 1 */
run;
/* Print the domain estimate. lead5 flags leadb <= 5, so the title states
   "<= 5" to match what is actually estimated. */
proc print data=mystats;
    title "Proportion of blood lead values <= 5 for women of child-bearing age (16-49)";
    title2 "Weighted by rates of giving birth by age and race";
    title3 "With a 95% confidence interval";
run;
要计算考虑加权调查设计的累积分布函数,您需要执行几个步骤。
- 将数据按您想要的间隔分类。
- 将分箱后的数据交给 PROC SURVEYFREQ 处理,以获得每个间隔的加权百分比
- 使用加权百分比计算累计加权百分比
- 计算 95% 置信区间
(注意:我不确定用单个区间百分比的 StdErr 来构造累积百分比的置信区间是否合理。您必须自己决定。但如果是我,我会使用它。)
查看下面的代码。希望对您有所帮助!
*** BUILD A 200-ROW SYNTHETIC SURVEY DATASET FOR THE EXAMPLE ***;
data have;
    do i = 1 to 200;
        *** ANALYSIS VARIABLE ***;
        leadb = 5 * ranexp(123321);
        *** DESIGN VARIABLES - STRATUM CYCLES THROUGH 1 TO 12 AND PSU IS 1 OR 2 ***;
        stratum = 1 + mod(i, 12);
        psu = ifn(ranuni(456654) > 0.5, 1, 2);
        weight2 = 1000 * ranuni(1991);
        *** DOMAIN INDICATOR - 1 WHEN THE DRAW EXCEEDS 0.7 AND 0 OTHERWISE ***;
        wocba = (ranuni(789987) > 0.7);
        output;
    end;
run;
*** STORE THE SMALLEST AND LARGEST leadb VALUES IN DATASET stats ***;
proc means data=have noprint;
    var leadb;
    output out=stats min(leadb)=min max(leadb)=max;
run;
*** USE MIN/MAX TO CREATE INTERVALS TO BIN THE DATA BY APPLYING A FORMAT ***;
*** CREATE A CONTROL DATASET THAT WILL BE CONVERTED INTO A FORMAT ***;
*** EACH RANGE IS LABELLED WITH ITS OWN LOWER BOUND ***;
data control_dset;
    set stats (drop=_type_ _freq_);
    length label $32;
    *** WIDEN THE OBSERVED RANGE TO WHOLE NUMBERS ***;
    min = floor(min);
    max = ceil(max);
    *** CALCULATE INTERVAL BASED ON MIN AND MAX OF DATA, ROUNDED TO ONE DECIMAL ***;
    interval = round( (max - min + 1)/100 , 0.1);
    /* A narrow data range rounds the width to 0, which cannot produce usable
       bins; fall back to the unrounded width in that case. */
    if interval <= 0 then interval = (max - min + 1)/100;
    fmtname = 'leadfmt';
    type = 'n';
    eexcl = 'Y'; *** END VALUE IS EXCLUDED FROM RANGE ***;
    /* Derive every boundary from an integer counter so that repeated addition
       cannot accumulate floating-point drift. The end of one bin is computed
       with the same expression as the start of the next, so bins stay contiguous. */
    nbins = floor( (max - min)/interval + 1e-9 ) + 1;
    do i = 0 to nbins - 1;
        start = round(min + i*interval, 1e-9);
        end = round(min + (i + 1)*interval, 1e-9);
        /* Store the label as text explicitly instead of relying on a
           numeric-to-character conversion when the format is built. */
        label = strip(put(start, best12.));
        output;
    end;
    drop nbins;
run;
*** CONVERT CONTROL DATASET TO A FORMAT ***;
proc format cntlin=control_dset;
run;
*** APPLY FORMAT TO BIN THE DATA INTO INTERVALS ***;
data start;
    set have;
    /* PUT returns the bin label as text; INPUT converts it to a number
       explicitly instead of relying on the implicit conversion of "+ 0". */
    lead_interval = input(put(leadb, leadfmt.), best32.);
run;
*** WRITE THE ODS TABLE NAMES TO THE LISTING WHILE THE PROCEDURE RUNS ***;
ods trace on / listing;
*** USE SURVEYFREQ TO GET WEIGHTED PERCENTAGES FOR THE BINNED CATEGORIES ***;
*** SURVEYFREQ HAS NO DOMAIN STATEMENT, SO THE DOMAIN VARIABLE GOES INTO THE TABLE REQUEST ***;
*** THE ROW PERCENTS WITHIN THE DOMAIN LEVEL ARE THEN READ FROM THE OUTPUT ***;
proc surveyfreq data=start;
    weight weight2;
    strata stratum;
    cluster psu;
    *** DOMAIN / SUBPOPULATION VARIABLE CROSSED WITH THE BINNED VARIABLE ***;
    tables wocba * lead_interval / row;
    *** CAPTURE THE DESIGN SUMMARY AND THE CROSSTABULATION AS DATASETS ***;
    ods output summary=summary crosstabs=crosstabs;
run;
ods trace off;
*** ACCUMULATE THE DOMAIN ROW PERCENTS INTO A RUNNING TOTAL ***;
data really_close (drop=Percent StdErr StdDev);
    set crosstabs;
    *** DISCARD ROWS OUTSIDE THE DOMAIN, THE TOTAL ROW AND EMPTY BINS ***;
    if wocba ne 1 or strip(F_lead_interval) = 'Total' or not (frequency > 0) then delete;
    *** SUM STATEMENT - THE TOTAL STARTS AT 0 AND IS RETAINED ACROSS ROWS ***;
    CumRowPercent + RowPercent;
run;
*** IT IS UNCLEAR HOW LEGITIMATE IT IS TO USE THE RowStdErr WITH THE CUMULATIVE ROW PERCENTS ***;
*** CONSULT YOUR FAVORITE STATISTICIAN FOR A FIRM OPINION ***;
*** DERIVE THE T CRITICAL VALUE FOR A 95 PERCENT CONFIDENCE INTERVAL ***;
data tstat (drop=label1 cvalue1 nvalue1);
    *** READ THE DESIGN SUMMARY PRODUCED BY PROC SURVEYFREQ ***;
    set summary end=lastrec;
    retain nclus nstrat;
    *** PICK UP THE STRATA AND CLUSTER COUNTS FROM THEIR LABELLED ROWS ***;
    select;
        when (index( upcase(Label1), 'STRATA')) nstrat = nvalue1;
        when (index( upcase(Label1), 'CLUSTER')) nclus = nvalue1;
        otherwise;
    end;
    *** DEGREES OF FREEDOM = NUMBER OF CLUSTERS - NUMBER OF STRATA ***;
    df = nclus - nstrat;
    *** TWO-SIDED CRITICAL VALUE, TAKEN AS THE ABSOLUTE LOWER-TAIL QUANTILE ***;
    tstat = abs( quantile('T', 0.025, df) );
    *** ONLY THE FINAL ROW, WHICH HOLDS BOTH COUNTS, IS WRITTEN OUT ***;
    if not lastrec then delete;
run;
*** BUILD THE 95 PERCENT LIMITS AROUND EACH CUMULATIVE ROW PERCENT ***;
data want;
    set really_close;
    *** tstat HAS A SINGLE ROW - READ IT ONCE AND ITS VALUES STAY AVAILABLE ON LATER ROWS ***;
    if _N_ = 1 then set tstat;
    halfwidth = tstat * RowStdErr;
    *** LOWER LIMIT IS FLOORED AT 0 ***;
    CumRowPct_Lower = max(CumRowPercent - halfwidth, 0);
    *** UPPER LIMIT IS CAPPED AT 100 ***;
    CumRowPct_Upper = CumRowPercent + halfwidth;
    if CumRowPct_Upper > 100 then CumRowPct_Upper = 100;
    keep lead_interval CumRowPercent CumRowPct_Lower CumRowPct_Upper;
run;
想法是,对于数据集中的加权变量 "leadb",从值 B 开始,然后找到 CDF 作为值 <= B 的百分比,以及该百分比的置信区间.
我可以使用 PROC SURVEYMEANS 对指定的任何单个值执行此操作,但我不知道如何让 SAS 同时给我多个百分比。如果我想计算 0 到 max+1 之间各个值的百分比,间隔为 (max+1)/100,我应该如何修改我的代码?
谢谢!
/* Derive the two analysis flags from data2012:
   lead5 - 1 when leadb <= 5, 0 when leadb > 5, missing when leadb is missing
   wocba - 1 for gender = 2 with age 16-49, otherwise 0 */
data Test;
    set data2012;
    /* SAS orders a missing value below every number, so "leadb <= 5" is true
       for a missing leadb. Test for missing first so those rows are not
       counted as lead5 = 1. */
    if missing(leadb) then lead5 = .;
    else if leadb <= 5 then lead5 = 1;
    else lead5 = 0;
    if (gender = 2 and age >= 16 and age <= 49) then wocba = 1;
    else wocba = 0;
run;
/* Estimate the mean of the lead5 indicator with the survey design statements
   (weight, strata, cluster), separately for each wocba domain. The Domain ODS
   table is captured in the mystats dataset. */
proc surveymeans data=Test;
    ods output domain=mystats;
    weight weight2;
    strata stratum;
    cluster psu;
    domain wocba;
    var lead5;
run;
/* Keep only the wocba = 1 domain rows and clamp the confidence limits of the
   lead5 mean (a proportion on the 0-1 scale) to the valid range. */
data mystats;
    set mystats;
    where wocba = 1;
    lower = max(lowerclmean, 0); /* lower limit can fall below 0, but a proportion is >= 0 */
    upper = min(upperclmean, 1); /* upper limit can exceed 1, but a proportion is <= 1 */
run;
/* Print the domain estimate. lead5 flags leadb <= 5, so the title states
   "<= 5" to match what is actually estimated. */
proc print data=mystats;
    title "Proportion of blood lead values <= 5 for women of child-bearing age (16-49)";
    title2 "Weighted by rates of giving birth by age and race";
    title3 "With a 95% confidence interval";
run;
要计算考虑加权调查设计的累积分布函数,您需要执行几个步骤。
- 将数据按您想要的间隔分类。
- 将分箱后的数据交给 PROC SURVEYFREQ 处理,以获得每个间隔的加权百分比
- 使用加权百分比计算累计加权百分比
- 计算 95% 置信区间
(注意:我不确定用单个区间百分比的 StdErr 来构造累积百分比的置信区间是否合理。您必须自己决定。但如果是我,我会使用它。)
查看下面的代码。希望对您有所帮助!
*** BUILD A 200-ROW SYNTHETIC SURVEY DATASET FOR THE EXAMPLE ***;
data have;
    do i = 1 to 200;
        *** ANALYSIS VARIABLE ***;
        leadb = 5 * ranexp(123321);
        *** DESIGN VARIABLES - STRATUM CYCLES THROUGH 1 TO 12 AND PSU IS 1 OR 2 ***;
        stratum = 1 + mod(i, 12);
        psu = ifn(ranuni(456654) > 0.5, 1, 2);
        weight2 = 1000 * ranuni(1991);
        *** DOMAIN INDICATOR - 1 WHEN THE DRAW EXCEEDS 0.7 AND 0 OTHERWISE ***;
        wocba = (ranuni(789987) > 0.7);
        output;
    end;
run;
*** STORE THE SMALLEST AND LARGEST leadb VALUES IN DATASET stats ***;
proc means data=have noprint;
    var leadb;
    output out=stats min(leadb)=min max(leadb)=max;
run;
*** USE MIN/MAX TO CREATE INTERVALS TO BIN THE DATA BY APPLYING A FORMAT ***;
*** CREATE A CONTROL DATASET THAT WILL BE CONVERTED INTO A FORMAT ***;
*** EACH RANGE IS LABELLED WITH ITS OWN LOWER BOUND ***;
data control_dset;
    set stats (drop=_type_ _freq_);
    length label $32;
    *** WIDEN THE OBSERVED RANGE TO WHOLE NUMBERS ***;
    min = floor(min);
    max = ceil(max);
    *** CALCULATE INTERVAL BASED ON MIN AND MAX OF DATA, ROUNDED TO ONE DECIMAL ***;
    interval = round( (max - min + 1)/100 , 0.1);
    /* A narrow data range rounds the width to 0, which cannot produce usable
       bins; fall back to the unrounded width in that case. */
    if interval <= 0 then interval = (max - min + 1)/100;
    fmtname = 'leadfmt';
    type = 'n';
    eexcl = 'Y'; *** END VALUE IS EXCLUDED FROM RANGE ***;
    /* Derive every boundary from an integer counter so that repeated addition
       cannot accumulate floating-point drift. The end of one bin is computed
       with the same expression as the start of the next, so bins stay contiguous. */
    nbins = floor( (max - min)/interval + 1e-9 ) + 1;
    do i = 0 to nbins - 1;
        start = round(min + i*interval, 1e-9);
        end = round(min + (i + 1)*interval, 1e-9);
        /* Store the label as text explicitly instead of relying on a
           numeric-to-character conversion when the format is built. */
        label = strip(put(start, best12.));
        output;
    end;
    drop nbins;
run;
*** CONVERT CONTROL DATASET TO A FORMAT ***;
proc format cntlin=control_dset;
run;
*** APPLY FORMAT TO BIN THE DATA INTO INTERVALS ***;
data start;
    set have;
    /* PUT returns the bin label as text; INPUT converts it to a number
       explicitly instead of relying on the implicit conversion of "+ 0". */
    lead_interval = input(put(leadb, leadfmt.), best32.);
run;
*** WRITE THE ODS TABLE NAMES TO THE LISTING WHILE THE PROCEDURE RUNS ***;
ods trace on / listing;
*** USE SURVEYFREQ TO GET WEIGHTED PERCENTAGES FOR THE BINNED CATEGORIES ***;
*** SURVEYFREQ HAS NO DOMAIN STATEMENT, SO THE DOMAIN VARIABLE GOES INTO THE TABLE REQUEST ***;
*** THE ROW PERCENTS WITHIN THE DOMAIN LEVEL ARE THEN READ FROM THE OUTPUT ***;
proc surveyfreq data=start;
    weight weight2;
    strata stratum;
    cluster psu;
    *** DOMAIN / SUBPOPULATION VARIABLE CROSSED WITH THE BINNED VARIABLE ***;
    tables wocba * lead_interval / row;
    *** CAPTURE THE DESIGN SUMMARY AND THE CROSSTABULATION AS DATASETS ***;
    ods output summary=summary crosstabs=crosstabs;
run;
ods trace off;
*** ACCUMULATE THE DOMAIN ROW PERCENTS INTO A RUNNING TOTAL ***;
data really_close (drop=Percent StdErr StdDev);
    set crosstabs;
    *** DISCARD ROWS OUTSIDE THE DOMAIN, THE TOTAL ROW AND EMPTY BINS ***;
    if wocba ne 1 or strip(F_lead_interval) = 'Total' or not (frequency > 0) then delete;
    *** SUM STATEMENT - THE TOTAL STARTS AT 0 AND IS RETAINED ACROSS ROWS ***;
    CumRowPercent + RowPercent;
run;
*** IT IS UNCLEAR HOW LEGITIMATE IT IS TO USE THE RowStdErr WITH THE CUMULATIVE ROW PERCENTS ***;
*** CONSULT YOUR FAVORITE STATISTICIAN FOR A FIRM OPINION ***;
*** DERIVE THE T CRITICAL VALUE FOR A 95 PERCENT CONFIDENCE INTERVAL ***;
data tstat (drop=label1 cvalue1 nvalue1);
    *** READ THE DESIGN SUMMARY PRODUCED BY PROC SURVEYFREQ ***;
    set summary end=lastrec;
    retain nclus nstrat;
    *** PICK UP THE STRATA AND CLUSTER COUNTS FROM THEIR LABELLED ROWS ***;
    select;
        when (index( upcase(Label1), 'STRATA')) nstrat = nvalue1;
        when (index( upcase(Label1), 'CLUSTER')) nclus = nvalue1;
        otherwise;
    end;
    *** DEGREES OF FREEDOM = NUMBER OF CLUSTERS - NUMBER OF STRATA ***;
    df = nclus - nstrat;
    *** TWO-SIDED CRITICAL VALUE, TAKEN AS THE ABSOLUTE LOWER-TAIL QUANTILE ***;
    tstat = abs( quantile('T', 0.025, df) );
    *** ONLY THE FINAL ROW, WHICH HOLDS BOTH COUNTS, IS WRITTEN OUT ***;
    if not lastrec then delete;
run;
*** BUILD THE 95 PERCENT LIMITS AROUND EACH CUMULATIVE ROW PERCENT ***;
data want;
    set really_close;
    *** tstat HAS A SINGLE ROW - READ IT ONCE AND ITS VALUES STAY AVAILABLE ON LATER ROWS ***;
    if _N_ = 1 then set tstat;
    halfwidth = tstat * RowStdErr;
    *** LOWER LIMIT IS FLOORED AT 0 ***;
    CumRowPct_Lower = max(CumRowPercent - halfwidth, 0);
    *** UPPER LIMIT IS CAPPED AT 100 ***;
    CumRowPct_Upper = CumRowPercent + halfwidth;
    if CumRowPct_Upper > 100 then CumRowPct_Upper = 100;
    keep lead_interval CumRowPercent CumRowPct_Lower CumRowPct_Upper;
run;