在 SAS 中批量转换变量
Transforming variables en masse in SAS
我一直在寻找相关的问题,但到目前为止我还没有找到答案。我希望转换一长串自变量以进行回归分析。虚拟数据集如下所示:
/* Step 1: one row per identifier, VAR = 100001 .. 104000. */
data test (drop = k);
    do k = 1 to 4000;
        VAR = k + 100000;
        output;
    end;
run;

/* Step 2: pivot the identifiers into a single wide row with one  */
/* X_<id> column per identifier (the _NAME_ column is discarded). */
proc transpose data   = test
               out    = test_t (drop = _NAME_)
               prefix = X_;
    id  VAR;
    var VAR;
run;

/* Step 3: overwrite the X_ columns with random values.               */
/* OUTPUT sits inside the inner loop, so a row is written after every */
/* single element update: 40 * dim(wide) rows in total.               */
/* NOTE(review): one row per J may have been intended - confirm.      */
data test_array;
    set test_t;
    array wide[*] X_:;
    do J = 1 to 40;
        do I = 1 to dim(wide);
            wide[I] = ranuni(0) * I;
            output;
        end;
    end;
run;
在这个示例中,变量名 X_i 是单调递增的;但实际上,我的变量名是 X_number,其中 number 是六位的唯一标识符。我一直在尝试对所有这些变量做对数变换和平方,以便得到一个包含以下列的新 X 矩阵:
X_133456 X_SQ_133456 LOG_X_133456
我试过像这样遍历所有变量的列表
/* Collect one metadata row per variable of TEST_ARRAY. */
PROC CONTENTS
    DATA = TEST_ARRAY
    OUT = CONTENTS;
RUN;

/* Start from an empty list so the loop below is well defined even */
/* when no variable matches.                                       */
%LET REG_FACTORS =;

/* Keep only the X_<id> variables and store just the <id> suffix.     */
/* The loop below prepends X_, LOG_X_ and X_SQ_ itself, so storing    */
/* the full name would generate references such as X_X_100001.        */
/* '_' is a LIKE wildcard, hence the ESCAPE clause.                   */
PROC SQL NOPRINT;
    SELECT SCAN(NAME, -1, '_')
        INTO :REG_FACTORS SEPARATED BY " "
    FROM CONTENTS
    WHERE UPCASE(NAME) LIKE 'X^_%' ESCAPE '^';
QUIT;

/* Iterative %DO is not allowed in open code, so the code generator */
/* is wrapped in a macro.                                           */
%MACRO MAKE_WANT;
    %LOCAL index factor;
    DATA WANT;
        SET TEST_ARRAY;
        %LET index = 1;
        %LET factor = %SCAN(&REG_FACTORS., &index., %STR( ));
        /* One pair of assignments is generated per suffix; the loop */
        /* stops at the first empty token (end of list).             */
        %DO %WHILE (%LENGTH(&factor.) > 0);
            LOG_X_&factor. = LOG(X_&factor.);
            X_SQ_&factor. = (X_&factor.) ** 2;
            %LET index = %EVAL(&index. + 1);
            %LET factor = %SCAN(&REG_FACTORS., &index., %STR( ));
        %END;
    RUN;
%MEND MAKE_WANT;
%MAKE_WANT;
但这会炸毁我的服务器,我需要找到一种更有效的方法,在此先感谢
编辑(13:04):供各位回答者参考——这是我目前设法做到的部分:
%LET input_factors = X_:;
/* Text before the first underscore of the variable-list pattern (X here). */
%LET factor_prefix = %SCAN(&input_factors., 1, _);

/* Build three parallel, space-separated name lists:                   */
/*   factor_list      original names                                   */
/*   sq_factor_list   names with the prefix replaced by SQ             */
/*   log_factor_list  names with the prefix replaced by LOG            */
PROC SQL;
    SELECT
        NAME
        /* The prefix must be a quoted literal; unquoted it would be */
        /* parsed as a column reference.                             */
      , TRANWRD(NAME, "&factor_prefix.", 'SQ')
      , TRANWRD(NAME, "&factor_prefix.", 'LOG')
    INTO
        :factor_list separated by " "
      , :sq_factor_list separated by " "
      , :log_factor_list separated by " "
    FROM
        contents
    WHERE
        /* A query takes a single WHERE clause; the two conditions are */
        /* combined with AND.                                          */
        /* NOTE(review): VARNUM < 5 keeps only the first four          */
        /* variables - presumably a debugging limit; confirm.          */
        VARNUM < 5
        /* '_' is a LIKE wildcard, so it is escaped; the trailing %    */
        /* lets the pattern match the whole suffix.                    */
        AND NAME LIKE "&factor_prefix.^_%" ESCAPE '^'
    ORDER BY
        INPUT(SCAN(NAME,-1,'_'),8.)
    ;
QUIT;

%PUT &factor_list.;
%PUT &sq_factor_list.;
%PUT &log_factor_list.;
我已经同意了,但是,我认为可以在批准答案之前讨论效率
/* Switches read by %TEST. TRANSFORM_Y is set but is not referenced */
/* anywhere in the macro below.                                     */
%LET TRANSFORM_Y = NO;
%LET TRANSFORM_X_SQ = YES;
%LET TRANSFORM_LOG = YES;

/* %TEST builds a wide dummy dataset, computes squared and/or log       */
/* copies of every column by transposing, transforming COL1..COLn and   */
/* transposing back, then joins the results onto the original rows by   */
/* ROW_NUM. Output table: FULL_DATA.                                    */
%MACRO TEST;
    /* Identifiers 100001 .. 140000, one per row. */
    DATA TEST (DROP = i);
        DO i = 1 TO 40000;
            VAR = i + 100000;
            OUTPUT;
        END;
    RUN;

    /* One wide row with an X_<id> column per identifier. */
    PROC TRANSPOSE
        DATA = TEST
        OUT = TEST_T (DROP = _NAME_)
        PREFIX = X_;
        ID VAR;
        VAR VAR;
    RUN;

    /* A row is written after each element update: dim(X) rows. */
    DATA TEST_ARRAY;
        SET TEST_T;
        ARRAY X[*] X_:;
        DO I = 1 TO DIM(X);
            X[I] = RANUNI(0)*I;
            OUTPUT;
        END;
    RUN;

    /* Add a response Y and a row key. Only I is dropped: no J variable */
    /* is created in this version of TEST_ARRAY.                        */
    DATA TEST_ARRAY_2;
        SET TEST_ARRAY;
        Y = RANUNI(0);
        DROP I;
        ROW_NUM = _N_;
    RUN;

    /* Long layout: one row per original variable (_NAME_), one COLn */
    /* column per original row.                                      */
    PROC TRANSPOSE
        DATA = TEST_ARRAY_2 (DROP = ROW_NUM)
        OUT = TEST_ARRAY_T
        ;
    RUN;

    %IF &TRANSFORM_X_SQ. = YES %THEN %DO;
        /* Square every COLn. No extra numeric variable is left in the */
        /* dataset, since PROC TRANSPOSE without a VAR statement would */
        /* turn it into a spurious output row.                         */
        DATA TESTING_X_SQ (DROP = I);
            SET TEST_ARRAY_T;
            ARRAY COL[*] COL:;
            DO I = 1 TO DIM(COL);
                COL(I) = COL(I)**2;
            END;
        RUN;

        /* Back to wide layout; columns are named SQ_<original name>. */
        PROC TRANSPOSE
            DATA = TESTING_X_SQ
            OUT = X_SQ_T (DROP = _NAME_)
            PREFIX = SQ_
            ;
            ID _NAME_
            ;
        RUN;

        DATA X_SQ_T_2;
            SET X_SQ_T;
            ROW_NUM = _N_;
        RUN;
    %END;

    %IF &TRANSFORM_LOG. = YES %THEN %DO;
        /* Same pattern as the SQ branch: drop the loop index before */
        /* transposing and drop _NAME_ afterwards.                   */
        DATA TESTING_LOG (DROP = I);
            SET TEST_ARRAY_T;
            ARRAY COL[*] COL:;
            DO I = 1 TO DIM(COL);
                COL(I) = LOG(COL(I));
            END;
        RUN;

        /* Back to wide layout; columns are named LOG_<original name>. */
        PROC TRANSPOSE
            DATA = TESTING_LOG
            OUT = LOG_T (DROP = _NAME_)
            PREFIX = LOG_
            ;
            ID _NAME_
            ;
        RUN;

        DATA LOG_T_2;
            SET LOG_T;
            ROW_NUM = _N_;
        RUN;
    %END;

    /* Join the transformed columns back onto the original rows. Both */
    /* joins are keyed on f.row_num so that either branch can be      */
    /* switched off independently.                                    */
    PROC SQL;
        CREATE TABLE FULL_DATA AS
        SELECT
            f.*
            %IF &TRANSFORM_X_SQ. = YES %THEN %DO;
                , x.*
            %END;
            %IF &TRANSFORM_LOG. = YES %THEN %DO;
                , l.*
            %END;
        FROM
            TEST_ARRAY_2 f
            %IF &TRANSFORM_X_SQ. = YES %THEN %DO;
                LEFT JOIN
                X_SQ_T_2 x ON f.row_num = x.row_num
            %END;
            %IF &TRANSFORM_LOG. = YES %THEN %DO;
                LEFT JOIN
                LOG_T_2 l ON f.row_num = l.row_num
            %END;
        ;
    QUIT;
%MEND;
%TEST;
使用 3 个数组:一个用于输入值(例如 X_31415),两个用于新计算的值(对数和平方)。
技巧是根据原始变量名动态生成计算变量的变量名。
/* Use the dictionary table to read the input variable names and to    */
/* generate the matching names for the squared and log variables.      */
/* NOPRINT: only the three macro variables are needed, not a listing.  */
proc sql noprint ;
  select name,                           /* X_31415     */
         tranwrd(name,'X_','X_SQ_'),     /* X_SQ_31415  */
         tranwrd(name,'X_','LOG_X_')     /* LOG_X_31415 */
    into :VARLIST separated by ' ',
         :SQLIST  separated by ' ',
         :LOGLIST separated by ' '
  from dictionary.columns
  where libname = 'WORK'
    and memname = 'MYDATA'
    /* '_' is a single-character wildcard in LIKE; it is escaped so that */
    /* only names that really start with X_ are selected.                */
    and name like 'X^_%' escape '^'
  order by input(scan(name,-1,'_'),8.)   /* order based on the numeric suffix */
  ;
quit ;
现在您可以分配三个数组,循环输入值并相应地计算平方和对数。
/* Three parallel arrays built from the name lists: position k of each */
/* array refers to the same underlying identifier.                     */
data array3 ;
  set mydata ;

  array src{*} &VARLIST ;   /* inputs : X_1 X_17 X_31415 X_99999                 */
  array sqr{*} &SQLIST ;    /* squares: X_SQ_1 X_SQ_17 X_SQ_31415 X_SQ_99999     */
  array lgv{*} &LOGLIST ;   /* logs   : LOG_X_1 LOG_X_17 LOG_X_31415 LOG_X_99999 */

  /* Compute both transforms for every input variable. */
  do i = 1 to dim(src) ;
    sqr{i} = src{i} ** 2 ;
    lgv{i} = log(src{i}) ;
  end ;

  drop i ;
run ;
当变量较多时,可能需要使用SAS File I/O函数对变量进行迭代。此示例创建一个包含 55,000 个响应变量的数据集,并计算它们的平方和对数变换。
/* make_have: build the demo dataset HAVE with 10 rows, a few          */
/* demographic columns and one x_<suffix> column per row of COLS.      */
/*   nvar = approximate number of x_ variables to create (default 10)  */
%macro make_have(nvar=10);
  %local dsid suffix fetch_rc;

  /* Each six-digit value is kept with probability nvar / 900000. */
  data cols;
    do index = 100000 to 999999;
      if ranuni(123) < &nvar / (1e6-1e5) then output;
    end;
  run;

  data have;
    do id = 1 to 10;
      sex = ceil(2*ranuni(123));
      age = 17 + ceil(52*ranuni(123));
      weight = 150 + ceil(100*ranuni(123));

      /* Macro-time loop: walk COLS with the file I/O functions and */
      /* emit one assignment statement per fetched row.             */
      %let dsid = %sysfunc(open (cols));
      %let fetch_rc = %sysfunc(fetch(&dsid));
      %do %while (&fetch_rc = 0);
        %let suffix = %sysfunc(getvarn(&dsid,1));
        x_&suffix = ranuni(123);
        %let fetch_rc = %sysfunc(fetch(&dsid));
      %end;
      %let dsid = %sysfunc(close(&dsid));

      output;
    end;
  run;
%mend;

/* Suppress the echo of generated code, then build the 55,000-variable example. */
options nomprint;
%make_have(nvar=55000);
/* make_transforms: code generator to be called inside a DATA step.     */
/* For every variable of &data whose name starts with &vars it emits    */
/*     <new><suffix> = <function with # replaced by the variable name>; */
/*   data=      dataset whose variable names are scanned                */
/*   vars=      name prefix of the input variables (e.g. x_)            */
/*   new=       name prefix of the generated variables (e.g. x_sq_)     */
/*   function=  expression template; # stands for the input variable    */
%macro make_transforms(data=, vars=, new=, function=);
  %local dsid i varname plen;
  %let plen = %length(&vars);

  %let dsid = %sysfunc(open (&data));
  /* OPEN returns 0 on failure; stop instead of calling ATTRN on 0. */
  %if &dsid = 0 %then %do;
    %put ERROR: make_transforms could not open &data.. %sysfunc(sysmsg());
    %return;
  %end;

  %do i = 1 %to %sysfunc(attrn(&dsid,nvar));
    %let varname = %sysfunc(varname(&dsid,&i));
    /* Only names longer than the prefix can match and leave a suffix; */
    /* this also keeps %substr from being called out of range.         */
    %if %length(&varname) > &plen %then %do;
      /* Variable names are case-insensitive, so compare in upper case. */
      %if %upcase(%substr(&varname,1,&plen)) = %upcase(&vars) %then %do;
        &new.%substr(&varname,&plen+1) = %sysfunc(tranwrd(&function,#,&varname));
      %end;
    %end;
  %end;

  %let dsid = %sysfunc(close(&dsid));
%mend;

/* Generate the squared and log variables and report how long the */
/* macro code generation took.                                    */
data want;
  set have;
  %let t0 = %sysfunc(datetime());
  %make_transforms(data=have, vars=x_, new=x_sq_, function=#**2)
  %make_transforms(data=have, vars=x_, new=x_log_, function=log(#))
  %put NOTE: codegen elapsed: %sysevalf(%sysfunc(datetime())-&t0);
run;
我一直在寻找相关的问题,但到目前为止我还没有找到答案。我希望转换一长串自变量以进行回归分析。虚拟数据集如下所示:
/* Step 1: one row per identifier, VAR = 100001 .. 104000. */
data test (drop = k);
    do k = 1 to 4000;
        VAR = k + 100000;
        output;
    end;
run;

/* Step 2: pivot the identifiers into a single wide row with one  */
/* X_<id> column per identifier (the _NAME_ column is discarded). */
proc transpose data   = test
               out    = test_t (drop = _NAME_)
               prefix = X_;
    id  VAR;
    var VAR;
run;

/* Step 3: overwrite the X_ columns with random values.               */
/* OUTPUT sits inside the inner loop, so a row is written after every */
/* single element update: 40 * dim(wide) rows in total.               */
/* NOTE(review): one row per J may have been intended - confirm.      */
data test_array;
    set test_t;
    array wide[*] X_:;
    do J = 1 to 40;
        do I = 1 to dim(wide);
            wide[I] = ranuni(0) * I;
            output;
        end;
    end;
run;
在这个示例中,变量名 X_i 是单调递增的;但实际上,我的变量名是 X_number,其中 number 是六位的唯一标识符。我一直在尝试对所有这些变量做对数变换和平方,以便得到一个包含以下列的新 X 矩阵:
X_133456 X_SQ_133456 LOG_X_133456
我试过像这样遍历所有变量的列表
/* Collect one metadata row per variable of TEST_ARRAY. */
PROC CONTENTS
    DATA = TEST_ARRAY
    OUT = CONTENTS;
RUN;

/* Start from an empty list so the loop below is well defined even */
/* when no variable matches.                                       */
%LET REG_FACTORS =;

/* Keep only the X_<id> variables and store just the <id> suffix.     */
/* The loop below prepends X_, LOG_X_ and X_SQ_ itself, so storing    */
/* the full name would generate references such as X_X_100001.        */
/* '_' is a LIKE wildcard, hence the ESCAPE clause.                   */
PROC SQL NOPRINT;
    SELECT SCAN(NAME, -1, '_')
        INTO :REG_FACTORS SEPARATED BY " "
    FROM CONTENTS
    WHERE UPCASE(NAME) LIKE 'X^_%' ESCAPE '^';
QUIT;

/* Iterative %DO is not allowed in open code, so the code generator */
/* is wrapped in a macro.                                           */
%MACRO MAKE_WANT;
    %LOCAL index factor;
    DATA WANT;
        SET TEST_ARRAY;
        %LET index = 1;
        %LET factor = %SCAN(&REG_FACTORS., &index., %STR( ));
        /* One pair of assignments is generated per suffix; the loop */
        /* stops at the first empty token (end of list).             */
        %DO %WHILE (%LENGTH(&factor.) > 0);
            LOG_X_&factor. = LOG(X_&factor.);
            X_SQ_&factor. = (X_&factor.) ** 2;
            %LET index = %EVAL(&index. + 1);
            %LET factor = %SCAN(&REG_FACTORS., &index., %STR( ));
        %END;
    RUN;
%MEND MAKE_WANT;
%MAKE_WANT;
但这会炸毁我的服务器,我需要找到一种更有效的方法,在此先感谢
编辑(13:04):供各位回答者参考——这是我目前设法做到的部分:
%LET input_factors = X_:;
/* Text before the first underscore of the variable-list pattern (X here). */
%LET factor_prefix = %SCAN(&input_factors., 1, _);

/* Build three parallel, space-separated name lists:                   */
/*   factor_list      original names                                   */
/*   sq_factor_list   names with the prefix replaced by SQ             */
/*   log_factor_list  names with the prefix replaced by LOG            */
PROC SQL;
    SELECT
        NAME
        /* The prefix must be a quoted literal; unquoted it would be */
        /* parsed as a column reference.                             */
      , TRANWRD(NAME, "&factor_prefix.", 'SQ')
      , TRANWRD(NAME, "&factor_prefix.", 'LOG')
    INTO
        :factor_list separated by " "
      , :sq_factor_list separated by " "
      , :log_factor_list separated by " "
    FROM
        contents
    WHERE
        /* A query takes a single WHERE clause; the two conditions are */
        /* combined with AND.                                          */
        /* NOTE(review): VARNUM < 5 keeps only the first four          */
        /* variables - presumably a debugging limit; confirm.          */
        VARNUM < 5
        /* '_' is a LIKE wildcard, so it is escaped; the trailing %    */
        /* lets the pattern match the whole suffix.                    */
        AND NAME LIKE "&factor_prefix.^_%" ESCAPE '^'
    ORDER BY
        INPUT(SCAN(NAME,-1,'_'),8.)
    ;
QUIT;

%PUT &factor_list.;
%PUT &sq_factor_list.;
%PUT &log_factor_list.;
我已经同意了,但是,我认为可以在批准答案之前讨论效率
/* Switches read by %TEST. TRANSFORM_Y is set but is not referenced */
/* anywhere in the macro below.                                     */
%LET TRANSFORM_Y = NO;
%LET TRANSFORM_X_SQ = YES;
%LET TRANSFORM_LOG = YES;

/* %TEST builds a wide dummy dataset, computes squared and/or log       */
/* copies of every column by transposing, transforming COL1..COLn and   */
/* transposing back, then joins the results onto the original rows by   */
/* ROW_NUM. Output table: FULL_DATA.                                    */
%MACRO TEST;
    /* Identifiers 100001 .. 140000, one per row. */
    DATA TEST (DROP = i);
        DO i = 1 TO 40000;
            VAR = i + 100000;
            OUTPUT;
        END;
    RUN;

    /* One wide row with an X_<id> column per identifier. */
    PROC TRANSPOSE
        DATA = TEST
        OUT = TEST_T (DROP = _NAME_)
        PREFIX = X_;
        ID VAR;
        VAR VAR;
    RUN;

    /* A row is written after each element update: dim(X) rows. */
    DATA TEST_ARRAY;
        SET TEST_T;
        ARRAY X[*] X_:;
        DO I = 1 TO DIM(X);
            X[I] = RANUNI(0)*I;
            OUTPUT;
        END;
    RUN;

    /* Add a response Y and a row key. Only I is dropped: no J variable */
    /* is created in this version of TEST_ARRAY.                        */
    DATA TEST_ARRAY_2;
        SET TEST_ARRAY;
        Y = RANUNI(0);
        DROP I;
        ROW_NUM = _N_;
    RUN;

    /* Long layout: one row per original variable (_NAME_), one COLn */
    /* column per original row.                                      */
    PROC TRANSPOSE
        DATA = TEST_ARRAY_2 (DROP = ROW_NUM)
        OUT = TEST_ARRAY_T
        ;
    RUN;

    %IF &TRANSFORM_X_SQ. = YES %THEN %DO;
        /* Square every COLn. No extra numeric variable is left in the */
        /* dataset, since PROC TRANSPOSE without a VAR statement would */
        /* turn it into a spurious output row.                         */
        DATA TESTING_X_SQ (DROP = I);
            SET TEST_ARRAY_T;
            ARRAY COL[*] COL:;
            DO I = 1 TO DIM(COL);
                COL(I) = COL(I)**2;
            END;
        RUN;

        /* Back to wide layout; columns are named SQ_<original name>. */
        PROC TRANSPOSE
            DATA = TESTING_X_SQ
            OUT = X_SQ_T (DROP = _NAME_)
            PREFIX = SQ_
            ;
            ID _NAME_
            ;
        RUN;

        DATA X_SQ_T_2;
            SET X_SQ_T;
            ROW_NUM = _N_;
        RUN;
    %END;

    %IF &TRANSFORM_LOG. = YES %THEN %DO;
        /* Same pattern as the SQ branch: drop the loop index before */
        /* transposing and drop _NAME_ afterwards.                   */
        DATA TESTING_LOG (DROP = I);
            SET TEST_ARRAY_T;
            ARRAY COL[*] COL:;
            DO I = 1 TO DIM(COL);
                COL(I) = LOG(COL(I));
            END;
        RUN;

        /* Back to wide layout; columns are named LOG_<original name>. */
        PROC TRANSPOSE
            DATA = TESTING_LOG
            OUT = LOG_T (DROP = _NAME_)
            PREFIX = LOG_
            ;
            ID _NAME_
            ;
        RUN;

        DATA LOG_T_2;
            SET LOG_T;
            ROW_NUM = _N_;
        RUN;
    %END;

    /* Join the transformed columns back onto the original rows. Both */
    /* joins are keyed on f.row_num so that either branch can be      */
    /* switched off independently.                                    */
    PROC SQL;
        CREATE TABLE FULL_DATA AS
        SELECT
            f.*
            %IF &TRANSFORM_X_SQ. = YES %THEN %DO;
                , x.*
            %END;
            %IF &TRANSFORM_LOG. = YES %THEN %DO;
                , l.*
            %END;
        FROM
            TEST_ARRAY_2 f
            %IF &TRANSFORM_X_SQ. = YES %THEN %DO;
                LEFT JOIN
                X_SQ_T_2 x ON f.row_num = x.row_num
            %END;
            %IF &TRANSFORM_LOG. = YES %THEN %DO;
                LEFT JOIN
                LOG_T_2 l ON f.row_num = l.row_num
            %END;
        ;
    QUIT;
%MEND;
%TEST;
使用 3 个数组:一个用于输入值(例如 X_31415),两个用于新计算的值(对数和平方)。
技巧是根据原始变量名动态生成计算变量的变量名。
/* Use the dictionary table to read the input variable names and to    */
/* generate the matching names for the squared and log variables.      */
/* NOPRINT: only the three macro variables are needed, not a listing.  */
proc sql noprint ;
  select name,                           /* X_31415     */
         tranwrd(name,'X_','X_SQ_'),     /* X_SQ_31415  */
         tranwrd(name,'X_','LOG_X_')     /* LOG_X_31415 */
    into :VARLIST separated by ' ',
         :SQLIST  separated by ' ',
         :LOGLIST separated by ' '
  from dictionary.columns
  where libname = 'WORK'
    and memname = 'MYDATA'
    /* '_' is a single-character wildcard in LIKE; it is escaped so that */
    /* only names that really start with X_ are selected.                */
    and name like 'X^_%' escape '^'
  order by input(scan(name,-1,'_'),8.)   /* order based on the numeric suffix */
  ;
quit ;
现在您可以分配三个数组,循环输入值并相应地计算平方和对数。
/* Three parallel arrays built from the name lists: position k of each */
/* array refers to the same underlying identifier.                     */
data array3 ;
  set mydata ;

  array src{*} &VARLIST ;   /* inputs : X_1 X_17 X_31415 X_99999                 */
  array sqr{*} &SQLIST ;    /* squares: X_SQ_1 X_SQ_17 X_SQ_31415 X_SQ_99999     */
  array lgv{*} &LOGLIST ;   /* logs   : LOG_X_1 LOG_X_17 LOG_X_31415 LOG_X_99999 */

  /* Compute both transforms for every input variable. */
  do i = 1 to dim(src) ;
    sqr{i} = src{i} ** 2 ;
    lgv{i} = log(src{i}) ;
  end ;

  drop i ;
run ;
当变量较多时,可能需要使用SAS File I/O函数对变量进行迭代。此示例创建一个包含 55,000 个响应变量的数据集,并计算它们的平方和对数变换。
/* make_have: build the demo dataset HAVE with 10 rows, a few          */
/* demographic columns and one x_<suffix> column per row of COLS.      */
/*   nvar = approximate number of x_ variables to create (default 10)  */
%macro make_have(nvar=10);
  %local dsid suffix fetch_rc;

  /* Each six-digit value is kept with probability nvar / 900000. */
  data cols;
    do index = 100000 to 999999;
      if ranuni(123) < &nvar / (1e6-1e5) then output;
    end;
  run;

  data have;
    do id = 1 to 10;
      sex = ceil(2*ranuni(123));
      age = 17 + ceil(52*ranuni(123));
      weight = 150 + ceil(100*ranuni(123));

      /* Macro-time loop: walk COLS with the file I/O functions and */
      /* emit one assignment statement per fetched row.             */
      %let dsid = %sysfunc(open (cols));
      %let fetch_rc = %sysfunc(fetch(&dsid));
      %do %while (&fetch_rc = 0);
        %let suffix = %sysfunc(getvarn(&dsid,1));
        x_&suffix = ranuni(123);
        %let fetch_rc = %sysfunc(fetch(&dsid));
      %end;
      %let dsid = %sysfunc(close(&dsid));

      output;
    end;
  run;
%mend;

/* Suppress the echo of generated code, then build the 55,000-variable example. */
options nomprint;
%make_have(nvar=55000);
/* make_transforms: code generator to be called inside a DATA step.     */
/* For every variable of &data whose name starts with &vars it emits    */
/*     <new><suffix> = <function with # replaced by the variable name>; */
/*   data=      dataset whose variable names are scanned                */
/*   vars=      name prefix of the input variables (e.g. x_)            */
/*   new=       name prefix of the generated variables (e.g. x_sq_)     */
/*   function=  expression template; # stands for the input variable    */
%macro make_transforms(data=, vars=, new=, function=);
  %local dsid i varname plen;
  %let plen = %length(&vars);

  %let dsid = %sysfunc(open (&data));
  /* OPEN returns 0 on failure; stop instead of calling ATTRN on 0. */
  %if &dsid = 0 %then %do;
    %put ERROR: make_transforms could not open &data.. %sysfunc(sysmsg());
    %return;
  %end;

  %do i = 1 %to %sysfunc(attrn(&dsid,nvar));
    %let varname = %sysfunc(varname(&dsid,&i));
    /* Only names longer than the prefix can match and leave a suffix; */
    /* this also keeps %substr from being called out of range.         */
    %if %length(&varname) > &plen %then %do;
      /* Variable names are case-insensitive, so compare in upper case. */
      %if %upcase(%substr(&varname,1,&plen)) = %upcase(&vars) %then %do;
        &new.%substr(&varname,&plen+1) = %sysfunc(tranwrd(&function,#,&varname));
      %end;
    %end;
  %end;

  %let dsid = %sysfunc(close(&dsid));
%mend;

/* Generate the squared and log variables and report how long the */
/* macro code generation took.                                    */
data want;
  set have;
  %let t0 = %sysfunc(datetime());
  %make_transforms(data=have, vars=x_, new=x_sq_, function=#**2)
  %make_transforms(data=have, vars=x_, new=x_log_, function=log(#))
  %put NOTE: codegen elapsed: %sysevalf(%sysfunc(datetime())-&t0);
run;