SAS:提高交叉连接的效率
SAS: Improve efficiency of a cross join
在我的项目中,我结合了三个不同的输入源来生成一个综合评分(integrated score)。公式如下
Integrated score = weight_1 * Score_1 + weight_2 * Score_2 + weight_3 * Score_3
因此,为此,我使用了以下代码
DATA w_matrix_t;
    /* Enumerate every integer weight triple (n_1, n_2, n_3) in 0-100
       that sums to exactly 100, numbering each row with
       model_combination (1, 2, 3, ...). */
    model_combination = 0;
    DO n_1 = 0 TO 100 BY 1;
        /* Bounding the inner loop at 100 - n_1 guarantees
           n_3 = 100 - n_1 - n_2 lies in [0, 100], so the original
           IF filter is unnecessary and the iterations that could
           never produce a row are skipped entirely. */
        DO n_2 = 0 TO 100 - n_1 BY 1;
            n_3 = 100 - n_1 - n_2;
            model_combination + 1; /* sum statement: implicitly retained */
            OUTPUT;
        END;
    END;
RUN;
DATA w_matrix;
    /* Rescale the integer percentages n_1-n_3 into fractional
       weights w_1-w_3 (0.00 to 1.00). */
    SET w_matrix_t;
    ARRAY pct{3} n_1-n_3;
    ARRAY wgt{3} w_1-w_3;
    DO k = 1 TO 3;
        wgt{k} = pct{k} / 100;
    END;
    /* Keep only model_combination and the fractional weights. */
    DROP n_1 n_2 n_3 k;
RUN;
PROC SQL;
    /* Cartesian product: replicate every baseball observation for
       every weight combination, computing the integrated score in
       the same pass. Note the ORDER BY forces a sort of the full
       cross product, which dominates the cost on large inputs. */
    CREATE TABLE weights_added AS
    SELECT
        wm.model_combination
        , wm.w_1
        , wm.w_2
        , wm.w_3
        , bb.name
        , bb.logsalary
        , wm.w_1 * bb.crhits
          + wm.w_2 * bb.natbat
          + wm.w_3 * bb.nbb AS y_hat_int
    FROM work.w_matrix AS wm
    CROSS JOIN sashelp.baseball AS bb
    ORDER BY model_combination;
QUIT;
我的问题是,是否有更有效的方法来进行此连接?目的是创建一个大型 table,其中包含为所有权重组合复制的整个 sashelp.baseball 数据集。
在我的实际数据中,我有三个输入源,每个源有 46,000 个观察值,交叉连接需要 1 小时。我还有另外三个输入源,每个源有 465,000 个观察值,我想那会花更长的时间。
我这样做的原因是因为我使用 Proc freq 和按组处理(按模型组合)计算我的 Somers' D
一张由 500,000 行数据复制 5,000 份构成的表会相当庞大,约有 25 亿行。
下面是一个数据步(DATA step)堆叠的示例:它为 weights 数据集的每一行堆叠一份完整的 have 数据集。该示例通过隐式循环的 SET weights 处理每个权重组合,并在显式内循环中使用 SET have POINT= 配合 OUTPUT。内循环在计算加权和的同时完成数据的复制。
/* Demonstration extract: the first 200 observations of sashelp.baseball. */
data have;
set sashelp.baseball (obs=200); * keep it small for demonstration;
run;
data weights (keep=comboId w1 w2 w3);
  /* Enumerate every weight triple (w1, w2, w3) in hundredth steps
     with w1 + w2 + w3 = 1, tagging each combination with comboId. */
  do i = 0 to 100;
    /* Bounding j at 100 - i makes the original (i+j) <= 100 test
       unnecessary and skips the iterations that produced no output. */
    do j = 0 to 100 - i;
      comboId + 1; /* sum statement: implicitly retained */
      w1 = i / 100;
      w2 = j / 100;
      w3 = (100 - i - j) / 100;
      output;
    end;
  end;
run;
/* Stack one copy of HAVE for every row of WEIGHTS, avoiding the SQL
   cross join. The outer DO WHILE reads WEIGHTS sequentially (END=
   flags its last row); for each weight row the inner loop re-reads
   all of HAVE by direct access (POINT=, with NOBS= giving the row
   count at compile time), computes the weighted score, and OUTPUTs
   one row per (weight combination x player). Output is therefore
   already grouped by comboId. STOP is required: POINT= access never
   raises the end-of-file condition that normally ends a data step. */
data want (keep=comboid w1-w3 name logsalary y_hat_int);
do while (not endOfWeights);
set weights end = endOfWeights;
do row = 1 to RowsInHave;
set have (keep=name logsalary crhits natbat nbb) nobs = RowsInHave point = row;
* integrated score for this weight combination;
y_hat_int = w1 * crhits + w2 * natbat + w3 * nbb;
output;
end;
end;
stop;
run;
/* BY-group frequency tables of the integrated score. WANT is already
   grouped by comboId (each weight combination's rows are contiguous),
   so no PROC SORT is needed. The 4. format bins y_hat_int to whole
   numbers for the frequency tables. */
proc freq data=want noprint;
by comboId;
table y_hat_int / out=freqout ;
format y_hat_int 4.;
run;
/* Inspect the size and structure of the stacked table. */
proc contents data=want;
run;
粗略估算:由 200 行棒球数据摘录复制 5,151 份构成的单个表约为 72.7MB,由此推算,465K 行复制 5,151 份构成的表约有 24 亿行、占用约 170GB 磁盘空间。在 7200 转的机械硬盘上,即使以最佳性能写入,整个过程也至少要等待 20 分钟,很可能更久。
在我的项目中,我结合了三个不同的输入源来生成一个综合评分(integrated score)。公式如下
Integrated score = weight_1 * Score_1 + weight_2 * Score_2 + weight_3 * Score_3
因此,为此,我使用了以下代码
DATA w_matrix_t;
    /* Enumerate every integer weight triple (n_1, n_2, n_3) in 0-100
       that sums to exactly 100, numbering each row with
       model_combination (1, 2, 3, ...). */
    model_combination = 0;
    DO n_1 = 0 TO 100 BY 1;
        /* Bounding the inner loop at 100 - n_1 guarantees
           n_3 = 100 - n_1 - n_2 lies in [0, 100], so the original
           IF filter is unnecessary and the iterations that could
           never produce a row are skipped entirely. */
        DO n_2 = 0 TO 100 - n_1 BY 1;
            n_3 = 100 - n_1 - n_2;
            model_combination + 1; /* sum statement: implicitly retained */
            OUTPUT;
        END;
    END;
RUN;
DATA w_matrix;
    /* Rescale the integer percentages n_1-n_3 into fractional
       weights w_1-w_3 (0.00 to 1.00). */
    SET w_matrix_t;
    ARRAY pct{3} n_1-n_3;
    ARRAY wgt{3} w_1-w_3;
    DO k = 1 TO 3;
        wgt{k} = pct{k} / 100;
    END;
    /* Keep only model_combination and the fractional weights. */
    DROP n_1 n_2 n_3 k;
RUN;
PROC SQL;
    /* Cartesian product: replicate every baseball observation for
       every weight combination, computing the integrated score in
       the same pass. Note the ORDER BY forces a sort of the full
       cross product, which dominates the cost on large inputs. */
    CREATE TABLE weights_added AS
    SELECT
        wm.model_combination
        , wm.w_1
        , wm.w_2
        , wm.w_3
        , bb.name
        , bb.logsalary
        , wm.w_1 * bb.crhits
          + wm.w_2 * bb.natbat
          + wm.w_3 * bb.nbb AS y_hat_int
    FROM work.w_matrix AS wm
    CROSS JOIN sashelp.baseball AS bb
    ORDER BY model_combination;
QUIT;
我的问题是,是否有更有效的方法来进行此连接?目的是创建一个大型 table,其中包含为所有权重组合复制的整个 sashelp.baseball 数据集。
在我的实际数据中,我有三个输入源,每个源有 46,000 个观察值,交叉连接需要 1 小时。我还有另外三个输入源,每个源有 465,000 个观察值,我想那会花更长的时间。
我这样做的原因是因为我使用 Proc freq 和按组处理(按模型组合)计算我的 Somers' D
一张由 500,000 行数据复制 5,000 份构成的表会相当庞大,约有 25 亿行。
下面是一个数据步(DATA step)堆叠的示例:它为 weights 数据集的每一行堆叠一份完整的 have 数据集。该示例通过隐式循环的 SET weights 处理每个权重组合,并在显式内循环中使用 SET have POINT= 配合 OUTPUT。内循环在计算加权和的同时完成数据的复制。
/* Demonstration extract: the first 200 observations of sashelp.baseball. */
data have;
set sashelp.baseball (obs=200); * keep it small for demonstration;
run;
data weights (keep=comboId w1 w2 w3);
  /* Enumerate every weight triple (w1, w2, w3) in hundredth steps
     with w1 + w2 + w3 = 1, tagging each combination with comboId. */
  do i = 0 to 100;
    /* Bounding j at 100 - i makes the original (i+j) <= 100 test
       unnecessary and skips the iterations that produced no output. */
    do j = 0 to 100 - i;
      comboId + 1; /* sum statement: implicitly retained */
      w1 = i / 100;
      w2 = j / 100;
      w3 = (100 - i - j) / 100;
      output;
    end;
  end;
run;
/* Stack one copy of HAVE for every row of WEIGHTS, avoiding the SQL
   cross join. The outer DO WHILE reads WEIGHTS sequentially (END=
   flags its last row); for each weight row the inner loop re-reads
   all of HAVE by direct access (POINT=, with NOBS= giving the row
   count at compile time), computes the weighted score, and OUTPUTs
   one row per (weight combination x player). Output is therefore
   already grouped by comboId. STOP is required: POINT= access never
   raises the end-of-file condition that normally ends a data step. */
data want (keep=comboid w1-w3 name logsalary y_hat_int);
do while (not endOfWeights);
set weights end = endOfWeights;
do row = 1 to RowsInHave;
set have (keep=name logsalary crhits natbat nbb) nobs = RowsInHave point = row;
* integrated score for this weight combination;
y_hat_int = w1 * crhits + w2 * natbat + w3 * nbb;
output;
end;
end;
stop;
run;
/* BY-group frequency tables of the integrated score. WANT is already
   grouped by comboId (each weight combination's rows are contiguous),
   so no PROC SORT is needed. The 4. format bins y_hat_int to whole
   numbers for the frequency tables. */
proc freq data=want noprint;
by comboId;
table y_hat_int / out=freqout ;
format y_hat_int 4.;
run;
/* Inspect the size and structure of the stacked table. */
proc contents data=want;
run;
粗略估算:由 200 行棒球数据摘录复制 5,151 份构成的单个表约为 72.7MB,由此推算,465K 行复制 5,151 份构成的表约有 24 亿行、占用约 170GB 磁盘空间。在 7200 转的机械硬盘上,即使以最佳性能写入,整个过程也至少要等待 20 分钟,很可能更久。