C 中的嵌套循环展开
Nested Loop Unrolling in C
我想通过循环展开来优化我的代码。我尝试应用循环展开,但我觉得自己没有做对,又找不出问题出在哪里。我想对外层循环进行展开。
这个循环做矩阵的转置。
这是我应用展开循环的循环:
/*
 * Transpose a dim x dim matrix stored as a flat array of dim * dim ints.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (i, j) is read from src[i * dim + j].
 * dst: output matrix; element (j, i) is written to dst[j * dim + i].
 *
 * Nothing is read or written when dim <= 0.
 */
void transpose(int dim, int *src, int *dst) {
    // Loop counters must be declared before use.
    int i, j;

    for (i = 0; i < dim; i++)
        for (j = 0; j < dim; j++)
            dst[j * dim + i] = src[i * dim + j];
}
这是我的展开循环:
/*
 * Transpose a dim x dim matrix stored as a flat array, taking source rows in
 * groups of eight per pass of the outer loop.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (r, c) is read from src[r * dim + c].
 * dst: output matrix; element (c, r) is written to dst[c * dim + r].
 *
 * Every row after the first in a group is bounds-checked before it is copied,
 * so dim does not need to be a multiple of eight.
 */
void transpose(int dim, int *src, int *dst) {
    int row, col, r;

    for (row = 0; row < dim; row += 8) {
        for (col = 0; col < dim; col++) {
            // The first row of the group is in range by the outer loop test.
            dst[col * dim + row] = src[row * dim + col];

            // Rows row+1 .. row+7: move on to the next column as soon as one
            // of them falls outside the matrix.
            r = row + 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];
        }
    }
}
我不确定您当前代码中的错误是什么,但这是另一种方法。
/*
 * Transpose a dim x dim matrix stored as a flat array, with the outer loop
 * unrolled by a factor of eight.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (r, c) is read from src[r * dim + c].
 * dst: output matrix; element (c, r) is written to dst[c * dim + r].
 *
 * Complete groups of eight source rows go through the unrolled loop; the
 * remaining dim % 8 rows are handled by a plain loop afterwards.
 */
void transpose(int dim, int *src, int *dst) {
    int row, col;

    for (row = 0; row <= dim - 8; row += 8) {
        for (col = 0; col < dim; col++) {
            // out walks eight adjacent slots of one output row;
            // in steps down eight consecutive source rows in the same column.
            int *out = dst + col * dim + row;
            const int *in = src + row * dim + col;

            out[0] = in[0 * dim];
            out[1] = in[1 * dim];
            out[2] = in[2 * dim];
            out[3] = in[3 * dim];
            out[4] = in[4 * dim];
            out[5] = in[5 * dim];
            out[6] = in[6 * dim];
            out[7] = in[7 * dim];
        }
    }

    // row now indexes the first source row not covered by a full group.
    for (; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            dst[col * dim + row] = src[row * dim + col];
        }
    }
}
注意:可以通过引入一个变量来减少乘法的次数:
int jdim = j * dim + i;
dst[jdim + 0] = ...
dst[jdim + 1] = ...
...
dst[jdim + 7] = ...
RHS 也是如此。
循环展开的全部目的是去掉条件测试。您没有对 dim 的值做出任何假设,因此您不得不保留所有测试。我不认为展开后的代码会带来任何改进,但只有仔细的基准测试才能告诉您,在给定的编译器和体系结构上是否真的有差别。
有一件事是肯定的:它使代码更难阅读,更容易搞砸。
如果您知道 dim 最常见的取值,可以尝试针对这些值进行优化。例如,如果您知道最常见的情况是 3x3 矩阵,可以这样写:
/*
 * Transpose a dim x dim matrix stored as a flat array, with a fully unrolled
 * fast path for the 3x3 case.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (r, c) is read from src[r * dim + c].
 * dst: output matrix; element (c, r) is written to dst[c * dim + r].
 */
void transpose(int dim, const int *src, int *dst) {
    if (dim != 3) {
        // Generic path for every size other than 3.
        for (int r = 0; r < dim; r++) {
            for (int c = 0; c < dim; c++) {
                dst[c * dim + r] = src[r * dim + c];
            }
        }
        return;
    }

    // 3x3: nine straight-line copies, no loop counters or bounds tests.
    // Each output row gathers one source column (source indices step by 3).
    dst[0] = src[0];
    dst[1] = src[3];
    dst[2] = src[6];

    dst[3] = src[1];
    dst[4] = src[4];
    dst[5] = src[7];

    dst[6] = src[2];
    dst[7] = src[5];
    dst[8] = src[8];
}
现代编译器擅长优化简单的原始代码,并能利用特定硬件的向量化功能。除非您确切知道要优化什么以及何时优化,否则编译器会比您做得更好,而且不会有引入意外错误的风险。
这是展开循环的示例。请注意,目标是删除条件语句和对变量的依赖性。另外,此代码尚未经过测试。
/*
 * Transpose a dim x dim matrix stored as a flat array, with the inner loop
 * unrolled by a factor of four.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (i, j) is read from src[i * dim + j].
 * dst: output matrix; element (j, i) is written to dst[j * dim + i].
 *
 * dst is filled sequentially, one output row per pass of the outer loop,
 * while src is walked down one column at a time.
 * Nothing is read or written when dim <= 0.
 */
void transpose(int dim, int *src, int *dst) {
    // where the data is going; advances by one for every element written
    int dstIndex = 0;
    // where the data is being read; reset to the top of a column per row
    int srcIndex = 0;
    // precalculate constants used within the loop
    int unrolled = dim / 4;   // complete groups of four per output row
    int dimx0 = dim * 0;
    int dimx1 = dim * 1;
    int dimx2 = dim * 2;
    int dimx3 = dim * 3;
    int dimx4 = dim * 4;
    int i = 0;
    int j = 0;

    // since the matrix is being transposed, i,j order doesn't matter as much:
    // one of the matrices is accessed by column and the other by row
    for (j = 0; j < dim; j++) {
        // Start at the top of source column j. Setting this directly avoids
        // recovering it from a running index with a modulo.
        srcIndex = j;

        for (i = 0; i < unrolled; i++) {
            // unrolled body: no statement depends on a previous one and
            // there is no conditional code
            dst[dstIndex + 0] = src[srcIndex + dimx0];
            dst[dstIndex + 1] = src[srcIndex + dimx1];
            dst[dstIndex + 2] = src[srcIndex + dimx2];
            dst[dstIndex + 3] = src[srcIndex + dimx3];
            dstIndex += 4;
            srcIndex += dimx4;
        }

        // The loop above covered the first unrolled * 4 elements of this
        // output row, so the leftovers start there (not at `unrolled`).
        // e.g. for a 13x13 matrix the unrolled loop runs 3 times per row
        // and this loop runs once per row.
        for (i = unrolled * 4; i < dim; i++) {
            dst[dstIndex] = src[srcIndex];
            dstIndex += 1;
            srcIndex += dim;
        }
    }
}
我想通过循环展开来优化我的代码。我尝试应用循环展开,但我觉得自己没有做对,又找不出问题出在哪里。我想对外层循环进行展开。
这个循环做矩阵的转置。
这是我应用展开循环的循环:
/*
 * Transpose a dim x dim matrix stored as a flat array of dim * dim ints.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (i, j) is read from src[i * dim + j].
 * dst: output matrix; element (j, i) is written to dst[j * dim + i].
 *
 * Nothing is read or written when dim <= 0.
 */
void transpose(int dim, int *src, int *dst) {
    // Loop counters must be declared before use.
    int i, j;

    for (i = 0; i < dim; i++)
        for (j = 0; j < dim; j++)
            dst[j * dim + i] = src[i * dim + j];
}
这是我的展开循环:
/*
 * Transpose a dim x dim matrix stored as a flat array, taking source rows in
 * groups of eight per pass of the outer loop.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (r, c) is read from src[r * dim + c].
 * dst: output matrix; element (c, r) is written to dst[c * dim + r].
 *
 * Every row after the first in a group is bounds-checked before it is copied,
 * so dim does not need to be a multiple of eight.
 */
void transpose(int dim, int *src, int *dst) {
    int row, col, r;

    for (row = 0; row < dim; row += 8) {
        for (col = 0; col < dim; col++) {
            // The first row of the group is in range by the outer loop test.
            dst[col * dim + row] = src[row * dim + col];

            // Rows row+1 .. row+7: move on to the next column as soon as one
            // of them falls outside the matrix.
            r = row + 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];

            r += 1;
            if (r >= dim)
                continue;
            dst[col * dim + r] = src[r * dim + col];
        }
    }
}
我不确定您当前代码中的错误是什么,但这是另一种方法。
/*
 * Transpose a dim x dim matrix stored as a flat array, with the outer loop
 * unrolled by a factor of eight.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (r, c) is read from src[r * dim + c].
 * dst: output matrix; element (c, r) is written to dst[c * dim + r].
 *
 * Complete groups of eight source rows go through the unrolled loop; the
 * remaining dim % 8 rows are handled by a plain loop afterwards.
 */
void transpose(int dim, int *src, int *dst) {
    int row, col;

    for (row = 0; row <= dim - 8; row += 8) {
        for (col = 0; col < dim; col++) {
            // out walks eight adjacent slots of one output row;
            // in steps down eight consecutive source rows in the same column.
            int *out = dst + col * dim + row;
            const int *in = src + row * dim + col;

            out[0] = in[0 * dim];
            out[1] = in[1 * dim];
            out[2] = in[2 * dim];
            out[3] = in[3 * dim];
            out[4] = in[4 * dim];
            out[5] = in[5 * dim];
            out[6] = in[6 * dim];
            out[7] = in[7 * dim];
        }
    }

    // row now indexes the first source row not covered by a full group.
    for (; row < dim; row++) {
        for (col = 0; col < dim; col++) {
            dst[col * dim + row] = src[row * dim + col];
        }
    }
}
注意:可以通过引入一个变量来减少乘法的次数:
int jdim = j * dim + i;
dst[jdim + 0] = ...
dst[jdim + 1] = ...
...
dst[jdim + 7] = ...
RHS 也是如此。
循环展开的全部目的是去掉条件测试。您没有对 dim 的值做出任何假设,因此您不得不保留所有测试。我不认为展开后的代码会带来任何改进,但只有仔细的基准测试才能告诉您,在给定的编译器和体系结构上是否真的有差别。
有一件事是肯定的:它使代码更难阅读,更容易搞砸。
如果您知道 dim 最常见的取值,可以尝试针对这些值进行优化。例如,如果您知道最常见的情况是 3x3 矩阵,可以这样写:
/*
 * Transpose a dim x dim matrix stored as a flat array, with a fully unrolled
 * fast path for the 3x3 case.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (r, c) is read from src[r * dim + c].
 * dst: output matrix; element (c, r) is written to dst[c * dim + r].
 */
void transpose(int dim, const int *src, int *dst) {
    if (dim != 3) {
        // Generic path for every size other than 3.
        for (int r = 0; r < dim; r++) {
            for (int c = 0; c < dim; c++) {
                dst[c * dim + r] = src[r * dim + c];
            }
        }
        return;
    }

    // 3x3: nine straight-line copies, no loop counters or bounds tests.
    // Each output row gathers one source column (source indices step by 3).
    dst[0] = src[0];
    dst[1] = src[3];
    dst[2] = src[6];

    dst[3] = src[1];
    dst[4] = src[4];
    dst[5] = src[7];

    dst[6] = src[2];
    dst[7] = src[5];
    dst[8] = src[8];
}
现代编译器擅长优化简单的原始代码,并能利用特定硬件的向量化功能。除非您确切知道要优化什么以及何时优化,否则编译器会比您做得更好,而且不会有引入意外错误的风险。
这是展开循环的示例。请注意,目标是删除条件语句和对变量的依赖性。另外,此代码尚未经过测试。
/*
 * Transpose a dim x dim matrix stored as a flat array, with the inner loop
 * unrolled by a factor of four.
 *
 * dim: number of rows and of columns.
 * src: input matrix; element (i, j) is read from src[i * dim + j].
 * dst: output matrix; element (j, i) is written to dst[j * dim + i].
 *
 * dst is filled sequentially, one output row per pass of the outer loop,
 * while src is walked down one column at a time.
 * Nothing is read or written when dim <= 0.
 */
void transpose(int dim, int *src, int *dst) {
    // where the data is going; advances by one for every element written
    int dstIndex = 0;
    // where the data is being read; reset to the top of a column per row
    int srcIndex = 0;
    // precalculate constants used within the loop
    int unrolled = dim / 4;   // complete groups of four per output row
    int dimx0 = dim * 0;
    int dimx1 = dim * 1;
    int dimx2 = dim * 2;
    int dimx3 = dim * 3;
    int dimx4 = dim * 4;
    int i = 0;
    int j = 0;

    // since the matrix is being transposed, i,j order doesn't matter as much:
    // one of the matrices is accessed by column and the other by row
    for (j = 0; j < dim; j++) {
        // Start at the top of source column j. Setting this directly avoids
        // recovering it from a running index with a modulo.
        srcIndex = j;

        for (i = 0; i < unrolled; i++) {
            // unrolled body: no statement depends on a previous one and
            // there is no conditional code
            dst[dstIndex + 0] = src[srcIndex + dimx0];
            dst[dstIndex + 1] = src[srcIndex + dimx1];
            dst[dstIndex + 2] = src[srcIndex + dimx2];
            dst[dstIndex + 3] = src[srcIndex + dimx3];
            dstIndex += 4;
            srcIndex += dimx4;
        }

        // The loop above covered the first unrolled * 4 elements of this
        // output row, so the leftovers start there (not at `unrolled`).
        // e.g. for a 13x13 matrix the unrolled loop runs 3 times per row
        // and this loop runs once per row.
        for (i = unrolled * 4; i < dim; i++) {
            dst[dstIndex] = src[srcIndex];
            dstIndex += 1;
            srcIndex += dim;
        }
    }
}