使用预取加速随机内存访问
Speed up random memory access using prefetch
我正在尝试使用预取来加速单个程序。我的程序的目的只是为了测试。这是它的作用:
- It uses two int buffers of the same size
- It reads one-by-one all the values of the first buffer
- It reads the value at the index in the second buffer
- It sums all the values taken from the second buffer
- It does all the previous steps for bigger and bigger
- At the end, I print the number of voluntary and involuntary CPU
第一次,第一个缓冲区中的值包含其索引的值(参见下面代码中的函数 createIndexBuffer
)。
在我程序的代码中会更清楚:
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#define BUFFER_SIZE ((unsigned long) 4096 * 100000)
unsigned int randomUint()
{
int value = rand() % UINT_MAX;
return value;
}
unsigned int * createValueBuffer()
{
unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
valueBuffer[i] = randomUint();
}
return (valueBuffer);
}
unsigned int * createIndexBuffer()
{
unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = i;
}
return (indexBuffer);
}
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
unsigned long long sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
unsigned int index = indexBuffer[i];
sum += valueBuffer[index];
}
return (sum);
}
unsigned int computeTimeInMicroSeconds()
{
unsigned int * valueBuffer = createValueBuffer();
unsigned int * indexBuffer = createIndexBuffer();
struct timeval startTime, endTime;
gettimeofday(&startTime, NULL);
unsigned long long sum = computeSum(indexBuffer, valueBuffer);
gettimeofday(&endTime, NULL);
printf("Sum = %llu\n", sum);
free(indexBuffer);
free(valueBuffer);
return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);
}
int main()
{
printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
unsigned int timeInMicroSeconds = computeTimeInMicroSeconds();
printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}
如果我启动它,我会得到以下输出:
$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch
sizeof buffers = 1562Mb
Sum = 439813150288855829
Time: 201172 micro-seconds = 0.201 seconds
快快快!!!
据我所知(我可能是错的),拥有如此快速程序的原因之一是,当我顺序访问我的两个缓冲区时,可以在 CPU 缓存中预取数据。
我们可以让它变得更复杂,以便数据(几乎)预加载到 CPU 缓存中。例如,我们可以只更改 createIndexBuffer
中的函数:
unsigned int * createIndexBuffer()
{
unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = rand() % BUFFER_SIZE;
}
return (indexBuffer);
}
让我们再次尝试该程序:
$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch
sizeof buffers = 1562Mb
Sum = 439835307963131237
Time: 3730387 micro-seconds = 3.730 seconds
慢了 18 倍以上!!!
我们现在来解决我的问题。给定新的 createIndexBuffer
函数,我想使用 prefetch
来加速 computeSum
函数
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
unsigned long long sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
__builtin_prefetch((char *) &indexBuffer[i + 1], 0, 0);
unsigned int index = indexBuffer[i];
sum += valueBuffer[index];
}
return (sum);
}
当然我也必须更改我的 createIndexBuffer
以便它分配一个包含更多元素的缓冲区
我重新启动我的程序:没有更好!由于预取可能比一个 "for" 循环迭代慢,我可能不会在
之前预取一个元素,而是在 之前预取两个元素
__builtin_prefetch((char *) &indexBuffer[i + 2], 0, 0);
好不了!两个循环迭代? 不是更好吗?三? **我一直试到 50 (!!!) 但我无法提高我的函数的性能 computeSum
。
我能帮忙理解为什么吗
非常感谢您的帮助
我相信上面的代码是 CPU 自动优化的,没有任何进一步的 space 手动优化。
1.主要问题是indexBuffer
是顺序访问的。硬件预取器感知它并自动预取更多值,无需手动调用预取。因此,在迭代 #i 期间,值 indexBuffer[i+1]
、indexBuffer[i+2]
、... 已经在缓存中。 (顺便说一句,不需要在数组末尾添加人工元素:内存访问错误会被预取指令静默忽略)。
您真正需要做的是预取 valueBuffer
:
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + 1]], 0, 0);
2. 但是在这种简单的情况下,添加上面的代码行也无济于事。访问内存的成本是数百个周期,而添加指令是~1个周期。您的代码已经将 99% 的时间花在了内存访问上。添加手动预取只会使这一周期更快,而不是更好。
如果您的数学运算更繁重(尝试一下),手动预取将非常有效,例如使用具有大量未优化除法的表达式(每个 20-30 个周期)或调用一些数学函数(log ,罪)。
3. 但即使这样也不能保证有帮助。循环迭代之间的依赖性非常弱,仅通过 sum
变量。这允许 CPU 推测性地执行指令:它可能会同时开始获取 valueBuffer[i+1]
,同时仍在为 valueBuffer[i]
.
执行数学运算
对不起。我给你的不是我的代码的正确版本。正确的版本是,你说的:
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);
然而,即使是正确的版本,不幸的是也没有更好
然后我调整了我的程序以使用 sin
函数尝试您的建议。
我改编的节目如下:
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#include <math.h>
#define BUFFER_SIZE ((unsigned long) 4096 * 50000)
unsigned int randomUint()
{
int value = rand() % UINT_MAX;
return value;
}
unsigned int * createValueBuffer()
{
unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
valueBuffer[i] = randomUint();
}
return (valueBuffer);
}
unsigned int * createIndexBuffer(unsigned short prefetchStep)
{
unsigned int * indexBuffer = (unsigned int *) malloc((BUFFER_SIZE + prefetchStep) * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = rand() % BUFFER_SIZE;
}
return (indexBuffer);
}
double computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer, unsigned short prefetchStep)
{
double sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);
unsigned int index = indexBuffer[i];
sum += sin(valueBuffer[index]);
}
return (sum);
}
unsigned int computeTimeInMicroSeconds(unsigned short prefetchStep)
{
unsigned int * valueBuffer = createValueBuffer();
unsigned int * indexBuffer = createIndexBuffer(prefetchStep);
struct timeval startTime, endTime;
gettimeofday(&startTime, NULL);
double sum = computeSum(indexBuffer, valueBuffer, prefetchStep);
gettimeofday(&endTime, NULL);
printf("prefetchStep = %d, Sum = %f - ", prefetchStep, sum);
free(indexBuffer);
free(valueBuffer);
return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);
}
int main()
{
printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
for (unsigned short prefetchStep = 0 ; prefetchStep < 250 ; prefetchStep++)
{
unsigned int timeInMicroSeconds = computeTimeInMicroSeconds(prefetchStep);
printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}
}
输出为:
$ gcc TestPrefetch.c -O3 -o TestPrefetch -lm && taskset -c 7 ./TestPrefetch
sizeof buffers = 781Mb
prefetchStep = 0, Sum = -1107.523504 - Time: 20895326 micro-seconds = 20.895 seconds
prefetchStep = 1, Sum = 13456.262424 - Time: 12706720 micro-seconds = 12.707 seconds
prefetchStep = 2, Sum = -20179.289469 - Time: 12136174 micro-seconds = 12.136 seconds
prefetchStep = 3, Sum = 12068.302534 - Time: 11233803 micro-seconds = 11.234 seconds
prefetchStep = 4, Sum = 21071.238160 - Time: 10855348 micro-seconds = 10.855 seconds
prefetchStep = 5, Sum = -22648.280105 - Time: 10517861 micro-seconds = 10.518 seconds
prefetchStep = 6, Sum = 22665.381676 - Time: 9205809 micro-seconds = 9.206 seconds
prefetchStep = 7, Sum = 2461.741268 - Time: 11391088 micro-seconds = 11.391 seconds
...
所以在这里,效果更好!老实说,我几乎可以肯定它不会更好,因为与内存访问相比,数学函数成本更高。
如果有人能给我更多关于为什么现在更好的信息,我将不胜感激
非常感谢
预取通常会获取完整的缓存行。这是typically 64 bytes。因此,随机示例总是为 4 字节的 int 获取 64 字节。是您实际需要的数据的 16 倍,这非常适合减速 18 倍。因此代码仅受内存吞吐量而不是延迟的限制。
我正在尝试使用预取来加速单个程序。我的程序的目的只是为了测试。这是它的作用:
- It uses two int buffers of the same size
- It reads one-by-one all the values of the first buffer
- It reads the value at the index in the second buffer
- It sums all the values taken from the second buffer
- It does all the previous steps for bigger and bigger
- At the end, I print the number of voluntary and involuntary CPU
第一次,第一个缓冲区中的值包含其索引的值(参见下面代码中的函数 createIndexBuffer
)。
在我程序的代码中会更清楚:
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#define BUFFER_SIZE ((unsigned long) 4096 * 100000)
unsigned int randomUint()
{
int value = rand() % UINT_MAX;
return value;
}
unsigned int * createValueBuffer()
{
unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
valueBuffer[i] = randomUint();
}
return (valueBuffer);
}
unsigned int * createIndexBuffer()
{
unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = i;
}
return (indexBuffer);
}
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
unsigned long long sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
unsigned int index = indexBuffer[i];
sum += valueBuffer[index];
}
return (sum);
}
unsigned int computeTimeInMicroSeconds()
{
unsigned int * valueBuffer = createValueBuffer();
unsigned int * indexBuffer = createIndexBuffer();
struct timeval startTime, endTime;
gettimeofday(&startTime, NULL);
unsigned long long sum = computeSum(indexBuffer, valueBuffer);
gettimeofday(&endTime, NULL);
printf("Sum = %llu\n", sum);
free(indexBuffer);
free(valueBuffer);
return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);
}
int main()
{
printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
unsigned int timeInMicroSeconds = computeTimeInMicroSeconds();
printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}
如果我启动它,我会得到以下输出:
$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch
sizeof buffers = 1562Mb
Sum = 439813150288855829
Time: 201172 micro-seconds = 0.201 seconds
快快快!!! 据我所知(我可能是错的),拥有如此快速程序的原因之一是,当我顺序访问我的两个缓冲区时,可以在 CPU 缓存中预取数据。
我们可以让它变得更复杂,以便数据(几乎)预加载到 CPU 缓存中。例如,我们可以只更改 createIndexBuffer
中的函数:
unsigned int * createIndexBuffer()
{
unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = rand() % BUFFER_SIZE;
}
return (indexBuffer);
}
让我们再次尝试该程序:
$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch
sizeof buffers = 1562Mb
Sum = 439835307963131237
Time: 3730387 micro-seconds = 3.730 seconds
慢了 18 倍以上!!!
我们现在来解决我的问题。给定新的 createIndexBuffer
函数,我想使用 prefetch
computeSum
函数
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
unsigned long long sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
__builtin_prefetch((char *) &indexBuffer[i + 1], 0, 0);
unsigned int index = indexBuffer[i];
sum += valueBuffer[index];
}
return (sum);
}
当然我也必须更改我的 createIndexBuffer
以便它分配一个包含更多元素的缓冲区
我重新启动我的程序:没有更好!由于预取可能比一个 "for" 循环迭代慢,我可能不会在
之前预取一个元素,而是在 之前预取两个元素 __builtin_prefetch((char *) &indexBuffer[i + 2], 0, 0);
好不了!两个循环迭代? 不是更好吗?三? **我一直试到 50 (!!!) 但我无法提高我的函数的性能 computeSum
。
我能帮忙理解为什么吗 非常感谢您的帮助
我相信上面的代码是 CPU 自动优化的,没有任何进一步的 space 手动优化。
1.主要问题是indexBuffer
是顺序访问的。硬件预取器感知它并自动预取更多值,无需手动调用预取。因此,在迭代 #i 期间,值 indexBuffer[i+1]
、indexBuffer[i+2]
、... 已经在缓存中。 (顺便说一句,不需要在数组末尾添加人工元素:内存访问错误会被预取指令静默忽略)。
您真正需要做的是预取 valueBuffer
:
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + 1]], 0, 0);
2. 但是在这种简单的情况下,添加上面的代码行也无济于事。访问内存的成本是数百个周期,而添加指令是~1个周期。您的代码已经将 99% 的时间花在了内存访问上。添加手动预取只会使这一周期更快,而不是更好。
如果您的数学运算更繁重(尝试一下),手动预取将非常有效,例如使用具有大量未优化除法的表达式(每个 20-30 个周期)或调用一些数学函数(log ,罪)。
3. 但即使这样也不能保证有帮助。循环迭代之间的依赖性非常弱,仅通过 sum
变量。这允许 CPU 推测性地执行指令:它可能会同时开始获取 valueBuffer[i+1]
,同时仍在为 valueBuffer[i]
.
对不起。我给你的不是我的代码的正确版本。正确的版本是,你说的:
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);
然而,即使是正确的版本,不幸的是也没有更好
然后我调整了我的程序以使用 sin
函数尝试您的建议。
我改编的节目如下:
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#include <math.h>
#define BUFFER_SIZE ((unsigned long) 4096 * 50000)
unsigned int randomUint()
{
int value = rand() % UINT_MAX;
return value;
}
unsigned int * createValueBuffer()
{
unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
valueBuffer[i] = randomUint();
}
return (valueBuffer);
}
unsigned int * createIndexBuffer(unsigned short prefetchStep)
{
unsigned int * indexBuffer = (unsigned int *) malloc((BUFFER_SIZE + prefetchStep) * sizeof(unsigned int));
for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
{
indexBuffer[i] = rand() % BUFFER_SIZE;
}
return (indexBuffer);
}
double computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer, unsigned short prefetchStep)
{
double sum = 0;
for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
{
__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);
unsigned int index = indexBuffer[i];
sum += sin(valueBuffer[index]);
}
return (sum);
}
unsigned int computeTimeInMicroSeconds(unsigned short prefetchStep)
{
unsigned int * valueBuffer = createValueBuffer();
unsigned int * indexBuffer = createIndexBuffer(prefetchStep);
struct timeval startTime, endTime;
gettimeofday(&startTime, NULL);
double sum = computeSum(indexBuffer, valueBuffer, prefetchStep);
gettimeofday(&endTime, NULL);
printf("prefetchStep = %d, Sum = %f - ", prefetchStep, sum);
free(indexBuffer);
free(valueBuffer);
return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);
}
int main()
{
printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
for (unsigned short prefetchStep = 0 ; prefetchStep < 250 ; prefetchStep++)
{
unsigned int timeInMicroSeconds = computeTimeInMicroSeconds(prefetchStep);
printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}
}
输出为:
$ gcc TestPrefetch.c -O3 -o TestPrefetch -lm && taskset -c 7 ./TestPrefetch
sizeof buffers = 781Mb
prefetchStep = 0, Sum = -1107.523504 - Time: 20895326 micro-seconds = 20.895 seconds
prefetchStep = 1, Sum = 13456.262424 - Time: 12706720 micro-seconds = 12.707 seconds
prefetchStep = 2, Sum = -20179.289469 - Time: 12136174 micro-seconds = 12.136 seconds
prefetchStep = 3, Sum = 12068.302534 - Time: 11233803 micro-seconds = 11.234 seconds
prefetchStep = 4, Sum = 21071.238160 - Time: 10855348 micro-seconds = 10.855 seconds
prefetchStep = 5, Sum = -22648.280105 - Time: 10517861 micro-seconds = 10.518 seconds
prefetchStep = 6, Sum = 22665.381676 - Time: 9205809 micro-seconds = 9.206 seconds
prefetchStep = 7, Sum = 2461.741268 - Time: 11391088 micro-seconds = 11.391 seconds
...
所以在这里,效果更好!老实说,我几乎可以肯定它不会更好,因为与内存访问相比,数学函数成本更高。
如果有人能给我更多关于为什么现在更好的信息,我将不胜感激
非常感谢
预取通常会获取完整的缓存行。这是typically 64 bytes。因此,随机示例总是为 4 字节的 int 获取 64 字节。是您实际需要的数据的 16 倍,这非常适合减速 18 倍。因此代码仅受内存吞吐量而不是延迟的限制。