使用预取加速随机内存访问

Speed up random memory access using prefetch

我正在尝试使用预取来加速单个程序。我的程序的目的只是为了测试。这是它的作用:

  1. It uses two int buffers of the same size
  2. It reads one-by-one all the values of the first buffer
  3. It reads the value at the index in the second buffer
  4. It sums all the values taken from the second buffer
  5. It does all the previous steps for bigger and bigger
  6. At the end, I print the number of voluntary and involuntary CPU

第一次,第一个缓冲区中的值包含其索引的值(参见下面代码中的函数 createIndexBuffer)。

在我程序的代码中会更清楚:

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>

#define BUFFER_SIZE ((unsigned long) 4096 * 100000)


unsigned int randomUint()
{
  int value = rand() % UINT_MAX;
  return value;
}


unsigned int * createValueBuffer()
{
  unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    valueBuffer[i] = randomUint();
  }

  return (valueBuffer);
}


unsigned int * createIndexBuffer()
{
  unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    indexBuffer[i] = i;
  }

  return (indexBuffer);
}


unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
  unsigned long long sum = 0;

  for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
  {
    unsigned int index = indexBuffer[i];
    sum += valueBuffer[index];
  }

  return (sum);
}


unsigned int computeTimeInMicroSeconds()
{
  unsigned int * valueBuffer = createValueBuffer();
  unsigned int * indexBuffer = createIndexBuffer();

  struct timeval startTime, endTime;
  gettimeofday(&startTime, NULL);

  unsigned long long sum = computeSum(indexBuffer, valueBuffer);

  gettimeofday(&endTime, NULL);

  printf("Sum = %llu\n", sum);
  free(indexBuffer);
  free(valueBuffer);

  return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);

}


int main()
{
  printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
  unsigned int timeInMicroSeconds = computeTimeInMicroSeconds();
  printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
}

如果我启动它,我会得到以下输出:

$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch 
sizeof buffers = 1562Mb
Sum = 439813150288855829
Time: 201172 micro-seconds = 0.201 seconds

快快快!!! 据我所知(我可能是错的),拥有如此快速程序的原因之一是,当我顺序访问我的两个缓冲区时,可以在 CPU 缓存中预取数据。

我们可以让它变得更复杂,以便数据(几乎)预加载到 CPU 缓存中。例如,我们可以只更改 createIndexBuffer 中的函数:

unsigned int * createIndexBuffer()
{
  unsigned int * indexBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    indexBuffer[i] = rand() % BUFFER_SIZE;
  }

  return (indexBuffer);
}

让我们再次尝试该程序:

$ gcc TestPrefetch.c -O3 -o TestPrefetch && ./TestPrefetch 
sizeof buffers = 1562Mb
Sum = 439835307963131237
Time: 3730387 micro-seconds = 3.730 seconds

慢了 18 倍以上!!!

我们现在来解决我的问题。给定新的 createIndexBuffer 函数,我想使用 prefetch

来加速 computeSum 函数
unsigned long long computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer)
{
  unsigned long long sum = 0;

  for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
  {
    __builtin_prefetch((char *) &indexBuffer[i + 1], 0, 0);
    unsigned int index = indexBuffer[i];
    sum += valueBuffer[index];
  }

  return (sum);
}

当然我也必须更改我的 createIndexBuffer 以便它分配一个包含更多元素的缓冲区

我重新启动我的程序:没有更好!由于预取可能比一个 "for" 循环迭代慢,我可能不会在

之前预取一个元素,而是在

之前预取两个元素
    __builtin_prefetch((char *) &indexBuffer[i + 2], 0, 0);

好不了!两个循环迭代? 不是更好吗?三? **我一直试到 50 (!!!) 但我无法提高我的函数的性能 computeSum

我能帮忙理解为什么吗 非常感谢您的帮助

我相信上面的代码是 CPU 自动优化的,没有任何进一步的 space 手动优化。

1.主要问题是indexBuffer是顺序访问的。硬件预取器感知它并自动预取更多值,无需手动调用预取。因此,在迭代 #i 期间,值 indexBuffer[i+1]indexBuffer[i+2]、... 已经在缓存中。 (顺便说一句,不需要在数组末尾添加人工元素:内存访问错误会被预取指令静默忽略)。

您真正需要做的是预取 valueBuffer

__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + 1]], 0, 0);

2. 但是在这种简单的情况下,添加上面的代码行也无济于事。访问内存的成本是数百个周期,而添加指令是~1个周期。您的代码已经将 99% 的时间花在了内存访问上。添加手动预取只会使这一周期更快,而不是更好。

如果您的数学运算更繁重(尝试一下),手动预取将非常有效,例如使用具有大量未优化除法的表达式(每个 20-30 个周期)或调用一些数学函数(log ,罪)。

3. 但即使这样也不能保证有帮助。循环迭代之间的依赖性非常弱,仅通过 sum 变量。这允许 CPU 推测性地执行指令:它可能会同时开始获取 valueBuffer[i+1],同时仍在为 valueBuffer[i].

执行数学运算

对不起。我给你的不是我的代码的正确版本。正确的版本是,你说的:

__builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);

然而,即使是正确的版本,不幸的是也没有更好

然后我调整了我的程序以使用 sin 函数尝试您的建议。

我改编的节目如下:

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#include <math.h>

#define BUFFER_SIZE ((unsigned long) 4096 * 50000)


unsigned int randomUint()
{
  int value = rand() % UINT_MAX;
  return value;
}


unsigned int * createValueBuffer()
{
  unsigned int * valueBuffer = (unsigned int *) malloc(BUFFER_SIZE * sizeof(unsigned int));
  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    valueBuffer[i] = randomUint();
  }

  return (valueBuffer);
}


unsigned int * createIndexBuffer(unsigned short prefetchStep)
{
  unsigned int * indexBuffer = (unsigned int *) malloc((BUFFER_SIZE + prefetchStep) * sizeof(unsigned int));
  for (unsigned long i = 0 ; i < BUFFER_SIZE ; i++)
  {
    indexBuffer[i] = rand() % BUFFER_SIZE;
  }

  return (indexBuffer);
}


double computeSum(unsigned int * indexBuffer, unsigned int * valueBuffer, unsigned short prefetchStep)
{
  double sum = 0;

  for (unsigned int i = 0 ; i < BUFFER_SIZE ; i++)
  {
    __builtin_prefetch((char *) &valueBuffer[indexBuffer[i + prefetchStep]], 0, 0);
    unsigned int index = indexBuffer[i];
    sum += sin(valueBuffer[index]);
  }

  return (sum);
}


unsigned int computeTimeInMicroSeconds(unsigned short prefetchStep)
{
  unsigned int * valueBuffer = createValueBuffer();
  unsigned int * indexBuffer = createIndexBuffer(prefetchStep);

  struct timeval startTime, endTime;
  gettimeofday(&startTime, NULL);

  double sum = computeSum(indexBuffer, valueBuffer, prefetchStep);

  gettimeofday(&endTime, NULL);

  printf("prefetchStep = %d, Sum = %f - ", prefetchStep, sum);
  free(indexBuffer);
  free(valueBuffer);

  return ((endTime.tv_sec - startTime.tv_sec) * 1000 * 1000) + (endTime.tv_usec - startTime.tv_usec);

}


int main()
{
  printf("sizeof buffers = %ldMb\n", BUFFER_SIZE * sizeof(unsigned int) / (1024 * 1024));
  for (unsigned short prefetchStep = 0 ; prefetchStep < 250 ; prefetchStep++)
  {
    unsigned int timeInMicroSeconds = computeTimeInMicroSeconds(prefetchStep);
    printf("Time: %u micro-seconds = %.3f seconds\n", timeInMicroSeconds, (double) timeInMicroSeconds / (1000 * 1000));
  }
}

输出为:

$ gcc TestPrefetch.c -O3 -o TestPrefetch -lm && taskset -c 7 ./TestPrefetch 
sizeof buffers = 781Mb
prefetchStep = 0, Sum = -1107.523504 - Time: 20895326 micro-seconds = 20.895 seconds
prefetchStep = 1, Sum = 13456.262424 - Time: 12706720 micro-seconds = 12.707 seconds
prefetchStep = 2, Sum = -20179.289469 - Time: 12136174 micro-seconds = 12.136 seconds
prefetchStep = 3, Sum = 12068.302534 - Time: 11233803 micro-seconds = 11.234 seconds
prefetchStep = 4, Sum = 21071.238160 - Time: 10855348 micro-seconds = 10.855 seconds
prefetchStep = 5, Sum = -22648.280105 - Time: 10517861 micro-seconds = 10.518 seconds
prefetchStep = 6, Sum = 22665.381676 - Time: 9205809 micro-seconds = 9.206 seconds
prefetchStep = 7, Sum = 2461.741268 - Time: 11391088 micro-seconds = 11.391 seconds
...

所以在这里,效果更好!老实说,我几乎可以肯定它不会更好,因为与内存访问相比,数学函数成本更高。

如果有人能给我更多关于为什么现在更好的信息,我将不胜感激

非常感谢

预取通常会获取完整的缓存行。这是typically 64 bytes。因此,随机示例总是为 4 字节的 int 获取 64 字节。是您实际需要的数据的 16 倍,这非常适合减速 18 倍。因此代码仅受内存吞吐量而不是延迟的限制。