推力矢量距离计算

thrust vector distance calculation

考虑以下数据集和质心。有 7 个个体和两个均值,每个均具有 8 个维度。它们存储的行主要顺序。

short dim = 8;
float centroids[] = {
    0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612, 
    0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
};   
float data[] = {
    0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
    0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744, 
    0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869, 
    0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769, 
    0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719, 
    0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530, 
    0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114
};

我想计算每个欧氏距离。 c1 - d1, c1 - d2 .... 在 CPU 我会做:

float dist = 0.0, dist_sqrt;
for(int i = 0; i < 2; i++)
    for(int j = 0; j < 7; j++)
    { 
        float dist_sum = 0.0;
        for(int k = 0; k < dim; k++)
        {
            dist = centroids[i * dim + k] - data[j * dim + k];
            dist_sum += dist * dist;
        }
        dist_sqrt = sqrt(dist_sum);
        // do something with the distance
        std::cout << dist_sqrt << std::endl;

    }

THRUST中有没有内置向量距离计算的解决方案?

可以推力完成。解释起来比较费劲,代码也比较密。

开始的关键观察是核心操作可以通过转换后的归约来完成。推力变换操作用于执行向量(个体质心)的元素减法和每个结果的平方,并且减少将结果加在一起以产生欧氏距离的平方。此操作的起点是 thrust::reduce_by_key,但它涉及将数据正确呈现给 reduce_by_key

最后的结果是对上面的每个结果取平方根,我们可以用普通的thrust::transform

以上是完成所有工作的仅有的 2 行推力代码的摘要描述。但是,第一行相当复杂。为了利用并行性,我采用的方法实际上是按顺序 "lay out" 必要的向量,以呈现给 reduce_by_key。举个简单的例子,假设我们有2个质心和4个人,假设我们的维度是2。

centroid 0: C00 C01
centroid 1: C10 C11
individ  0: I00 I01
individ  1: I10 I11
individ  2: I20 I21
individ  3: I30 I31

我们可以 "lay out" 这样的向量:

 C00 C01 C00 C01 C00 C01 C00 C01 C10 C11 C10 C11 C10 C11 C10 C11
 I00 I01 I10 I11 I20 I21 I30 I31 I00 I01 I10 I11 I20 I21 I30 I31

为了方便 reduce_by_key,我们还需要生成键值来描述向量:

   0   0   1   1   0   0   1   1   0   0   1   1   0   0   1   1

以上数据"laid-out"数据集可能比较大,我们不想产生存储和检索成本,所以我们将使用thrust的集合[=47]生成这些"on-the-fly" =] 来完成这项工作。我们将创建一个提供给 transform_iterator 的自定义仿函数,以对 IC 向量进行减法(和平方),为此将它们压缩在一起。向量的 "lay out" 将使用带有额外自定义索引创建仿函数的排列迭代器动态创建,以帮助每个 IC.[=49 中的复制模式=]

因此,从"inside out"开始,步骤顺序如下:

  1. 对于 I (data) 和 C (centr) 使用 counting_iterator 结合自定义索引函子在 transform_iterator 中生成我们需要的索引序列。

  2. 使用在步骤 1 中创建的索引序列和基础 IC 向量,实际上 "lay out" 通过 permutation_iterator (每个布局矢量一个)。

  3. 将 2 "laid out" 虚拟 IC 向量压缩在一起,创建一个 <float, float> 元组向量(虚拟)。

  4. 采用第 3 步中的 zip_iterator,并在 transform_iterator

    [ 中与自定义距离计算函子 ((I-C)^2) 组合=109=]
  5. 使用另一个 transform_iterator,将 counting_iterator 与自定义键生成仿函数结合起来,生成键序列(虚拟)

  6. 将步骤 4 和 5 中的迭代器传递给 reduce_by_key 作为要归约的输入(键、值)。 reduce_by_key 的输出向量也是键和值。我们不需要密钥,所以我们将使用 discard_iterator 来转储它们。我们将保存的值。

以上步骤全部在一行推力代码中完成。

下面是说明上述内容的代码:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/copy.h>
#include <math.h>

#include <time.h>
#include <sys/time.h>
#include <stdlib.h>

#define MAX_DATA 100000000
#define MAX_CENT 5000
#define TOL 0.001

unsigned long long dtime_usec(unsigned long long prev){
#define USECPSEC 1000000ULL
  timeval tv1;
  gettimeofday(&tv1,0);
  return ((tv1.tv_sec * USECPSEC)+tv1.tv_usec) - prev;
}

unsigned verify(float *d1, float *d2, int len){
  unsigned pass = 1;
  for (int i = 0; i < len; i++)
    if (fabsf(d1[i] - d2[i]) > TOL){
      std::cout << "mismatch at:  " << i << " val1: " << d1[i] << " val2: " << d2[i] << std::endl;
      pass = 0;
      break;}
  return pass;
}
void eucl_dist_cpu(const float *centroids, const float *data, float *rdist, int num_centroids, int dim, int num_data, int print){

  int out_idx = 0;
  float dist, dist_sqrt;
  for(int i = 0; i < num_centroids; i++)
    for(int j = 0; j < num_data; j++)
    {
        float dist_sum = 0.0;
        for(int k = 0; k < dim; k++)
        {
            dist = centroids[i * dim + k] - data[j * dim + k];
            dist_sum += dist * dist;
        }
        dist_sqrt = sqrt(dist_sum);
        // do something with the distance
        rdist[out_idx++] = dist_sqrt;
        if (print) std::cout << dist_sqrt << ", ";

    }
    if (print) std::cout << std::endl;
}


struct dkeygen : public thrust::unary_function<int, int>
{
  int dim;
  int numd;

  dkeygen(const int _dim, const int _numd) : dim(_dim), numd(_numd) {};

  __host__ __device__ int operator()(const int val) const {
    return (val/dim);
    }
};

typedef thrust::tuple<float, float> mytuple;
struct my_dist : public thrust::unary_function<mytuple, float>
{
  __host__ __device__ float operator()(const mytuple &my_tuple) const {
    float temp = thrust::get<0>(my_tuple) - thrust::get<1>(my_tuple);
    return temp*temp;
  }
};


struct d_idx : public thrust::unary_function<int, int>
{
  int dim;
  int numd;

  d_idx(int _dim, int _numd) : dim(_dim), numd(_numd) {};

  __host__ __device__ int operator()(const int val) const {
    return (val % (dim*numd));
    }
};

struct c_idx : public thrust::unary_function<int, int>
{
  int dim;
  int numd;

  c_idx(int _dim, int _numd) : dim(_dim), numd(_numd) {};

  __host__ __device__ int operator()(const int val) const {
    return (val % dim) + (dim * (val/(dim*numd)));
    }
};

struct my_sqrt : public thrust::unary_function<float, float>
{
  __host__ __device__ float operator()(const float val) const {
    return sqrtf(val);
  }
};


unsigned long long eucl_dist_thrust(thrust::host_vector<float> &centroids, thrust::host_vector<float> &data, thrust::host_vector<float> &dist, int num_centroids, int dim, int num_data, int print){

  thrust::device_vector<float> d_data = data;
  thrust::device_vector<float> d_centr = centroids;
  thrust::device_vector<float> values_out(num_centroids*num_data);

  unsigned long long compute_time = dtime_usec(0);
  thrust::reduce_by_key(thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), dkeygen(dim, num_data)), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(dim*num_data*num_centroids), dkeygen(dim, num_data)),thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_centr.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), c_idx(dim, num_data))), thrust::make_permutation_iterator(d_data.begin(), thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), d_idx(dim, num_data))))), my_dist()), thrust::make_discard_iterator(), values_out.begin());
  thrust::transform(values_out.begin(), values_out.end(), values_out.begin(), my_sqrt());
  cudaDeviceSynchronize();
  compute_time = dtime_usec(compute_time);

  if (print){
    thrust::copy(values_out.begin(), values_out.end(), std::ostream_iterator<float>(std::cout, ", "));
    std::cout << std::endl;
    }
  thrust::copy(values_out.begin(), values_out.end(), dist.begin());
  return compute_time;
}


int main(int argc, char *argv[]){

  int dim = 8;
  int num_centroids = 2;
  float centroids[] = {
    0.223, 0.002, 0.223, 0.412, 0.334, 0.532, 0.244, 0.612,
    0.742, 0.812, 0.817, 0.353, 0.325, 0.452, 0.837, 0.441
  };
  int num_data = 8;
  float data[] = {
    0.314, 0.504, 0.030, 0.215, 0.647, 0.045, 0.443, 0.325,
    0.731, 0.354, 0.696, 0.604, 0.954, 0.673, 0.625, 0.744,
    0.615, 0.936, 0.045, 0.779, 0.169, 0.589, 0.303, 0.869,
    0.275, 0.406, 0.003, 0.763, 0.471, 0.748, 0.230, 0.769,
    0.903, 0.489, 0.135, 0.599, 0.094, 0.088, 0.272, 0.719,
    0.112, 0.448, 0.809, 0.157, 0.227, 0.978, 0.747, 0.530,
    0.908, 0.121, 0.321, 0.911, 0.884, 0.792, 0.658, 0.114,
    0.721, 0.555, 0.979, 0.412, 0.007, 0.501, 0.844, 0.234
  };
  std::cout << "cpu results: " << std::endl;
  float dist[num_data*num_centroids];
  eucl_dist_cpu(centroids, data, dist, num_centroids, dim, num_data, 1);

  thrust::host_vector<float> h_data(data, data + (sizeof(data)/sizeof(float)));
  thrust::host_vector<float> h_centr(centroids, centroids + (sizeof(centroids)/sizeof(float)));
  thrust::host_vector<float> h_dist(num_centroids*num_data);

  std::cout << "gpu results: " << std::endl;
  eucl_dist_thrust(h_centr, h_data, h_dist, num_centroids, dim, num_data, 1);

  float *data2, *centroids2, *dist2;
  num_centroids = 10;
  num_data = 1000000;

  if (argc > 2) {
    num_centroids = atoi(argv[1]);
    num_data = atoi(argv[2]);
    if ((num_centroids < 1) || (num_centroids > MAX_CENT)) {std::cout << "Num centroids out of range" << std::endl; return 1;}
    if ((num_data < 1) || (num_data > MAX_DATA)) {std::cout << "Num data out of range" << std::endl; return 1;}
    if (num_data * dim * num_centroids > 2000000000) {std::cout << "data set out of range" << std::endl; return 1;}}
  std::cout << "Num Data: " << num_data << std::endl;
  std::cout << "Num Cent: " << num_centroids << std::endl;
  std::cout << "result size: " << ((num_data*num_centroids*4)/1048576) << " Mbytes" << std::endl;
  data2 = new float[dim*num_data];
  centroids2 = new float[dim*num_centroids];
  dist2 = new float[num_data*num_centroids];
  for (int i = 0; i < dim*num_data; i++) data2[i] = rand()/(float)RAND_MAX;
  for (int i = 0; i < dim*num_centroids; i++) centroids2[i] = rand()/(float)RAND_MAX;
  unsigned long long dtime = dtime_usec(0);
  eucl_dist_cpu(centroids2, data2, dist2, num_centroids, dim, num_data, 0);
  dtime = dtime_usec(dtime);
  std::cout << "cpu time: " << dtime/(float)USECPSEC << "s" << std::endl;
  thrust::host_vector<float> h_data2(data2, data2 + (dim*num_data));
  thrust::host_vector<float> h_centr2(centroids2, centroids2 + (dim*num_centroids));
  thrust::host_vector<float> h_dist2(num_data*num_centroids);
  dtime = dtime_usec(0);
  unsigned long long ctime = eucl_dist_thrust(h_centr2, h_data2, h_dist2, num_centroids, dim, num_data, 0);
  dtime = dtime_usec(dtime);
  std::cout << "gpu total time: " << dtime/(float)USECPSEC << "s, gpu compute time: " << ctime/(float)USECPSEC << "s" << std::endl;
  if (!verify(dist2, &(h_dist2[0]), num_data*num_centroids)) {std::cout << "Verification failure." << std::endl; return 1;}
  std::cout << "Success!" << std::endl;

  return 0;

}

备注:

  1. 代码设置为执行 2 次传递,一次使用类似于您的数据集的短传递,并打印输出以供目视检查。然后可以通过命令行大小调整参数(质心数,然后是个体数)输入更大的数据集,以进行基准比较和结果验证。

  2. 与我在评论中所说的相反,推力代码仅 运行 比朴素的单线程 CPU 代码快 25% 左右。您的里程可能会有所不同。

  3. 这只是考虑处理它的一种方式。我有其他的想法,但没有足够的时间来充实它们。

  4. 数据集可能会变得相当大。现在的代码旨在限制 dimension*number_of_centroids*number_of_individuals 的乘积小于 20 亿的数据集。然而,当你接近这个数字时,你将需要一个 GPU 和 CPU,它们都有几 GB 的内存。我简要地探索了更大的数据集大小。需要在不同的地方进行一些代码更改以从例如扩展。 intunsigned long long,等等。但是我没有提供,因为我仍在调查该代码的问题。

  5. 对于在 GPU 上计算欧几里德距离的另一个非推力相关的观察,您可能对 this question 感兴趣。如果您遵循在那里进行的优化顺序,它可能会阐明如何改进此推力代码,或者如何使用另一个非推力实现。

  6. 抱歉,我没能挤出更多的性能。