使用 SSE 和 AVX 查找矩阵中的最大元素及其列和行索引
Find largest element in matrix and its column and row indexes using SSE and AVX
我需要找到一维矩阵中的最大元素及其列索引和行索引。
我用的是一维矩阵,所以只需要先找到最大元素的索引,然后很容易得到行和列。
我的问题是我无法获取该索引。
我有一个找到最大元素并使用 SSE 的工作函数,这里是:
float find_largest_element_in_matrix_SSE(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m128 max_el = _mm_loadu_ps(m);
__m128 curr;
for (i = 4; i < dims * dims; i += 4)
{
curr = _mm_loadu_ps(m + i);
max_el = _mm_max_ps(max_el, curr);
}
__declspec(align(16))float max_v[4] = { 0 };
_mm_store_ps(max_v, max_el);
return max(max(max(max_v[0], max_v[1]), max_v[2]), max_v[3]);
}
而且我还有一个使用 AVX 的非工作函数:
float find_largest_element_in_matrix_AVX(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m256 max_el = _mm256_loadu_ps(m);
__m256 curr;
for (i = 8; i < dims * dims; i += 8)
{
curr = _mm256_loadu_ps(m + i);
max_el = _mm256_max_ps(max_el, curr);
}
__declspec(align(32))float max_v[8] = { 0 };
_mm256_store_ps(max_v, max_el);
__m256 y = _mm256_permute2f128_ps(max_el, max_el, 1);
__m256 m1 = _mm256_max_ps(max_el, y);m1[1] = max(max_el[1], max_el[3])
__m256 m2 = _mm256_permute_ps(m1, 5);
__m256 m_res = _mm256_max_ps(m1, m2);
return m[0];
}
谁能帮我找到最大元素的索引并让我的 AVX 版本正常工作?
这是一个有效的 SSE (SSE 4) 实现,returns 最大值和相应的索引,以及标量参考实现和测试工具:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#include <smmintrin.h> // SSE 4.1
float find_largest_element_in_matrix_ref(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
int i;
*maxIndex = 0;
for (i = 1; i < dims * dims; ++i)
{
if (m[i] > maxVal)
{
maxVal = m[i];
*maxIndex = i;
}
}
return maxVal;
}
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
float aMaxVal[4];
int32_t aMaxIndex[4];
int i;
*maxIndex = 0;
const __m128i vIndexInc = _mm_set1_epi32(4);
__m128i vMaxIndex = _mm_setr_epi32(0, 1, 2, 3);
__m128i vIndex = vMaxIndex;
__m128 vMaxVal = _mm_loadu_ps(m);
for (i = 4; i < dims * dims; i += 4)
{
__m128 v = _mm_loadu_ps(&m[i]);
__m128 vcmp = _mm_cmpgt_ps(v, vMaxVal);
vIndex = _mm_add_epi32(vIndex, vIndexInc);
vMaxVal = _mm_max_ps(vMaxVal, v);
vMaxIndex = _mm_blendv_epi8(vMaxIndex, vIndex, _mm_castps_si128(vcmp));
}
_mm_storeu_ps(aMaxVal, vMaxVal);
_mm_storeu_si128((__m128i *)aMaxIndex, vMaxIndex);
maxVal = aMaxVal[0];
*maxIndex = aMaxIndex[0];
for (i = 1; i < 4; ++i)
{
if (aMaxVal[i] > maxVal)
{
maxVal = aMaxVal[i];
*maxIndex = aMaxIndex[i];
}
}
return maxVal;
}
int main()
{
const int dims = 1024;
float m[dims * dims];
float maxVal_ref, maxVal_SSE;
int maxIndex_ref = -1, maxIndex_SSE = -1;
int i;
srand(time(NULL));
for (i = 0; i < dims * dims; ++i)
{
m[i] = (float)rand() / RAND_MAX;
}
maxVal_ref = find_largest_element_in_matrix_ref(m, dims, &maxIndex_ref);
maxVal_SSE = find_largest_element_in_matrix_SSE(m, dims, &maxIndex_SSE);
if (maxVal_ref == maxVal_SSE && maxIndex_ref == maxIndex_SSE)
{
printf("PASS: maxVal = %f, maxIndex = %d\n",
maxVal_ref, maxIndex_ref);
}
else
{
printf("FAIL: maxVal_ref = %f, maxVal_SSE = %f, maxIndex_ref = %d, maxIndex_SSE = %d\n",
maxVal_ref, maxVal_SSE, maxIndex_ref, maxIndex_SSE);
}
return 0;
}
编译并运行:
$ gcc -Wall -msse4 Yakovenko.c && ./a.out
PASS: maxVal = 0.999999, maxIndex = 120409
显然,如果需要,您可以获得行和列索引:
int rowIndex = maxIndex / dims;
int colIndex = maxIndex % dims;
从这里开始,编写 AVX2 实现应该相当简单。
一种方法是在第一遍中计算最大值,并在第二遍中通过线性搜索找到索引。这是 SSE2 中的示例实现:
#define anybit __builtin_ctz //or lookup table with 16 entries...
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex) {
//first pass: calculate maximum as usual
__m128 vMaxVal = _mm_loadu_ps(m);
for (int i = 4; i < dims * dims; i += 4)
vMaxVal = _mm_max_ps(vMaxVal, _mm_loadu_ps(&m[i]));
//perform in-register reduction
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
//second pass: search for maximal value
for (int i = 0; i < dims * dims; i += 4) {
__m128 vIsMax = _mm_cmpeq_ps(vMaxVal, _mm_loadu_ps(&m[i]));
if (int mask = _mm_movemask_ps(vIsMax)) {
*maxIndex = i + anybit(mask);
return _mm_cvtss_f32(vMaxVal);
}
}
}
请注意,除非您的输入数据非常小,否则应该几乎完美地预测第二个循环中的分支。
该解决方案存在几个问题,特别是:
在出现奇怪的浮点值时可能无法正常工作,例如NaNs.
如果您的矩阵不适合 CPU 缓存,那么代码将从主内存读取矩阵两次,因此它会比单遍方法慢两倍.这可以通过块处理来解决大型矩阵。
- 在第一个循环中,每次迭代都依赖于前一次迭代(
vMaxVal
既被修改又被读取),因此它会因 _mm_max_ps
的延迟而变慢。也许将第一个循环展开一点(2x 或 4x)会很好,同时为 vMaxVal
设置 4 个独立的寄存器(实际上,第二个循环也将从展开中受益)。
移植到 AVX 应该非常简单,除了寄存器内减少:
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_permute2f128_ps(vMaxVal, vMaxVal, 1));
另一种方法:
void find_largest_element_in_matrix_SSE(float * matrix, size_t n, int * row, int * column, float * v){
__m128 indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 update = _mm_setr_ps(4, 4, 4, 4);
__m128 max_indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 max = _mm_load_ps(matrix);
for (int i = 4; i < n * n; i+=4){
indecies = _mm_add_ps(indecies, update);
__m128 pm2 = _mm_load_ps(&matrix[i]);
__m128 mask = _mm_cmpge_ps(max, pm2);
max = _mm_max_ps(max, pm2);
max_indecies = _mm_or_ps(_mm_and_ps(max_indecies, mask), _mm_andnot_ps(mask, indecies));
}
__declspec (align(16)) int max_ind[4];
__m128i maxi = _mm_cvtps_epi32(max_indecies);
_mm_store_si128((__m128i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 4; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}
void find_largest_element_in_matrix_AVX(float * matrix, size_t n, int * row, int * column, float * v){
__m256 indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 update = _mm256_setr_ps(8, 8, 8, 8, 8, 8, 8, 8);
__m256 max_indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 max = _mm256_load_ps(matrix);
for (int i = 8; i < n * n; i += 8){
indecies = _mm256_add_ps(indecies, update);
__m256 pm2 = _mm256_load_ps(&matrix[i]);
__m256 mask = _mm256_cmp_ps(max, pm2, _CMP_GE_OQ);
max = _mm256_max_ps(max, pm2);
max_indecies = _mm256_or_ps(_mm256_and_ps(max_indecies, mask), _mm256_andnot_ps(mask, indecies));
}
__declspec (align(32)) int max_ind[8];
__m256i maxi = _mm256_cvtps_epi32(max_indecies);
_mm256_store_si256((__m256i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 8; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}
我需要找到一维矩阵中的最大元素及其列索引和行索引。
我用的是一维矩阵,所以只需要先找到最大元素的索引,然后很容易得到行和列。
我的问题是我无法获取该索引。
我有一个找到最大元素并使用 SSE 的工作函数,这里是:
float find_largest_element_in_matrix_SSE(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m128 max_el = _mm_loadu_ps(m);
__m128 curr;
for (i = 4; i < dims * dims; i += 4)
{
curr = _mm_loadu_ps(m + i);
max_el = _mm_max_ps(max_el, curr);
}
__declspec(align(16))float max_v[4] = { 0 };
_mm_store_ps(max_v, max_el);
return max(max(max(max_v[0], max_v[1]), max_v[2]), max_v[3]);
}
而且我还有一个使用 AVX 的非工作函数:
float find_largest_element_in_matrix_AVX(float* m, unsigned const int dims)
{
size_t i;
int index = -1;
__m256 max_el = _mm256_loadu_ps(m);
__m256 curr;
for (i = 8; i < dims * dims; i += 8)
{
curr = _mm256_loadu_ps(m + i);
max_el = _mm256_max_ps(max_el, curr);
}
__declspec(align(32))float max_v[8] = { 0 };
_mm256_store_ps(max_v, max_el);
__m256 y = _mm256_permute2f128_ps(max_el, max_el, 1);
__m256 m1 = _mm256_max_ps(max_el, y);m1[1] = max(max_el[1], max_el[3])
__m256 m2 = _mm256_permute_ps(m1, 5);
__m256 m_res = _mm256_max_ps(m1, m2);
return m[0];
}
谁能帮我找到最大元素的索引并让我的 AVX 版本正常工作?
这是一个有效的 SSE (SSE 4) 实现,returns 最大值和相应的索引,以及标量参考实现和测试工具:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#include <smmintrin.h> // SSE 4.1
float find_largest_element_in_matrix_ref(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
int i;
*maxIndex = 0;
for (i = 1; i < dims * dims; ++i)
{
if (m[i] > maxVal)
{
maxVal = m[i];
*maxIndex = i;
}
}
return maxVal;
}
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex)
{
float maxVal = m[0];
float aMaxVal[4];
int32_t aMaxIndex[4];
int i;
*maxIndex = 0;
const __m128i vIndexInc = _mm_set1_epi32(4);
__m128i vMaxIndex = _mm_setr_epi32(0, 1, 2, 3);
__m128i vIndex = vMaxIndex;
__m128 vMaxVal = _mm_loadu_ps(m);
for (i = 4; i < dims * dims; i += 4)
{
__m128 v = _mm_loadu_ps(&m[i]);
__m128 vcmp = _mm_cmpgt_ps(v, vMaxVal);
vIndex = _mm_add_epi32(vIndex, vIndexInc);
vMaxVal = _mm_max_ps(vMaxVal, v);
vMaxIndex = _mm_blendv_epi8(vMaxIndex, vIndex, _mm_castps_si128(vcmp));
}
_mm_storeu_ps(aMaxVal, vMaxVal);
_mm_storeu_si128((__m128i *)aMaxIndex, vMaxIndex);
maxVal = aMaxVal[0];
*maxIndex = aMaxIndex[0];
for (i = 1; i < 4; ++i)
{
if (aMaxVal[i] > maxVal)
{
maxVal = aMaxVal[i];
*maxIndex = aMaxIndex[i];
}
}
return maxVal;
}
int main()
{
const int dims = 1024;
float m[dims * dims];
float maxVal_ref, maxVal_SSE;
int maxIndex_ref = -1, maxIndex_SSE = -1;
int i;
srand(time(NULL));
for (i = 0; i < dims * dims; ++i)
{
m[i] = (float)rand() / RAND_MAX;
}
maxVal_ref = find_largest_element_in_matrix_ref(m, dims, &maxIndex_ref);
maxVal_SSE = find_largest_element_in_matrix_SSE(m, dims, &maxIndex_SSE);
if (maxVal_ref == maxVal_SSE && maxIndex_ref == maxIndex_SSE)
{
printf("PASS: maxVal = %f, maxIndex = %d\n",
maxVal_ref, maxIndex_ref);
}
else
{
printf("FAIL: maxVal_ref = %f, maxVal_SSE = %f, maxIndex_ref = %d, maxIndex_SSE = %d\n",
maxVal_ref, maxVal_SSE, maxIndex_ref, maxIndex_SSE);
}
return 0;
}
编译并运行:
$ gcc -Wall -msse4 Yakovenko.c && ./a.out
PASS: maxVal = 0.999999, maxIndex = 120409
显然,如果需要,您可以获得行和列索引:
int rowIndex = maxIndex / dims;
int colIndex = maxIndex % dims;
从这里开始,编写 AVX2 实现应该相当简单。
一种方法是在第一遍中计算最大值,并在第二遍中通过线性搜索找到索引。这是 SSE2 中的示例实现:
#define anybit __builtin_ctz //or lookup table with 16 entries...
float find_largest_element_in_matrix_SSE(const float* m, int dims, int *maxIndex) {
//first pass: calculate maximum as usual
__m128 vMaxVal = _mm_loadu_ps(m);
for (int i = 4; i < dims * dims; i += 4)
vMaxVal = _mm_max_ps(vMaxVal, _mm_loadu_ps(&m[i]));
//perform in-register reduction
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm_max_ps(vMaxVal, _mm_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
//second pass: search for maximal value
for (int i = 0; i < dims * dims; i += 4) {
__m128 vIsMax = _mm_cmpeq_ps(vMaxVal, _mm_loadu_ps(&m[i]));
if (int mask = _mm_movemask_ps(vIsMax)) {
*maxIndex = i + anybit(mask);
return _mm_cvtss_f32(vMaxVal);
}
}
}
请注意,除非您的输入数据非常小,否则应该几乎完美地预测第二个循环中的分支。
该解决方案存在几个问题,特别是:
在出现奇怪的浮点值时可能无法正常工作,例如NaNs.
如果您的矩阵不适合 CPU 缓存,那么代码将从主内存读取矩阵两次,因此它会比单遍方法慢两倍.这可以通过块处理来解决大型矩阵。
- 在第一个循环中,每次迭代都依赖于前一次迭代(
vMaxVal
既被修改又被读取),因此它会因_mm_max_ps
的延迟而变慢。也许将第一个循环展开一点(2x 或 4x)会很好,同时为vMaxVal
设置 4 个独立的寄存器(实际上,第二个循环也将从展开中受益)。
移植到 AVX 应该非常简单,除了寄存器内减少:
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(2, 3, 0, 1)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_shuffle_ps(vMaxVal, vMaxVal, _MM_SHUFFLE(1, 0, 3, 2)));
vMaxVal = _mm256_max_ps(vMaxVal, _mm256_permute2f128_ps(vMaxVal, vMaxVal, 1));
另一种方法:
void find_largest_element_in_matrix_SSE(float * matrix, size_t n, int * row, int * column, float * v){
__m128 indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 update = _mm_setr_ps(4, 4, 4, 4);
__m128 max_indecies = _mm_setr_ps(0, 1, 2, 3);
__m128 max = _mm_load_ps(matrix);
for (int i = 4; i < n * n; i+=4){
indecies = _mm_add_ps(indecies, update);
__m128 pm2 = _mm_load_ps(&matrix[i]);
__m128 mask = _mm_cmpge_ps(max, pm2);
max = _mm_max_ps(max, pm2);
max_indecies = _mm_or_ps(_mm_and_ps(max_indecies, mask), _mm_andnot_ps(mask, indecies));
}
__declspec (align(16)) int max_ind[4];
__m128i maxi = _mm_cvtps_epi32(max_indecies);
_mm_store_si128((__m128i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 4; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}
void find_largest_element_in_matrix_AVX(float * matrix, size_t n, int * row, int * column, float * v){
__m256 indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 update = _mm256_setr_ps(8, 8, 8, 8, 8, 8, 8, 8);
__m256 max_indecies = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
__m256 max = _mm256_load_ps(matrix);
for (int i = 8; i < n * n; i += 8){
indecies = _mm256_add_ps(indecies, update);
__m256 pm2 = _mm256_load_ps(&matrix[i]);
__m256 mask = _mm256_cmp_ps(max, pm2, _CMP_GE_OQ);
max = _mm256_max_ps(max, pm2);
max_indecies = _mm256_or_ps(_mm256_and_ps(max_indecies, mask), _mm256_andnot_ps(mask, indecies));
}
__declspec (align(32)) int max_ind[8];
__m256i maxi = _mm256_cvtps_epi32(max_indecies);
_mm256_store_si256((__m256i *) max_ind, maxi);
int c = max_ind[0];
for (int i = 1; i < 8; i++)
if (matrix[max_ind[i]] >= matrix[c] && max_ind[i] < c){
c = max_ind[i];
}
*v = matrix[c];
*row = c / n;
*column = c % n;
}