如何使用 sse 将两个 float 数组相乘?

How to multiply two float arrays using SSE?

/*
 * Multiply two float arrays element-wise and store the products in the
 * first array: a[i] = a[i] * b[i] for every 0 <= i < N.
 *
 * a: array of at least N floats; overwritten with the products.
 * b: array of at least N floats; only read.
 * N: number of elements to process; nothing is done when N <= 0.
 *
 * Unaligned loads and stores are used, so neither array has to be
 * 16-byte aligned.
 */
void sse_mul_float(float* a, float* b, int N)
{
  if (N <= 0)
    return;

  /* Largest multiple of 4 not exceeding N: the part handled 4 floats at a time. */
  const int vec_end = N - (N % 4);
  int i = 0;

  for (; i < vec_end; i += 4) {
    /* _mm_mul_ps takes __m128 values, not pointers, so load explicitly. */
    const __m128 lhs = _mm_loadu_ps(a + i);
    const __m128 rhs = _mm_loadu_ps(b + i);
    _mm_storeu_ps(a + i, _mm_mul_ps(lhs, rhs));
  }

  /* Scalar tail for the remaining N % 4 elements. */
  for (; i < N; ++i)
    a[i] *= b[i];
}

我想将一个浮点数组的元素与另一个浮点数组的元素相乘,我想将结果存储在第一个数组中

我收到此错误:无法将参数“1”的“__m128* {aka __vector(4) float*}”转换为“__m128 {aka __vector(4) float}”,以传递给“__m128 _mm_mul_ps(__m128, __m128)”

/*
 * In-place element-wise product of two float arrays using SSE:
 * a[i] = a[i] * b[i] for every 0 <= i < N.
 *
 * a: array of at least N floats; receives the products.
 * b: array of at least N floats; only read.
 * N: element count; the function returns immediately when N <= 0.
 *
 * The arrays may have any alignment because unaligned load/store
 * intrinsics are used.
 */
void sse_mul_float(float* a, float* b, int N)
{
  /* An __m128 holds four floats; this is a lane count, not sizeof(float). */
  enum { FLOATS_PER_VEC = 4 };

  if (N <= 0)
    return;

  const int nb_iters = N / FLOATS_PER_VEC;

  for (int i = 0; i < nb_iters; ++i)
  {
      float* dst = &a[i * FLOATS_PER_VEC];
      const float* src = &b[i * FLOATS_PER_VEC];
      _mm_storeu_ps(dst, _mm_mul_ps(_mm_loadu_ps(dst), _mm_loadu_ps(src)));
  }

  /* Elements left over when N is not a multiple of four. */
  for (int i = nb_iters * FLOATS_PER_VEC; i < N; ++i)
  {
      a[i] *= b[i];
  }
}