带有 if 比较的 ARM Neon armv7 SIMD 指令
ARM Neon armv7 SIMD instruction with if comparison
如何为以下循环编写 neon 代码:
// Scalar reference: 64 input samples and the threshold delta.
float sfx[64], delta = 9.9e-5;
// Store the reciprocal of each sfx[i], substituting delta whenever sfx[i] < delta.
// NOTE(review): abq is not declared in this snippet — presumably float abq[64]; confirm.
for(int i = 0; i < 64; i++) {
if (sfx[i] < delta) {
// 1.0 is a double literal, so the division is evaluated in double.
abq[i] = 1.0/delta;
} else {
abq[i] = 1.0/sfx[i];
}
}
我尝试使用vbslq_f32,但我必须一个一个地构造它的参数。为什么 NEON 不提供更方便的方式来完成这项工作?有没有更好的方法来做到这一点?
// Lane-by-lane attempt at the clamp-and-reciprocal loop, selecting with vbslq_f32.
float32x4_t vdelta = vdupq_n_f32((float)1.0/delta);  // every lane holds 1/delta
for(int i = 0; i < 64; i+=4) {
    float32x4_t vsfx = vld1q_f32((const float32_t*)(sfx+i));
    // vsetq_lane_* reads its vector operand, so start from a defined value.
    uint32x4_t vcon = vdupq_n_u32(0);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,0)<delta), vcon, 0);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,1)<delta), vcon, 1);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,2)<delta), vcon, 2);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,3)<delta), vcon, 3);
    // The scalar `<` yields 0 or 1, but vbslq_f32 selects bit by bit:
    // widen every non-zero lane to all ones so whole lanes are selected.
    vcon = vtstq_u32(vcon, vcon);
    float32x4_t vsfxdiv = vdupq_n_f32(0.0f);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,0)), vsfxdiv, 0);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,1)), vsfxdiv, 1);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,2)), vsfxdiv, 2);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,3)), vsfxdiv, 3);
    // Set mask bits take the first value operand: 1/delta where sfx < delta,
    // 1/sfx everywhere else.
    float32x4_t vabq = vbslq_f32(vcon, vdelta, vsfxdiv);
    vst1q_f32((abq+i), vabq);
}
的确,一次只处理一个通道(lane)等于违背了向量化的初衷,这样做有点傻,而且几乎完全没有必要。
这个:
// Mask built one lane at a time: each lane receives the 0/1 result of the scalar `<`.
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,0)<delta), vcon, 0);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,1)<delta), vcon, 1);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,2)<delta), vcon, 2);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,3)<delta), vcon, 3);
只不过是下面这种写法的笨办法:
// Broadcast the threshold itself. It gets its own name so it cannot shadow the
// question's vdelta, which holds 1/delta and feeds the select.
float32x4_t vthresh = vdupq_n_f32(delta);
// Vector compare less-than: one instruction produces the mask for all four lanes.
vcon = vcltq_f32(vsfx, vthresh);
除法稍微麻烦一些,因为 NEON 没有除法指令;不过这里并不需要通用除法,下面这个求倒数的运算:
// Reciprocal computed one lane at a time with scalar division (1.0 is a double literal).
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,0)), vsfxdiv, 0);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,1)), vsfxdiv, 1);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,2)), vsfxdiv, 2);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,3)), vsfxdiv, 3);
可以改写成向量化的形式:
// Seed with the reciprocal estimate; when precision is not all that critical,
// this alone may be good enough.
vsfxdiv = vrecpeq_f32(vsfx);
// Otherwise refine it: as a rule of thumb, two Newton-Raphson steps are
// probably sufficient for single-precision floats.
for (int step = 0; step < 2; step++) {
    vsfxdiv = vmulq_f32(vsfxdiv, vrecpsq_f32(vsfxdiv, vsfx));
}
话虽如此,仔细想一想,这个特定的情形还可以进一步简化:
// Scalar form: the divisor is delta when sfx[i] < delta, otherwise sfx[i].
if (sfx[i] < delta) {
abq[i] = 1.0/delta;
} else {
abq[i] = 1.0/sfx[i];
}
其实就等价于:
// Single-expression form of the branch: divide by the larger of delta and sfx[i].
// NOTE(review): `max` is not a standard C function — read it as shorthand here
// (presumably a macro or fmaxf in real code; confirm NaN handling matches the branch).
abq[i] = 1.0/max(delta, sfx[i]);
这意味着显式的比较和条件选择(select)都可以完全省去,最终得到:
// Clamp four inputs at a time to at least delta, then store their reciprocals
// (estimate refined by two Newton-Raphson steps). No compare or select is needed.
float32x4_t vdelta = vdupq_n_f32(delta);
for (int base = 0; base < 64; base += 4) {
    // Load four samples and raise any lane below delta up to delta.
    float32x4_t vclamped = vmaxq_f32(vld1q_f32((const float32_t *)&sfx[base]), vdelta);
    float32x4_t vinv = vrecpeq_f32(vclamped);
    for (int step = 0; step < 2; step++) {
        vinv = vmulq_f32(vinv, vrecpsq_f32(vinv, vclamped));
    }
    vst1q_f32(&abq[base], vinv);
}
如何为以下循环编写 neon 代码:
// Scalar reference: 64 input samples and the threshold delta.
float sfx[64], delta = 9.9e-5;
// Store the reciprocal of each sfx[i], substituting delta whenever sfx[i] < delta.
// NOTE(review): abq is not declared in this snippet — presumably float abq[64]; confirm.
for(int i = 0; i < 64; i++) {
if (sfx[i] < delta) {
// 1.0 is a double literal, so the division is evaluated in double.
abq[i] = 1.0/delta;
} else {
abq[i] = 1.0/sfx[i];
}
}
我尝试使用vbslq_f32,但我必须一个一个地构造它的参数。为什么 NEON 不提供更方便的方式来完成这项工作?有没有更好的方法来做到这一点?
// Lane-by-lane attempt at the clamp-and-reciprocal loop, selecting with vbslq_f32.
float32x4_t vdelta = vdupq_n_f32((float)1.0/delta);  // every lane holds 1/delta
for(int i = 0; i < 64; i+=4) {
    float32x4_t vsfx = vld1q_f32((const float32_t*)(sfx+i));
    // vsetq_lane_* reads its vector operand, so start from a defined value.
    uint32x4_t vcon = vdupq_n_u32(0);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,0)<delta), vcon, 0);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,1)<delta), vcon, 1);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,2)<delta), vcon, 2);
    vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,3)<delta), vcon, 3);
    // The scalar `<` yields 0 or 1, but vbslq_f32 selects bit by bit:
    // widen every non-zero lane to all ones so whole lanes are selected.
    vcon = vtstq_u32(vcon, vcon);
    float32x4_t vsfxdiv = vdupq_n_f32(0.0f);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,0)), vsfxdiv, 0);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,1)), vsfxdiv, 1);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,2)), vsfxdiv, 2);
    vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,3)), vsfxdiv, 3);
    // Set mask bits take the first value operand: 1/delta where sfx < delta,
    // 1/sfx everywhere else.
    float32x4_t vabq = vbslq_f32(vcon, vdelta, vsfxdiv);
    vst1q_f32((abq+i), vabq);
}
的确,一次只处理一个通道(lane)等于违背了向量化的初衷,这样做有点傻,而且几乎完全没有必要。
这个:
// Mask built one lane at a time: each lane receives the 0/1 result of the scalar `<`.
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,0)<delta), vcon, 0);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,1)<delta), vcon, 1);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,2)<delta), vcon, 2);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,3)<delta), vcon, 3);
只不过是下面这种写法的笨办法:
// Broadcast the threshold itself. It gets its own name so it cannot shadow the
// question's vdelta, which holds 1/delta and feeds the select.
float32x4_t vthresh = vdupq_n_f32(delta);
// Vector compare less-than: one instruction produces the mask for all four lanes.
vcon = vcltq_f32(vsfx, vthresh);
除法稍微麻烦一些,因为 NEON 没有除法指令;不过这里并不需要通用除法,下面这个求倒数的运算:
// Reciprocal computed one lane at a time with scalar division (1.0 is a double literal).
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,0)), vsfxdiv, 0);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,1)), vsfxdiv, 1);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,2)), vsfxdiv, 2);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,3)), vsfxdiv, 3);
可以改写成向量化的形式:
// Seed with the reciprocal estimate; when precision is not all that critical,
// this alone may be good enough.
vsfxdiv = vrecpeq_f32(vsfx);
// Otherwise refine it: as a rule of thumb, two Newton-Raphson steps are
// probably sufficient for single-precision floats.
for (int step = 0; step < 2; step++) {
    vsfxdiv = vmulq_f32(vsfxdiv, vrecpsq_f32(vsfxdiv, vsfx));
}
话虽如此,仔细想一想,这个特定的情形还可以进一步简化:
// Scalar form: the divisor is delta when sfx[i] < delta, otherwise sfx[i].
if (sfx[i] < delta) {
abq[i] = 1.0/delta;
} else {
abq[i] = 1.0/sfx[i];
}
其实就等价于:
// Single-expression form of the branch: divide by the larger of delta and sfx[i].
// NOTE(review): `max` is not a standard C function — read it as shorthand here
// (presumably a macro or fmaxf in real code; confirm NaN handling matches the branch).
abq[i] = 1.0/max(delta, sfx[i]);
这意味着显式的比较和条件选择(select)都可以完全省去,最终得到:
// Clamp four inputs at a time to at least delta, then store their reciprocals
// (estimate refined by two Newton-Raphson steps). No compare or select is needed.
float32x4_t vdelta = vdupq_n_f32(delta);
for (int base = 0; base < 64; base += 4) {
    // Load four samples and raise any lane below delta up to delta.
    float32x4_t vclamped = vmaxq_f32(vld1q_f32((const float32_t *)&sfx[base]), vdelta);
    float32x4_t vinv = vrecpeq_f32(vclamped);
    for (int step = 0; step < 2; step++) {
        vinv = vmulq_f32(vinv, vrecpsq_f32(vinv, vclamped));
    }
    vst1q_f32(&abq[base], vinv);
}