针对 ARM 优化的 FAST 计算
Optimized FAST calculation for ARM
我想通过使用 arm neon 库在 ARM cortex a8 上实现我发现的关于 5ms ORB 特征计算的论文。但我已经在为 FAST 特征检测而苦苦挣扎。
我想实现的那篇论文可以在这里(here)找到。
首先,我不太确定亮(Bright)约束和暗(Dark)约束。按照我的理解,FAST 需要检查中心像素周围的圆上是否有 9 个连续的更暗像素或 9 个连续的更亮像素,所以我两种情况都检查。但现在的问题是:即使还没有加上最终用于判断是否为角点的移位操作,我的实现的平均耗时就已经是 OpenCV 完整流程平均耗时的 3 倍。下面是我目前的代码,也许有人能指出可以做哪些优化。
// Reference run: time OpenCV's own detector on the same image.
Clock::time_point t0 = Clock::now();
detectors[y]->detect(img, ocv_kps);
Clock::time_point t1 = Clock::now();

vector<Point2f> my_kps;
// Intensity threshold for FAST.
const uchar th = 8;
// Number of candidates currently queued in ib_arr / id_arr.
int b_cnt = 0;
int d_cnt = 0;
// Ring masks of up to four candidate corners, queued so they can be tested in parallel.
uint32_t id_arr[4];
uint32_t ib_arr[4];
// Places a single comparison result at bit position `pos` of a ring mask.
const auto bit = [](bool flag, int pos) {
    return static_cast<uint32_t>(flag) << pos;
};

Clock::time_point t01 = Clock::now();
for (int i = 3; i < img.rows - 3; i++) {
    // Pointers to the seven image rows covered by the radius-3 ring:
    // three above the centre row, the centre row itself, and three below.
    const uchar* Mt3 = img.ptr<uchar>(i - 3);
    const uchar* Mt2 = img.ptr<uchar>(i - 2);
    const uchar* Mt1 = img.ptr<uchar>(i - 1);
    const uchar* Mc = img.ptr<uchar>(i);
    const uchar* Mb1 = img.ptr<uchar>(i + 1);
    const uchar* Mb2 = img.ptr<uchar>(i + 2);
    const uchar* Mb3 = img.ptr<uchar>(i + 3);
    for (int j = 3; j < img.cols - 3; j++) {
        // Column offsets must stay int: storing them in uchar wraps for any
        // image wider than 255 columns and reads the wrong pixels.
        const int j3 = j + 3;
        const int j2 = j + 2;
        const int j1 = j + 1;
        const int jn3 = j - 3;
        const int jn2 = j - 2;
        const int jn1 = j - 1;
        // Centre pixel and the four compass points of the ring.
        const uchar c = Mc[j];
        const uchar l = Mc[jn3];
        const uchar r = Mc[j3];
        const uchar t = Mt3[j];
        const uchar b = Mb3[j];
        // Thresholds are kept as int so that c + th > 255 and c - th < 0 do
        // not wrap around and invert the comparison.
        const int thb = c + th;
        const int thd = c - th;
        // Bright constraint on the compass points.
        const bool cbt = t > thb;
        const bool cbb = b > thb;
        const bool cbl = l > thb;
        const bool cbr = r > thb;
        // Dark constraint on the compass points.
        const bool cdl = l < thd;
        const bool cdr = r < thd;
        const bool cdt = t < thd;
        const bool cdb = b < thd;
        // Pre-test: at least two adjacent compass points must satisfy the constraint.
        const bool bright_candidate = (cbl && cbt) || (cbt && cbr)
                || (cbr && cbb) || (cbb && cbl);
        const bool dark_candidate = (cdl && cdt) || (cdt && cdr)
                || (cdr && cdb) || (cdb && cdl);
        if (!bright_candidate && !dark_candidate) {
            continue;
        }
        // Remaining twelve ring pixels, loaded once for both constraints.
        const uchar mt3 = Mt3[j1];
        const uchar mt3n = Mt3[jn1];
        const uchar mt2 = Mt2[j2];
        const uchar mt2n = Mt2[jn2];
        const uchar mt1 = Mt1[j3];
        const uchar mt1n = Mt1[jn3];
        const uchar mb3 = Mb3[j1];
        const uchar mb3n = Mb3[jn1];
        const uchar mb2 = Mb2[j2];
        const uchar mb2n = Mb2[jn2];
        const uchar mb1 = Mb1[j3];
        const uchar mb1n = Mb1[jn3];
        if (bright_candidate) {
            // Bits 0..15 walk the ring clockwise starting at the top pixel.
            const uint32_t ring = bit(cbt, 0) | bit(mt3 > thb, 1)
                    | bit(mt2 > thb, 2) | bit(mt1 > thb, 3)
                    | bit(cbr, 4) | bit(mb1 > thb, 5)
                    | bit(mb2 > thb, 6) | bit(mb3 > thb, 7)
                    | bit(cbb, 8) | bit(mb3n > thb, 9)
                    | bit(mb2n > thb, 10) | bit(mb1n > thb, 11)
                    | bit(cbl, 12) | bit(mt1n > thb, 13)
                    | bit(mt2n > thb, 14) | bit(mt3n > thb, 15);
            // Bits 16..23 repeat bits 0..7 so an arc that wraps past the top
            // pixel still appears as one contiguous run.
            ib_arr[b_cnt] = ring | ((ring & 0xFFu) << 16);
            b_cnt++;
            // Once four candidates are queued, test them together.
            if (b_cnt == 4) {
                // vld1q_u32 loads exactly the four queued masks; vld4_u32
                // would read eight uint32_t and overrun the array.
                const uint32x4_t IB = vld1q_u32(ib_arr);
                static_cast<void>(IB);
                /*
                 * here the actual shift operation would take place
                 */
                b_cnt = 0;
            }
        }
        if (dark_candidate) {
            // Same bit layout as the bright mask, using the dark comparison.
            const uint32_t ring = bit(cdt, 0) | bit(mt3 < thd, 1)
                    | bit(mt2 < thd, 2) | bit(mt1 < thd, 3)
                    | bit(cdr, 4) | bit(mb1 < thd, 5)
                    | bit(mb2 < thd, 6) | bit(mb3 < thd, 7)
                    | bit(cdb, 8) | bit(mb3n < thd, 9)
                    | bit(mb2n < thd, 10) | bit(mb1n < thd, 11)
                    | bit(cdl, 12) | bit(mt1n < thd, 13)
                    | bit(mt2n < thd, 14) | bit(mt3n < thd, 15);
            id_arr[d_cnt] = ring | ((ring & 0xFFu) << 16);
            d_cnt++;
            // Once four candidates are queued, test them together.
            if (d_cnt == 4) {
                const uint32x4_t IA = vld1q_u32(id_arr);
                static_cast<void>(IA);
                /*
                 * here the actual shift operation would take place
                 */
                d_cnt = 0;
            }
        }
    }
}
// NOTE(review): candidates still queued here (b_cnt / d_cnt < 4) are never
// tested; flush them once the corner test itself is implemented.
Clock::time_point t11 = Clock::now();
cout << "my algorithm found " << my_kps.size()
        << " and ocv found " << ocv_kps.size() << endl;
microseconds ms1 = std::chrono::duration_cast<microseconds>(t1 - t0);
microseconds ms2 = std::chrono::duration_cast<microseconds>(t11 - t01);
rs.Push(static_cast<double>(ms2.count()));
cout << "my algorithm duration " << ms2.count()
        << " and ocv duration is " << ms1.count() << endl;
在深入研究 ARM 汇编之后,我写出了一份代码,它在 ARM 上的运行速度至少比 OpenCV 内置的 FAST-9 实现快 2 倍。您可以在 GitHub 上查看代码。非常欢迎任何优化建议。
在我的 Raspberry Pi 3 上,它需要循环:
我的算法 1000ms
OpenCv 为 2000 毫秒
在 320x240 灰度图像上。
我有一个在 raspberry pi 上以 30fps 运行的 ORB 提取器。
https://github.com/0xfaded/pislam
优化真是一门黑魔法,更糟糕的是ARM从未发布过a53的优化指南。我们拥有的最好的是 a57,它可能具有类似的 NEON 单元。
我真的不能在这里提供完整的答案,但我会分享一些我的过程。
我的 FAST 提取器的第一部分加载测试像素环并将它们转换为 16 位向量,就像您的代码所做的那样。我没有直接编写 asm,而是使用了 gcc 内在函数。不过,我确保 gcc:
- 没有将任何寄存器溢出到堆栈
- 为每次比较发出最少数量的指令
您会注意到,第一个比较没有用掩码(本应是 0x80)来隔离它的位。这样就省下了一个原本要用来保存该常量的寄存器,也给了 gcc 足够的回旋余地,使其不必溢出寄存器。
您还会注意到一些相当可怕的内在用法:
// vbslq_u8(mask, a, b) takes each bit from `a` where `mask` is set and from `b` elsewhere,
// so only bit 0x40 of every lane is replaced by the comparison result; all other bits of d0/l0 are kept.
d0 = vbslq_u8(vdupq_n_u8(0x40u), vcgeq_u8(test, dark), d0);
l0 = vbslq_u8(vdupq_n_u8(0x40u), vcleq_u8(test, light), l0);
这相当于
// Plain-operator form of the two vbslq_u8 statements: `>=` / `<=` bind tighter
// than `&`, so each line reads as x |= (comparison) & 0x40.
// The light test is `<=`, matching vcleq_u8 in the intrinsic version.
d0 |= test >= dark & 0x40;
l0 |= test <= light & 0x40;
Gcc 会愉快地编译后者,但会发出 1.5 倍的指令。
第二部分是在 16 位向量上进行 FAST-9 测试。下面编译成 16 条指令,但我花了将近一个月的时间断断续续地想出来。
// FAST-9 arc test on the per-pixel ring bit vectors (d0/l0 are built by the comparisons above;
// d1/l1 presumably hold the other half of the ring — confirm against Fast.h).
// vtstq_u8 sets a lane to all ones when (d0 & d1) is non-zero in that lane.
// NOTE(review): t0 and t1 are computed from identical operands here.
uint8x16_t t0 = vtstq_u8(d0, d1);
uint8x16_t t1 = vtstq_u8(d0, d1);
// Per lane: take the light bits where the mask is set, the dark bits otherwise.
t0 = vbslq_u8(t0, l0, d0);
t1 = vbslq_u8(t1, l1, d1);
// Count the leading zero bits of one half, then shift the other half left by (count - 1).
uint8x16_t cntLo = vclzq_u8(t0);
uint8x16_t testLo = t1 << (cntLo - 1);
// Hand-written compare-with-zero: each lane becomes all ones if it was zero.
asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testLo));
// Same steps with the roles of the two halves swapped.
uint8x16_t cntHi = vclzq_u8(t1);
uint8x16_t testHi = t0 << (cntHi - 1);
asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testHi));
// Combine both directions; vtstq_u8(x, x) turns every non-zero lane into all ones.
uint8x16_t result = (cntLo & testLo) | (cntHi & testHi);
result = vtstq_u8(result, result);
令人讨厌的是,gcc 不会将 testLo == 0 编译为 vceq.u8 %q0, %q0, #0,而后者是专门用于与常量零比较的指令。我最终手动插入了这些指令,从而又省掉了几条指令。
希望能提供一些见解。 Fast.h
我想通过使用 ARM NEON 库在 ARM Cortex-A8 上实现我找到的一篇关于 5ms ORB 特征计算的论文,但我在 FAST 特征检测这一步就已经遇到了困难。我想实现的那篇论文可以在这里(here)找到。首先,我不太确定亮(Bright)约束和暗(Dark)约束。按照我的理解,FAST 需要检查中心像素周围的圆上是否有 9 个连续的更暗像素或 9 个连续的更亮像素,所以我两种情况都检查。但现在的问题是:即使还没有加上最终用于判断是否为角点的移位操作,我的实现的平均耗时就已经是 OpenCV 完整流程平均耗时的 3 倍。下面是我目前的代码,也许有人能指出可以做哪些优化。
// Reference run: time OpenCV's own detector on the same image.
Clock::time_point t0 = Clock::now();
detectors[y]->detect(img, ocv_kps);
Clock::time_point t1 = Clock::now();

vector<Point2f> my_kps;
// Intensity threshold for FAST.
const uchar th = 8;
// Number of candidates currently queued in ib_arr / id_arr.
int b_cnt = 0;
int d_cnt = 0;
// Ring masks of up to four candidate corners, queued so they can be tested in parallel.
uint32_t id_arr[4];
uint32_t ib_arr[4];
// Places a single comparison result at bit position `pos` of a ring mask.
const auto bit = [](bool flag, int pos) {
    return static_cast<uint32_t>(flag) << pos;
};

Clock::time_point t01 = Clock::now();
for (int i = 3; i < img.rows - 3; i++) {
    // Pointers to the seven image rows covered by the radius-3 ring:
    // three above the centre row, the centre row itself, and three below.
    const uchar* Mt3 = img.ptr<uchar>(i - 3);
    const uchar* Mt2 = img.ptr<uchar>(i - 2);
    const uchar* Mt1 = img.ptr<uchar>(i - 1);
    const uchar* Mc = img.ptr<uchar>(i);
    const uchar* Mb1 = img.ptr<uchar>(i + 1);
    const uchar* Mb2 = img.ptr<uchar>(i + 2);
    const uchar* Mb3 = img.ptr<uchar>(i + 3);
    for (int j = 3; j < img.cols - 3; j++) {
        // Column offsets must stay int: storing them in uchar wraps for any
        // image wider than 255 columns and reads the wrong pixels.
        const int j3 = j + 3;
        const int j2 = j + 2;
        const int j1 = j + 1;
        const int jn3 = j - 3;
        const int jn2 = j - 2;
        const int jn1 = j - 1;
        // Centre pixel and the four compass points of the ring.
        const uchar c = Mc[j];
        const uchar l = Mc[jn3];
        const uchar r = Mc[j3];
        const uchar t = Mt3[j];
        const uchar b = Mb3[j];
        // Thresholds are kept as int so that c + th > 255 and c - th < 0 do
        // not wrap around and invert the comparison.
        const int thb = c + th;
        const int thd = c - th;
        // Bright constraint on the compass points.
        const bool cbt = t > thb;
        const bool cbb = b > thb;
        const bool cbl = l > thb;
        const bool cbr = r > thb;
        // Dark constraint on the compass points.
        const bool cdl = l < thd;
        const bool cdr = r < thd;
        const bool cdt = t < thd;
        const bool cdb = b < thd;
        // Pre-test: at least two adjacent compass points must satisfy the constraint.
        const bool bright_candidate = (cbl && cbt) || (cbt && cbr)
                || (cbr && cbb) || (cbb && cbl);
        const bool dark_candidate = (cdl && cdt) || (cdt && cdr)
                || (cdr && cdb) || (cdb && cdl);
        if (!bright_candidate && !dark_candidate) {
            continue;
        }
        // Remaining twelve ring pixels, loaded once for both constraints.
        const uchar mt3 = Mt3[j1];
        const uchar mt3n = Mt3[jn1];
        const uchar mt2 = Mt2[j2];
        const uchar mt2n = Mt2[jn2];
        const uchar mt1 = Mt1[j3];
        const uchar mt1n = Mt1[jn3];
        const uchar mb3 = Mb3[j1];
        const uchar mb3n = Mb3[jn1];
        const uchar mb2 = Mb2[j2];
        const uchar mb2n = Mb2[jn2];
        const uchar mb1 = Mb1[j3];
        const uchar mb1n = Mb1[jn3];
        if (bright_candidate) {
            // Bits 0..15 walk the ring clockwise starting at the top pixel.
            const uint32_t ring = bit(cbt, 0) | bit(mt3 > thb, 1)
                    | bit(mt2 > thb, 2) | bit(mt1 > thb, 3)
                    | bit(cbr, 4) | bit(mb1 > thb, 5)
                    | bit(mb2 > thb, 6) | bit(mb3 > thb, 7)
                    | bit(cbb, 8) | bit(mb3n > thb, 9)
                    | bit(mb2n > thb, 10) | bit(mb1n > thb, 11)
                    | bit(cbl, 12) | bit(mt1n > thb, 13)
                    | bit(mt2n > thb, 14) | bit(mt3n > thb, 15);
            // Bits 16..23 repeat bits 0..7 so an arc that wraps past the top
            // pixel still appears as one contiguous run.
            ib_arr[b_cnt] = ring | ((ring & 0xFFu) << 16);
            b_cnt++;
            // Once four candidates are queued, test them together.
            if (b_cnt == 4) {
                // vld1q_u32 loads exactly the four queued masks; vld4_u32
                // would read eight uint32_t and overrun the array.
                const uint32x4_t IB = vld1q_u32(ib_arr);
                static_cast<void>(IB);
                /*
                 * here the actual shift operation would take place
                 */
                b_cnt = 0;
            }
        }
        if (dark_candidate) {
            // Same bit layout as the bright mask, using the dark comparison.
            const uint32_t ring = bit(cdt, 0) | bit(mt3 < thd, 1)
                    | bit(mt2 < thd, 2) | bit(mt1 < thd, 3)
                    | bit(cdr, 4) | bit(mb1 < thd, 5)
                    | bit(mb2 < thd, 6) | bit(mb3 < thd, 7)
                    | bit(cdb, 8) | bit(mb3n < thd, 9)
                    | bit(mb2n < thd, 10) | bit(mb1n < thd, 11)
                    | bit(cdl, 12) | bit(mt1n < thd, 13)
                    | bit(mt2n < thd, 14) | bit(mt3n < thd, 15);
            id_arr[d_cnt] = ring | ((ring & 0xFFu) << 16);
            d_cnt++;
            // Once four candidates are queued, test them together.
            if (d_cnt == 4) {
                const uint32x4_t IA = vld1q_u32(id_arr);
                static_cast<void>(IA);
                /*
                 * here the actual shift operation would take place
                 */
                d_cnt = 0;
            }
        }
    }
}
// NOTE(review): candidates still queued here (b_cnt / d_cnt < 4) are never
// tested; flush them once the corner test itself is implemented.
Clock::time_point t11 = Clock::now();
cout << "my algorithm found " << my_kps.size()
        << " and ocv found " << ocv_kps.size() << endl;
microseconds ms1 = std::chrono::duration_cast<microseconds>(t1 - t0);
microseconds ms2 = std::chrono::duration_cast<microseconds>(t11 - t01);
rs.Push(static_cast<double>(ms2.count()));
cout << "my algorithm duration " << ms2.count()
        << " and ocv duration is " << ms1.count() << endl;
在深入研究 ARM 汇编之后,我写出了一份代码,它在 ARM 上的运行速度至少比 OpenCV 内置的 FAST-9 实现快 2 倍。您可以在 GitHub 上查看代码。非常欢迎任何优化建议。在我的 Raspberry Pi 3 上,循环耗时为:我的算法 1000 ms,OpenCV 2000 ms,
在 320x240 灰度图像上。
我有一个在 raspberry pi 上以 30fps 运行的 ORB 提取器。
https://github.com/0xfaded/pislam
优化真是一门黑魔法,更糟糕的是ARM从未发布过a53的优化指南。我们拥有的最好的是 a57,它可能具有类似的 NEON 单元。
我真的不能在这里提供完整的答案,但我会分享一些我的过程。
我的 FAST 提取器的第一部分加载测试像素环并将它们转换为 16 位向量,就像您的代码所做的那样。我没有直接编写 asm,而是使用了 gcc 内在函数。不过,我确保 gcc:
- 没有将任何寄存器溢出到堆栈
- 为每次比较发出最少数量的指令
您会注意到,第一个比较没有用掩码(本应是 0x80)来隔离它的位。这样就省下了一个原本要用来保存该常量的寄存器,也给了 gcc 足够的回旋余地,使其不必溢出寄存器。
您还会注意到一些相当可怕的内在用法:
// vbslq_u8(mask, a, b) takes each bit from `a` where `mask` is set and from `b` elsewhere,
// so only bit 0x40 of every lane is replaced by the comparison result; all other bits of d0/l0 are kept.
d0 = vbslq_u8(vdupq_n_u8(0x40u), vcgeq_u8(test, dark), d0);
l0 = vbslq_u8(vdupq_n_u8(0x40u), vcleq_u8(test, light), l0);
这相当于
// Plain-operator form of the two vbslq_u8 statements: `>=` / `<=` bind tighter
// than `&`, so each line reads as x |= (comparison) & 0x40.
// The light test is `<=`, matching vcleq_u8 in the intrinsic version.
d0 |= test >= dark & 0x40;
l0 |= test <= light & 0x40;
Gcc 会愉快地编译后者,但会发出 1.5 倍的指令。
第二部分是在 16 位向量上进行 FAST-9 测试。下面编译成 16 条指令,但我花了将近一个月的时间断断续续地想出来。
// FAST-9 arc test on the per-pixel ring bit vectors (d0/l0 are built by the comparisons above;
// d1/l1 presumably hold the other half of the ring — confirm against Fast.h).
// vtstq_u8 sets a lane to all ones when (d0 & d1) is non-zero in that lane.
// NOTE(review): t0 and t1 are computed from identical operands here.
uint8x16_t t0 = vtstq_u8(d0, d1);
uint8x16_t t1 = vtstq_u8(d0, d1);
// Per lane: take the light bits where the mask is set, the dark bits otherwise.
t0 = vbslq_u8(t0, l0, d0);
t1 = vbslq_u8(t1, l1, d1);
// Count the leading zero bits of one half, then shift the other half left by (count - 1).
uint8x16_t cntLo = vclzq_u8(t0);
uint8x16_t testLo = t1 << (cntLo - 1);
// Hand-written compare-with-zero: each lane becomes all ones if it was zero.
asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testLo));
// Same steps with the roles of the two halves swapped.
uint8x16_t cntHi = vclzq_u8(t1);
uint8x16_t testHi = t0 << (cntHi - 1);
asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testHi));
// Combine both directions; vtstq_u8(x, x) turns every non-zero lane into all ones.
uint8x16_t result = (cntLo & testLo) | (cntHi & testHi);
result = vtstq_u8(result, result);
令人讨厌的是,gcc 不会将 testLo == 0 编译为 vceq.u8 %q0, %q0, #0,而后者是专门用于与常量零比较的指令。我最终手动插入了这些指令,从而又省掉了几条指令。
希望能提供一些见解。 Fast.h