Why is my code much slower than OpenCV for a simple StereoBM algorithm?
Here is my test code, which implements a simple block-matching (testBM) algorithm without any pre-filtering. When the window size is large it takes around 400 ms or even more, while OpenCV's StereoBM (the CPU version, not the GPU one) takes about 20 ms. I have looked at the StereoBM source, but I find it hard to follow. Does anyone know why mine is so much slower?
My code is below.
void testBM(const Mat &left0,
            const Mat &right0,
            Mat &disparity,
            int SAD,
            int searchRange)
{
    int cols = left0.cols;
    int rows = left0.rows;
    int total = cols * rows;
    const uchar* data_left = left0.ptr<uchar>(0);
    const uchar* data_right = right0.ptr<uchar>(0);
    uchar* data_dm = new uchar[total];
    int dbNum = 2 * SAD + 1;
    int dNum = dbNum * dbNum;
    // x is the column index inside the dbNum * dbNum window,
    // y is the row index inside that window,
    // z is (x + y * cols).
    // I precompute them to avoid repeated computation.
    Point3i *dLocDif = new Point3i[dNum];
    for (int i = 0; i < dNum; i++)
    {
        dLocDif[i] = Point3i(i % dbNum - SAD, i / dbNum - SAD, 0);
        dLocDif[i].z = dLocDif[i].x + dLocDif[i].y * cols;
    }
    // I precompute the pixel difference for each search offset to avoid
    // computing it repeatedly.
    uchar* dif_ = new uchar[total * searchRange];
    for (int _search = 0; _search < searchRange; _search++)
    {
        int th = _search * total;
        for (int i = 0; i < total; i++)
        {
            int c = i % cols - _search;
            if (c < 0) continue;
            dif_[i + th] = (uchar)std::abs(data_left[i] - data_right[i - _search]);
        }
    }
    for (int p = 0; p < total; p++)
    {
        int min = 50 * dNum;
        int dm = -256;
        int _col = p % cols;
        int _row = p / cols;
        int th = 0;
        // I search for the disparity with the smallest difference between
        // the left and right image, using dif_.
        for (int _search = 0; _search < searchRange; _search++, th += total)
        {
            if (_col + _search > cols) break;
            int temp = 0;
            for (int i = 0; i < dNum; i++)
            {
                int _c = _col + dLocDif[i].x;
                if (_c >= cols || _c < 0) continue;
                int _r = _row + dLocDif[i].y;
                if (_r >= rows || _r < 0) continue;
                temp += dif_[th + p + dLocDif[i].z];
                if (temp > min)
                {
                    break;
                }
            }
            if (temp < min)
            {
                dm = _search;
                min = temp;
            }
        }
        data_dm[p] = dm;
    }
    disparity = Mat(rows, cols, CV_8UC1, data_dm);
}
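For reference, this is roughly how I time the two implementations (a minimal sketch: the file names, the window radius 10 and the disparity range 64 are placeholder values, testBM is the function above in the same file, and the OpenCV call assumes the 3.x/4.x StereoBM::create/compute API):

#include <opencv2/opencv.hpp>
#include <iostream>

int main()
{
    // Any rectified 8-bit grayscale stereo pair; the file names are placeholders.
    cv::Mat left  = cv::imread("left.png",  cv::IMREAD_GRAYSCALE);
    cv::Mat right = cv::imread("right.png", cv::IMREAD_GRAYSCALE);
    cv::Mat disp_mine, disp_cv;

    // My block matcher: window radius 10 (21x21 window), 64-pixel search range.
    int64 t0 = cv::getTickCount();
    testBM(left, right, disp_mine, 10, 64);
    int64 t1 = cv::getTickCount();

    // OpenCV's CPU StereoBM with a comparable window and disparity range.
    cv::Ptr<cv::StereoBM> bm = cv::StereoBM::create(64, 21);
    bm->compute(left, right, disp_cv);
    int64 t2 = cv::getTickCount();

    double f = cv::getTickFrequency();
    std::cout << "testBM:   " << (t1 - t0) / f * 1000.0 << " ms\n";
    std::cout << "StereoBM: " << (t2 - t1) / f * 1000.0 << " ms\n";
    return 0;
}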
Here is the core of the StereoBM source in OpenCV. I get lost after the initialization part. Could anyone briefly explain it?
static void
findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                            Mat& disp, Mat& cost, const CvStereoBMState& state,
                            uchar* buf, int _dy0, int _dy1 )
{
    const int ALIGN = 16;
    int x, y, d;
    int wsz = state.SADWindowSize, wsz2 = wsz/2;
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
    int ndisp = state.numberOfDisparities;
    int mindisp = state.minDisparity;
    int lofs = MAX(ndisp - 1 + mindisp, 0);
    int rofs = -MIN(ndisp - 1 + mindisp, 0);
    int width = left.cols, height = left.rows;
    int width1 = width - rofs - ndisp + 1;
    int ftzero = state.preFilterCap;
    int textureThreshold = state.textureThreshold;
    int uniquenessRatio = state.uniquenessRatio;
    short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);

    int *sad, *hsad0, *hsad, *hsad_sub, *htext;
    uchar *cbuf0, *cbuf;
    const uchar* lptr0 = left.data + lofs;
    const uchar* rptr0 = right.data + rofs;
    const uchar *lptr, *lptr_sub, *rptr;
    short* dptr = (short*)disp.data;
    int sstep = (int)left.step;
    int dstep = (int)(disp.step/sizeof(dptr[0]));
    int cstep = (height+dy0+dy1)*ndisp;
    int costbuf = 0;
    int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    const int TABSZ = 256;
    uchar tab[TABSZ];

    sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
    hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
    htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
    cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)std::abs(x - ftzero);

    // initialize buffers
    memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
    memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );

    for( x = -wsz2-1; x < wsz2; x++ )
    {
        hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
        lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep;
        rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-1) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            for( d = 0; d < ndisp; d++ )
            {
                int diff = std::abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] = (int)(hsad[d] + diff);
            }
            htext[y] += tab[lval];
        }
    }

    // initialize the left and right borders of the disparity map
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < lofs; x++ )
            dptr[y*dstep + x] = FILTERED;
        for( x = lofs + width1; x < width; x++ )
            dptr[y*dstep + x] = FILTERED;
    }
    dptr += lofs;

    for( x = 0; x < width1; x++, dptr++ )
    {
        int* costptr = cost.data ? (int*)cost.data + lofs + x : &costbuf;
        int x0 = x - wsz2 - 1, x1 = x + wsz2;
        const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        hsad = hsad0 - dy0*ndisp;
        lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
        lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            for( d = 0; d < ndisp; d++ )
            {
                int diff = std::abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] = hsad[d] + diff - cbuf_sub[d];
            }
            htext[y] += tab[lval] - tab[lptr_sub[0]];
        }

        // fill borders
        for( y = dy1; y <= wsz2; y++ )
            htext[height+y] = htext[height+dy1-1];
        for( y = -wsz2-1; y < -dy0; y++ )
            htext[y] = htext[-dy0];

        // initialize sums
        for( d = 0; d < ndisp; d++ )
            sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));

        hsad = hsad0 + (1 - dy0)*ndisp;
        for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
            for( d = 0; d < ndisp; d++ )
                sad[d] = (int)(sad[d] + hsad[d]);

        int tsum = 0;
        for( y = -wsz2-1; y < wsz2; y++ )
            tsum += htext[y];

        // finally, start the real processing
        for( y = 0; y < height; y++ )
        {
            int minsad = INT_MAX, mind = -1;
            hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
            hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;

            for( d = 0; d < ndisp; d++ )
            {
                int currsad = sad[d] + hsad[d] - hsad_sub[d];
                sad[d] = currsad;
                if( currsad < minsad )
                {
                    minsad = currsad;
                    mind = d;
                }
            }
            tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
            if( tsum < textureThreshold )
            {
                dptr[y*dstep] = FILTERED;
                continue;
            }

            if( uniquenessRatio > 0 )
            {
                int thresh = minsad + (minsad * uniquenessRatio/100);
                for( d = 0; d < ndisp; d++ )
                {
                    if( sad[d] <= thresh && (d < mind-1 || d > mind+1))
                        break;
                }
                if( d < ndisp )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
            }

            {
                sad[-1] = sad[1];
                sad[ndisp] = sad[ndisp-2];
                int p = sad[mind+1], n = sad[mind-1];
                d = p + n - 2*sad[mind] + std::abs(p - n);
                dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
                costptr[y*coststep] = sad[mind];
            }
        }
    }
}
OpenCV runs many of its algorithms in parallel; parallel_for/do abstracts over the TBB, PPL and OpenMP backends.
The original image is subdivided into sub-regions, and findStereoCorrespondenceBM() is executed on each sub-region.
This works with the interface you see here because a cv::Mat can be used as a view onto a sub-image without copying the actual pixel data.
You can verify this while the program is running by watching which processors are in use (e.g. Process Explorer on Windows or top on Unix).
(Originally posted as a comment by Hauke Heibel)
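A rough sketch of that pattern (not the actual StereoBM source; matchStrip is a hypothetical stand-in for the per-region work, and it assumes an OpenCV version with the std::function overload of parallel_for_, i.e. 3.2 or newer): the frame is split into horizontal stripes with cv::Range, each worker gets non-copying row views via Mat::rowRange, and cv::parallel_for_ hands the stripes to whichever backend OpenCV was built with.

#include <opencv2/core.hpp>

// Hypothetical per-stripe worker; stands in for findStereoCorrespondenceBM().
static void matchStrip(const cv::Mat& left, const cv::Mat& right, cv::Mat& disp)
{
    // ... block matching restricted to this stripe ...
}

static void computeDisparityParallel(const cv::Mat& left, const cv::Mat& right,
                                     cv::Mat& disp, int nStripes)
{
    disp.create(left.size(), CV_16S);
    cv::parallel_for_(cv::Range(0, nStripes), [&](const cv::Range& range)
    {
        for (int s = range.start; s < range.end; s++)
        {
            // Row bounds of this stripe (window-overlap handling omitted for brevity).
            int r0 = left.rows * s / nStripes;
            int r1 = left.rows * (s + 1) / nStripes;

            // rowRange() returns views that share the pixel data -- no copies.
            cv::Mat l = left.rowRange(r0, r1);
            cv::Mat r = right.rowRange(r0, r1);
            cv::Mat d = disp.rowRange(r0, r1);
            matchStrip(l, r, d);
        }
    });
}

With the lambda body replaced by the real block-matching kernel, this gives the behaviour described above: each core processes its own stripes while all of them read from the same two input images.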