在 Cudafy 中使用算法在 CPU 上工作时不起作用?
Using an algorithm in Cudafy doesn't work when it works on CPU?
我有一个可以对图像执行 2D 透视变换的工作算法。
算法如下:
private Bitmap RescaleImage(double TopLX, double TopLY, double TopRX, double TopRY, double LowLX, double LowLY, double LowRX, double LowRY, int width, int height)
{
byte[] src_bmp = bmp.ToByteArray();
byte[] dst_bmp = new byte[src_bmp.Length];
for (int x = 0; x < width; x++)
{
for (int y = 0; y < height; y++)
{
/*
* relative position
*/
double rx = (double)x / width;
double ry = (double)y / height;
/*
* get top and bottom position
*/
double topX = TopLX + rx * (TopRX - TopLX);
double topY = TopLY + rx * (TopRY - TopLY);
double bottomX = LowLX + rx * (LowRX - LowLX);
double bottomY = LowLY + rx * (LowRY - LowLY);
/*
* select center between top and bottom point
*/
double centerX = topX + ry * (bottomX - topX);
double centerY = topY + ry * (bottomY - topY);
/*
* store result
*/
// get fractions
double xf = centerX - (int)centerX;
double yf = centerY - (int)centerY;
// 4 colors - we're flipping sides so we can use the distance instead of inverting it later
byte cTL0, cTL1, cTL2, cTL3, cTR0, cTR1, cTR2, cTR3, cLL0, cLL1, cLL2, cLL3, cLR0, cLR1, cLR2, cLR3;
cTL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)];
cTL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cTR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)];
cTR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cLL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)];
cLL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
cLR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)];
cLR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
// 4 distances
double dTL = Math.Sqrt(xf * xf + yf * yf);
double dTR = Math.Sqrt((1 - xf) * (1 - xf) + yf * yf);
double dLL = Math.Sqrt(xf * xf + (1 - yf) * (1 - yf));
double dLR = Math.Sqrt((1 - xf) * (1 - xf) + (1 - yf) * (1 - yf));
// 4 parts
double factor = 1.0 / (dTL + dTR + dLL + dLR);
dTL *= factor;
dTR *= factor;
dLL *= factor;
dLR *= factor;
// accumulate parts
double r = dTL * (double)cTL0 + dTR * (double)cTR0 + dLL * (double)cLL0 + dLR * (double)cLR0;
double g = dTL * (double)cTL1 + dTR * (double)cTR1 + dLL * (double)cLL1 + dLR * (double)cLR1;
double b = dTL * (double)cTL2 + dTR * (double)cTR2 + dLL * (double)cLL2 + dLR * (double)cLR2;
byte c0 = (byte)(r + 0.5);
byte c1 = (byte)(g + 0.5);
byte c2 = (byte)(b + 0.5);
dst_bmp[(y * (width * 4)) + (x * 4)] = c0;
dst_bmp[((y * (width * 4)) + (x * 4)) + 1] = c1;
dst_bmp[((y * (width * 4)) + (x * 4)) + 2] = c2;
}
}
Bitmap bmpOut = dst_bmp.ToBitmap(width, height);
return bmpOut;
}
这工作正常,输出正是我想要的。但是我做了一个非常微妙的改变,使它在 GPU 上使用 Cudafy 运行:
public void PerformPerspectiveCorrection(PointF TL, PointF TR, PointF LL, PointF LR)
{
CheckIsSet();
_gpu.Launch(Width, Height).PerspectiveCorrectionSingleOperation(_gdata.SourceImage, _gdata.ResultImage, TL.X, TL.Y, TR.X, TR.Y, LL.X, LL.Y, LR.X, LR.Y, Width, Height);
}
[Cudafy]
private static void PerspectiveCorrectionSingleOperation(GThread thread, byte[] src_bmp, byte[] dst_bmp, double TopLX, double TopLY, double TopRX, double TopRY, double LowLX, double LowLY, double LowRX, double LowRY, int width, int height)
{
int x = thread.blockIdx.x;
int y = thread.threadIdx.x;
/*
* relative position
*/
double rx = (double)x / width;
double ry = (double)y / height;
/*
* get top and bottom position
*/
double topX = TopLX + rx * (TopRX - TopLX);
double topY = TopLY + rx * (TopRY - TopLY);
double bottomX = LowLX + rx * (LowRX - LowLX);
double bottomY = LowLY + rx * (LowRY - LowLY);
/*
* select center between top and bottom point
*/
double centerX = topX + ry * (bottomX - topX);
double centerY = topY + ry * (bottomY - topY);
/*
* store result
*/
// get fractions
double xf = centerX - (int)centerX;
double yf = centerY - (int)centerY;
// 4 colors - we're flipping sides so we can use the distance instead of inverting it later
byte cTL0, cTL1, cTL2, cTR0, cTR1, cTR2, cLL0, cLL1, cLL2, cLR0, cLR1, cLR2;
cTL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)];
cTL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cTR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)];
cTR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cLL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)];
cLL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
cLR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)];
cLR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
// 4 distances
double dTL = Math.Sqrt(xf * xf + yf * yf);
double dTR = Math.Sqrt((1 - xf) * (1 - xf) + yf * yf);
double dLL = Math.Sqrt(xf * xf + (1 - yf) * (1 - yf));
double dLR = Math.Sqrt((1 - xf) * (1 - xf) + (1 - yf) * (1 - yf));
// 4 parts
double factor = 1.0 / (dTL + dTR + dLL + dLR);
dTL *= factor;
dTR *= factor;
dLL *= factor;
dLR *= factor;
// accumulate parts
double r = dTL * (double)cTL0 + dTR * (double)cTR0 + dLL * (double)cLL0 + dLR * (double)cLR0;
double g = dTL * (double)cTL1 + dTR * (double)cTR1 + dLL * (double)cLL1 + dLR * (double)cLR1;
double b = dTL * (double)cTL2 + dTR * (double)cTR2 + dLL * (double)cLL2 + dLR * (double)cLR2;
byte c0 = (byte)(r + 0.5);
byte c1 = (byte)(g + 0.5);
byte c2 = (byte)(b + 0.5);
dst_bmp[(y * (width * 4)) + (x * 4)] = c0;
dst_bmp[((y * (width * 4)) + (x * 4)) + 1] = c1;
dst_bmp[((y * (width * 4)) + (x * 4)) + 2] = c2;
}
我返回的byte[]全为0。我也尝试直接将值 (255) 应用于 dst_bmp 中的所有字节,它似乎只对一行像素执行操作(1280 字节,因为第一行是 320px,每个字节有 4 个字节)像素).
有什么想法吗?这真令人气愤!
找到答案了,跟我的算法一点关系都没有!我正在将浮点值传递给 Cudafy 方法的双精度值。这不可能发生,因为 GPU 是 运行 CUDA "blind",因此在为值分配内存类型之前,将访问浮点数,就好像它们是双精度数而不强制转换它们。
将双精度参数更改为浮点数,效果非常好。
我有一个可以对图像执行 2D 透视变换的工作算法。
算法如下:
private Bitmap RescaleImage(double TopLX, double TopLY, double TopRX, double TopRY, double LowLX, double LowLY, double LowRX, double LowRY, int width, int height)
{
byte[] src_bmp = bmp.ToByteArray();
byte[] dst_bmp = new byte[src_bmp.Length];
for (int x = 0; x < width; x++)
{
for (int y = 0; y < height; y++)
{
/*
* relative position
*/
double rx = (double)x / width;
double ry = (double)y / height;
/*
* get top and bottom position
*/
double topX = TopLX + rx * (TopRX - TopLX);
double topY = TopLY + rx * (TopRY - TopLY);
double bottomX = LowLX + rx * (LowRX - LowLX);
double bottomY = LowLY + rx * (LowRY - LowLY);
/*
* select center between top and bottom point
*/
double centerX = topX + ry * (bottomX - topX);
double centerY = topY + ry * (bottomY - topY);
/*
* store result
*/
// get fractions
double xf = centerX - (int)centerX;
double yf = centerY - (int)centerY;
// 4 colors - we're flipping sides so we can use the distance instead of inverting it later
byte cTL0, cTL1, cTL2, cTL3, cTR0, cTR1, cTR2, cTR3, cLL0, cLL1, cLL2, cLL3, cLR0, cLR1, cLR2, cLR3;
cTL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)];
cTL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cTR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)];
cTR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cLL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)];
cLL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
cLR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)];
cLR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
// 4 distances
double dTL = Math.Sqrt(xf * xf + yf * yf);
double dTR = Math.Sqrt((1 - xf) * (1 - xf) + yf * yf);
double dLL = Math.Sqrt(xf * xf + (1 - yf) * (1 - yf));
double dLR = Math.Sqrt((1 - xf) * (1 - xf) + (1 - yf) * (1 - yf));
// 4 parts
double factor = 1.0 / (dTL + dTR + dLL + dLR);
dTL *= factor;
dTR *= factor;
dLL *= factor;
dLR *= factor;
// accumulate parts
double r = dTL * (double)cTL0 + dTR * (double)cTR0 + dLL * (double)cLL0 + dLR * (double)cLR0;
double g = dTL * (double)cTL1 + dTR * (double)cTR1 + dLL * (double)cLL1 + dLR * (double)cLR1;
double b = dTL * (double)cTL2 + dTR * (double)cTR2 + dLL * (double)cLL2 + dLR * (double)cLR2;
byte c0 = (byte)(r + 0.5);
byte c1 = (byte)(g + 0.5);
byte c2 = (byte)(b + 0.5);
dst_bmp[(y * (width * 4)) + (x * 4)] = c0;
dst_bmp[((y * (width * 4)) + (x * 4)) + 1] = c1;
dst_bmp[((y * (width * 4)) + (x * 4)) + 2] = c2;
}
}
Bitmap bmpOut = dst_bmp.ToBitmap(width, height);
return bmpOut;
}
这工作正常,输出正是我想要的。但是我做了一个非常微妙的改变,使它在 GPU 上使用 Cudafy 运行:
public void PerformPerspectiveCorrection(PointF TL, PointF TR, PointF LL, PointF LR)
{
CheckIsSet();
_gpu.Launch(Width, Height).PerspectiveCorrectionSingleOperation(_gdata.SourceImage, _gdata.ResultImage, TL.X, TL.Y, TR.X, TR.Y, LL.X, LL.Y, LR.X, LR.Y, Width, Height);
}
[Cudafy]
private static void PerspectiveCorrectionSingleOperation(GThread thread, byte[] src_bmp, byte[] dst_bmp, double TopLX, double TopLY, double TopRX, double TopRY, double LowLX, double LowLY, double LowRX, double LowRY, int width, int height)
{
int x = thread.blockIdx.x;
int y = thread.threadIdx.x;
/*
* relative position
*/
double rx = (double)x / width;
double ry = (double)y / height;
/*
* get top and bottom position
*/
double topX = TopLX + rx * (TopRX - TopLX);
double topY = TopLY + rx * (TopRY - TopLY);
double bottomX = LowLX + rx * (LowRX - LowLX);
double bottomY = LowLY + rx * (LowRY - LowLY);
/*
* select center between top and bottom point
*/
double centerX = topX + ry * (bottomX - topX);
double centerY = topY + ry * (bottomY - topY);
/*
* store result
*/
// get fractions
double xf = centerX - (int)centerX;
double yf = centerY - (int)centerY;
// 4 colors - we're flipping sides so we can use the distance instead of inverting it later
byte cTL0, cTL1, cTL2, cTR0, cTR1, cTR2, cLL0, cLL1, cLL2, cLR0, cLR1, cLR2;
cTL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)];
cTL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cTR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)];
cTR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 1];
cTR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 1) * 4)) + 2];
cLL0 = src_bmp[(((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)];
cLL1 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLL2 = src_bmp[((((int)centerY + 1) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
cLR0 = src_bmp[(((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)];
cLR1 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 1];
cLR2 = src_bmp[((((int)centerY + 0) * (width * 4)) + (((int)centerX + 0) * 4)) + 2];
// 4 distances
double dTL = Math.Sqrt(xf * xf + yf * yf);
double dTR = Math.Sqrt((1 - xf) * (1 - xf) + yf * yf);
double dLL = Math.Sqrt(xf * xf + (1 - yf) * (1 - yf));
double dLR = Math.Sqrt((1 - xf) * (1 - xf) + (1 - yf) * (1 - yf));
// 4 parts
double factor = 1.0 / (dTL + dTR + dLL + dLR);
dTL *= factor;
dTR *= factor;
dLL *= factor;
dLR *= factor;
// accumulate parts
double r = dTL * (double)cTL0 + dTR * (double)cTR0 + dLL * (double)cLL0 + dLR * (double)cLR0;
double g = dTL * (double)cTL1 + dTR * (double)cTR1 + dLL * (double)cLL1 + dLR * (double)cLR1;
double b = dTL * (double)cTL2 + dTR * (double)cTR2 + dLL * (double)cLL2 + dLR * (double)cLR2;
byte c0 = (byte)(r + 0.5);
byte c1 = (byte)(g + 0.5);
byte c2 = (byte)(b + 0.5);
dst_bmp[(y * (width * 4)) + (x * 4)] = c0;
dst_bmp[((y * (width * 4)) + (x * 4)) + 1] = c1;
dst_bmp[((y * (width * 4)) + (x * 4)) + 2] = c2;
}
我返回的byte[]全为0。我也尝试直接将值 (255) 应用于 dst_bmp 中的所有字节,它似乎只对一行像素执行操作(1280 字节,因为第一行是 320px,每个字节有 4 个字节)像素).
有什么想法吗?这真令人气愤!
找到答案了,跟我的算法一点关系都没有!我正在将浮点值传递给 Cudafy 方法的双精度值。这不可能发生,因为 GPU 是 运行 CUDA "blind",因此在为值分配内存类型之前,将访问浮点数,就好像它们是双精度数而不强制转换它们。
将双精度参数更改为浮点数,效果非常好。