Does a fast convolution algorithm give different outputs compared to a regular convolution algorithm?
I am trying to speed up the forward pass in a convolution layer by using a technique that turns my image data into column vectors and converts the convolution into a matrix multiplication problem.
[Idea from https://sahnimanas.github.io/post/anatomy-of-a-high-performance-convolution/]
I first implemented the official Caffe im2col function from its GitHub repo:
void im2col_cpu(float* data_im, const int channels,
                const int height, const int width, const int kernel_h, const int kernel_w,
                const int pad, const int stride, const int dilation, float* data_col) {
    // Effective kernel extent after dilation.
    int dil_kernel_h = (kernel_h - 1) * dilation + 1;
    int dil_kernel_w = (kernel_w - 1) * dilation + 1;
    // Output spatial size.
    int height_col = (height + 2 * pad - dil_kernel_h) / stride + 1;
    int width_col = (width + 2 * pad - dil_kernel_w) / stride + 1;
    // One row of the column matrix per (channel, kernel_row, kernel_col) combination.
    int channels_col = channels * kernel_h * kernel_w;
    #pragma omp parallel for
    for (int c = 0; c < channels_col; ++c) {
        int w_offset = c % kernel_w;
        int h_offset = (c / kernel_w) % kernel_h;
        int c_im = c / kernel_h / kernel_w;
        const int hc0 = h_offset * dilation - pad;
        const int wc0 = w_offset * dilation - pad;
        for (int h = 0; h < height_col; ++h) {
            int h_pad = h * stride + hc0;
            const int row_offset = (c * height_col + h) * width_col;
            const int srow_offset = (c_im * height + h_pad) * width;
            for (int w = 0; w < width_col; ++w) {
                int w_pad = w * stride + wc0;
                // Copy the pixel when it falls inside the image.
                if ((h_pad < height) && (w_pad < width))
                    *(data_col + row_offset + w) = *(data_im + srow_offset + w_pad);
            }
        }
    }
}
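As a sanity check on the layout (a minimal sketch of my own with made-up sizes, not part of the actual network): for a single-channel 4x4 input with a 3x3 kernel, stride 1, pad 0 and dilation 1, im2col_cpu should produce a (channels*kernel_h*kernel_w) x (height_col*width_col) = 9 x 4 matrix, one column per output position.

#include <cstdio>
#include <cstdlib>

// Minimal sketch: dump the column matrix built by the im2col_cpu above for a
// hypothetical 1-channel 4x4 input, 3x3 kernel, stride 1, pad 0, dilation 1.
// Expected shape: (channels*kernel_h*kernel_w) x (height_col*width_col) = 9 x 4.
int main() {
    const int channels = 1, height = 4, width = 4, kh = 3, kw = 3;
    const int height_col = (height - kh) / 1 + 1;   // 2
    const int width_col  = (width  - kw) / 1 + 1;   // 2
    float im[channels * height * width];
    for (int i = 0; i < channels * height * width; ++i) im[i] = (float)i;   // 0..15
    float *col = (float *)calloc(channels * kh * kw * height_col * width_col, sizeof(float));
    im2col_cpu(im, channels, height, width, kh, kw, /*pad=*/0, /*stride=*/1, /*dilation=*/1, col);
    for (int r = 0; r < channels * kh * kw; ++r) {          // one row per (c, kh, kw)
        for (int c = 0; c < height_col * width_col; ++c)    // one column per output pixel
            printf("%5.1f ", col[r * height_col * width_col + c]);
        printf("\n");
    }
    free(col);
    return 0;
}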
Then I multiply its output with my custom matrix multiplication code:
void mat_mul(float *A, float *B, float *C, int M, int N, int K, bool has_bias) {
    int i, j, k;
    if (!has_bias) init(C, M, N); // init() converts C into a 0 matrix
    #pragma omp parallel for private(i, j, k)
    for (i = 0; i < M; ++i) {
        for (k = 0; k < K; ++k) {
            float *ptr_c = &C[i * N];       // row i of C
            float *ptr_b = &B[k * N];       // row k of B
            float *ptr_a = &A[i * K + k];   // single element A[i][k]
            for (j = 0; j < N; ++j) {
                *(ptr_c + j) += *ptr_a * *(ptr_b + j);
            }
        }
    }
}
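For completeness, the init() referenced in the comment above is just a zero fill; a minimal sketch of what it looks like (assuming that is all it does):

// Sketch of init(): zero out the M x N output matrix before accumulation.
void init(float *C, int M, int N) {
    #pragma omp parallel for
    for (int i = 0; i < M * N; ++i)
        C[i] = 0.0f;
}

(The i-k-j loop order keeps the innermost accesses to B and C contiguous, so the j loop is stride-1.)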
So my convolution code is as follows:
void Conv2d(Tensor input, Tensor weight, Tensor bias, Tensor output, int stride, int pad, int dilation, bool has_bias) {
    int C = input.shape[0], H = input.shape[1], W = input.shape[2];
    int K = weight.shape[0], R = weight.shape[2], S = weight.shape[3];
    int OH = output.shape[1], OW = output.shape[2];
    CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1, "Output height mismatch");
    CHECK_ERROR(OW == (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1, "Output width mismatch");
    CHECK_ERROR(weight.shape[1] == C && (!has_bias || bias.shape[0] == K) && output.shape[0] == K, "Channel size mismatch");
    float* col = (float *)malloc(sizeof(float) * (C * R * S * H * W));
    im2col_cpu(input.buf, C, H, W, R, S, pad, stride, dilation, col);
    mat_mul(weight.buf, col, output.buf, K, OH * OW, R * S * C, has_bias);
    free(col);
}
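To spell out the shapes in that mat_mul call: weight.buf is read as a K x (C*R*S) matrix, col as a (C*R*S) x (OH*OW) matrix, and output.buf receives the K x (OH*OW) product. A small helper I could use to size the column buffer exactly (the helper name is mine, not part of the original code):

#include <cstddef>

// Sketch: exact number of floats the column buffer holds -
// one value per (c, r, s) row and per (oh, ow) column.
static inline size_t im2col_buffer_elems(int C, int R, int S, int OH, int OW) {
    return (size_t)C * R * S * OH * OW;
}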
However, it turns out that my im2col + matrix multiplication code behaves differently from the regular convolution that uses the standard, very slow algorithm: its output does not match the output of the following:
void Conv2d(Tensor input, Tensor weight, Tensor bias, Tensor output, int stride, int pad, int dilation, bool has_bias) {
    int C = input.shape[0], H = input.shape[1], W = input.shape[2];
    int K = weight.shape[0], R = weight.shape[2], S = weight.shape[3];
    int OH = output.shape[1], OW = output.shape[2];
    CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1, "Output height mismatch");
    CHECK_ERROR(OW == (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1, "Output width mismatch");
    CHECK_ERROR(weight.shape[1] == C && (!has_bias || bias.shape[0] == K) && output.shape[0] == K, "Channel size mismatch");
    for (int k = 0; k < K; ++k) {
        for (int oh = 0; oh < OH; ++oh) {
            for (int ow = 0; ow < OW; ++ow) {
                float o = has_bias ? bias.buf[k] : 0;
                for (int c = 0; c < C; ++c) {
                    for (int r = 0; r < R; ++r) {
                        for (int s = 0; s < S; ++s) {
                            int h = oh * stride - pad + r * dilation;
                            int w = ow * stride - pad + s * dilation;
                            if (h < 0 || h >= H || w < 0 || w >= W) continue;
                            float i = input.buf[c * H * W + h * W + w];
                            float f = weight.buf[k * C * R * S + c * R * S + r * S + s];
                            o += i * f;
                        }
                    }
                }
                output.buf[k * OH * OW + oh * OW + ow] = o;
            }
        }
    }
}
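For reference, the reason both versions should agree: for a fixed output pixel (oh, ow), the triple loop above reads exactly the values that im2col stacked into column oh*OW + ow, in the same (c, r, s) order that weight row k uses. A small helper sketch (my own naming, not from the original code) that recomputes one output value the GEMM way from the col buffer:

// Sketch: recompute output[k][oh][ow] as the dot product of weight row k
// with column (oh*OW + ow) of the im2col matrix.
float gemm_style_output(const float *weight, const float *col,
                        int k, int oh, int ow,
                        int C, int R, int S, int OH, int OW) {
    float o = 0.0f;
    int col_idx = oh * OW + ow;                  // column of the col matrix
    for (int row = 0; row < C * R * S; ++row)    // row = c*R*S + r*S + s
        o += weight[k * C * R * S + row] * col[row * OH * OW + col_idx];
    return o;
}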
Any ideas on why my matrix multiplication code does not work?
Oh, I found where the problem was. In my original code I set the bias as
float o = has_bias ? bias.buf[k] : 0;
where k denotes the k-th of the K filters. However, in my mat_mul code I naively assumed that *(ptr_c+j) += *ptr_a * *(ptr_b + j); would add the right amount of bias to the final output.
I changed my code to this instead:
void mat_mul(float *A, float *B, float *C, Tensor bias, int M, int N, int K, bool has_bias) {
    // Pre-fill each output row with its filter's bias (or zero).
    #pragma omp parallel for
    for (int i = 0; i < M; ++i) {
        int h_offset = i * N;
        for (int j = 0; j < N; ++j) {
            C[h_offset + j] = has_bias ? bias.buf[i] : 0;
        }
    }
    int i, j, k;
    #pragma omp parallel for private(i, j, k)
    for (i = 0; i < M; ++i) {
        for (k = 0; k < K; ++k) {
            int ptr_c = i * N;      // start of row i of C
            int ptr_b = k * N;      // start of row k of B
            int ptr_a = i * K + k;  // index of A[i][k]
            for (j = 0; j < N; ++j) {
                C[ptr_c + j] += A[ptr_a] * B[ptr_b + j];
            }
        }
    }
}
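With the new mat_mul signature, the call inside Conv2d changes accordingly; assuming the rest of the function stays as above, the tail becomes:

// Tail of Conv2d with bias passed through to mat_mul (sketch):
float* col = (float *)malloc(sizeof(float) * (C * R * S * H * W));
im2col_cpu(input.buf, C, H, W, R, S, pad, stride, dilation, col);
mat_mul(weight.buf, col, output.buf, bias, K, OH * OW, R * S * C, has_bias);
free(col);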
This lets me add the same bias to the output as in my original code.
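To double-check that the two paths now agree, I can run both on the same random tensors and compare. The sketch below renames the two versions Conv2d_direct and Conv2d_im2col (so they can coexist in one file) and uses a minimal stand-in Tensor with only the fields the code above touches; both names and the stand-in struct are hypothetical, since the real Tensor type is not shown in this post.

#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for the Tensor type used above (only shape[] and buf are needed).
struct Tensor {
    int shape[4];
    float *buf;
};

// Assume Conv2d_direct is the slow reference version and Conv2d_im2col is the
// im2col + mat_mul version, renamed only so both can be linked into one test.
void Conv2d_direct(Tensor, Tensor, Tensor, Tensor, int, int, int, bool);
void Conv2d_im2col(Tensor, Tensor, Tensor, Tensor, int, int, int, bool);

int main() {
    const int C = 3, H = 8, W = 8, K = 4, R = 3, S = 3;
    const int stride = 1, pad = 0, dilation = 1;
    const int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
    const int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

    Tensor input  = {{C, H, W, 0},   (float *)malloc(sizeof(float) * C * H * W)};
    Tensor weight = {{K, C, R, S},   (float *)malloc(sizeof(float) * K * C * R * S)};
    Tensor bias   = {{K, 0, 0, 0},   (float *)malloc(sizeof(float) * K)};
    Tensor out_a  = {{K, OH, OW, 0}, (float *)calloc(K * OH * OW, sizeof(float))};
    Tensor out_b  = {{K, OH, OW, 0}, (float *)calloc(K * OH * OW, sizeof(float))};

    for (int i = 0; i < C * H * W; ++i)     input.buf[i]  = (float)rand() / RAND_MAX;
    for (int i = 0; i < K * C * R * S; ++i) weight.buf[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < K; ++i)             bias.buf[i]   = (float)rand() / RAND_MAX;

    Conv2d_direct(input, weight, bias, out_a, stride, pad, dilation, true);
    Conv2d_im2col(input, weight, bias, out_b, stride, pad, dilation, true);

    float max_diff = 0.0f;
    for (int i = 0; i < K * OH * OW; ++i) {
        float d = out_a.buf[i] - out_b.buf[i];
        if (d < 0) d = -d;
        if (d > max_diff) max_diff = d;
    }
    printf("max abs diff = %g\n", max_diff);   // expect 0 or something at the level of float rounding
    return 0;
}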