使用英特尔 MKL 计算 `trans(a)*inv(b)*a` 的正确方法(follow-up 问题)
Proper way to calculate `trans(a)*inv(b)*a` with Intel MKL (follow-up question)
这是 的 follow-up 问题(我对其进行了重大编辑,但有人告诉我这应该是另一个问题 - 我想不出另一个标题)。
我正在使用来计算
yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b
其中a
和b
是m-by-n实矩阵,zt
和zl
是m-by-m复矩阵。得到的复数矩阵 yn
是 n-by-n.
这是我的做法:
zt <- inv(zt)
zl <- inv(zl)
c <- zt*a
yn <- trans(a)*c
c <- zl*b
yn <- trans(b)*c + yn
C代码:
#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
int i, j;
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
fprintf(fp, "(%.6f%+.6fj)", creal(a[i*lda+j]), cimag(a[i*lda+j]) );
if (j < n - 1) fprintf(fp, ",");
}
fprintf(fp, "\n");
}
return 0;
}
int calc_yn(
_Complex double* yn, double* a, double *b, _Complex double* zl,
_Complex double* zt, int m, int n)
{
lapack_int* ipiv = (MKL_INT*) malloc(sizeof(lapack_int)*m);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv);
free(ipiv);
const double alpha = 1.0;
const double beta = 0.0;
lapack_complex_double* c = (lapack_complex_double*) malloc(
sizeof(lapack_complex_double)*(m*n));
// c <- zt*a
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zt, m, a, n,
&beta, c, n);
FILE* fp = fopen("c1.csv", "w");
print_zmatrix_file(m, n, c, n, fp);
fclose(fp);
// yn <- aT*c
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, a, n, c, n,
&beta, yn, n);
// c <- zl*b
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zl, m, b, n,
&beta, c, n);
FILE* fp2 = fopen("c2.csv", "w");
print_zmatrix_file(m, n, c, n, fp2);
fclose(fp2);
// yn <- bT*c + yn
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, b, n, c, n,
&alpha, yn, n);
free(c);
return 0;
}
int main()
{
int m = 2;
int n = 3;
_Complex double* yn = (_Complex double*) malloc(sizeof(_Complex double)*(n*n));
double a[] = {
0.5, 0.0, 0.5,
0.5, 0.5, 0.0
};
double b[] = {
1.0, 0.0, -1.0,
1.0, -1.0, 0.0
};
_Complex double zt[] = {
(0.004 + 0.09*I), (-0.004 - 0.12*I),
(-0.004 - 0.12*I), (0.005 + 0.11*I)
};
_Complex double zl[] = {
(0.1 + 2.13*I), (-124.004 - 800.12*I),
(-124.004 - 800.12*I), (0.4 + 4.08*I)
};
calc_yn(yn, a, b, zl, zt, m, n);
FILE* fp = fopen("yn.csv", "w");
print_zmatrix_file(n, n, yn, n, fp);
fclose(fp);
free(yn);
return 0;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl
在 malloc
到 yn
中有一个错误(它使用 sizeof(_Complex double*)
而不是 sizeof(_Complex double)
)。更正该错误后,代码将成功编译并运行。在 运行 之后,我将结果与 SciPy 得到的结果进行了比较。他们不同意。
import numpy
from scipy import linalg
a = numpy.array([[0.5, 0.0, 0.5],
[0.5, 0.5, 0.0]])
b = numpy.array([[1.0, 0.0, -1.0],
[1.0, -1.0, 0.0]])
zt = numpy.array([[0.004 + 0.09j, -0.004 - 0.12j],
[-0.004 - 0.12j, 0.005 + 0.11j]])
zl = numpy.array([[0.1 + 2.13j, 124.004 - 800.12j],
[124.004 - 800.12j, 0.4 + 4.08j]])
c1 = numpy.matmul(linalg.inv(zt), a)
m1 = numpy.matmul(a.T, c1)
c2 = numpy.matmul(linalg.inv(zl), b)
m2 = numpy.matmul(b.T, c2)
yn = m1 + m2
yn_file = numpy.genfromtxt('yn.csv', delimiter=',', dtype=numpy.complex128)
c1_file = numpy.genfromtxt('c1.csv', delimiter=',', dtype=numpy.complex128)
c2_file = numpy.genfromtxt('c2.csv', delimiter=',', dtype=numpy.complex128)
numpy.max(numpy.abs(yn)) #0.004958820819049211
numpy.max(numpy.abs(yn_file)) #60.4590237745794
numpy.max(numpy.abs(c1)) #25.549314567403204
numpy.max(numpy.abs(c1_file)) #41.278805716697306
numpy.max(numpy.abs(c2)) #0.0012411403762584482
numpy.max(numpy.abs(c2_file)) #0.03292682468747935
我的 C 代码或 Python 代码有问题。为什么我得到不同的结果?
编辑:根据 进一步测试。他注意到 copy-paste 错误,其中 -124.004 - 800.12i
在 Python 代码中显示为 +124.004 - 800.12i
。更正不会改变结果
为了更容易测试,我使用了矩阵:
a = numpy.array([[1.0, 0.0],
[0.0, 1.0]])
b = numpy.array([[0.0, -1.0],
[-1.0, 0.0]])
zt = a
zl = b
结果是
yn = [[1.0, -1.0]
[-1.0, 1.0]]
Python 代码给出了那个结果,但是 C 代码给出了
yn = [[0.0 + 2.0j, 1.0 + 2.0j]
[-1.0 + 2.0j, 0.0 + 0.0j]]
这让我断定C代码是错误的,但我不知道在哪里。
来自您问题中发布的代码:
Python:
zl = numpy.array([[0.1 + 2.13j, 124.004 - 800.12j], ## <==HERE
[124.004 - 800.12j, 0.4 + 4.08j]]) ## <==HERE ALSO
C:
_Complex double zl[] = {
(0.1 + 2.13*I), (-124.004 - 800.12*I), // <==HERE
(-124.004 - 800.12*I), (0.4 + 4.08*I) // <== HERE ALSO
我注意到一个是 -124.004 - 800.12i,另一个是 124.004 - 800.12i。我不确定您要使用哪一个,但将它们都设置为同一个,看看结果是否仍然不同。如果它们仍然不同,请将它们都设置为您知道结果的单元可测试值(a=[1 0 0; 0 1 0; 0 0 1] 或易于计算的值)。这将告诉您哪一个(或两个)is/are 不正确。
该行为(很可能未定义)是由将 double
数组传递给 zgemm
而不是 _Complex double
引起的。当我将矩阵 a
和 b
更改为复数时,我得到了预期的结果。
这里是用于测试的固定C代码:
#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
int i, j;
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
fprintf(fp, "(%.6f%+.6fj)", creal(a[i*lda+j]), cimag(a[i*lda+j]) );
if (j < n - 1) fprintf(fp, ",");
}
fprintf(fp, "\n");
}
return 0;
}
int calc_yn(
_Complex double* yn, _Complex double* a, _Complex double *b,
_Complex double* zl, _Complex double* zt, int m, int n)
{
lapack_int* ipiv = (MKL_INT*) malloc(sizeof(lapack_int)*m);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv);
free(ipiv);
const double alpha = 1.0;
const double beta = 0.0;
lapack_complex_double* c = (lapack_complex_double*) malloc(
sizeof(lapack_complex_double)*(m*n));
// c <- zt*a
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zt, m, a, n,
&beta, c, n);
// yn <- aT*c
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, a, n, c, n,
&beta, yn, n);
// c <- zl*b
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zl, m, b, n,
&beta, c, n);
// yn <- bT*c + yn
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, b, n, c, n,
&alpha, yn, n);
free(c);
return 0;
}
int main()
{
int m = 2;
int n = 2;
_Complex double a[] = {
1.0, 0.0,
0.0, 1.0
};
_Complex double b[] = {
0.0, -1.0,
-1.0, 0.0
};
_Complex double zt[] = {
1.0, 0.0,
0.0, 1.0
};
_Complex double zl[] = {
0.0, -1.0,
-1.0, 0.0
};
_Complex double* yn = (_Complex double*) malloc(sizeof(_Complex double)*(n*n));
calc_yn(yn, a, b, zl, zt, m, n);
FILE* fp = fopen("yn.csv", "w");
print_zmatrix_file(n, n, yn, n, fp);
fclose(fp);
free(yn);
return 0;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl
这是
我正在使用
yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b
其中a
和b
是m-by-n实矩阵,zt
和zl
是m-by-m复矩阵。得到的复数矩阵 yn
是 n-by-n.
这是我的做法:
zt <- inv(zt)
zl <- inv(zl)
c <- zt*a
yn <- trans(a)*c
c <- zl*b
yn <- trans(b)*c + yn
C代码:
#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
int i, j;
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
fprintf(fp, "(%.6f%+.6fj)", creal(a[i*lda+j]), cimag(a[i*lda+j]) );
if (j < n - 1) fprintf(fp, ",");
}
fprintf(fp, "\n");
}
return 0;
}
int calc_yn(
_Complex double* yn, double* a, double *b, _Complex double* zl,
_Complex double* zt, int m, int n)
{
lapack_int* ipiv = (MKL_INT*) malloc(sizeof(lapack_int)*m);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv);
free(ipiv);
const double alpha = 1.0;
const double beta = 0.0;
lapack_complex_double* c = (lapack_complex_double*) malloc(
sizeof(lapack_complex_double)*(m*n));
// c <- zt*a
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zt, m, a, n,
&beta, c, n);
FILE* fp = fopen("c1.csv", "w");
print_zmatrix_file(m, n, c, n, fp);
fclose(fp);
// yn <- aT*c
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, a, n, c, n,
&beta, yn, n);
// c <- zl*b
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zl, m, b, n,
&beta, c, n);
FILE* fp2 = fopen("c2.csv", "w");
print_zmatrix_file(m, n, c, n, fp2);
fclose(fp2);
// yn <- bT*c + yn
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, b, n, c, n,
&alpha, yn, n);
free(c);
return 0;
}
int main()
{
int m = 2;
int n = 3;
_Complex double* yn = (_Complex double*) malloc(sizeof(_Complex double)*(n*n));
double a[] = {
0.5, 0.0, 0.5,
0.5, 0.5, 0.0
};
double b[] = {
1.0, 0.0, -1.0,
1.0, -1.0, 0.0
};
_Complex double zt[] = {
(0.004 + 0.09*I), (-0.004 - 0.12*I),
(-0.004 - 0.12*I), (0.005 + 0.11*I)
};
_Complex double zl[] = {
(0.1 + 2.13*I), (-124.004 - 800.12*I),
(-124.004 - 800.12*I), (0.4 + 4.08*I)
};
calc_yn(yn, a, b, zl, zt, m, n);
FILE* fp = fopen("yn.csv", "w");
print_zmatrix_file(n, n, yn, n, fp);
fclose(fp);
free(yn);
return 0;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl
malloc
到 yn
中有一个错误(它使用 sizeof(_Complex double*)
而不是 sizeof(_Complex double)
)。更正该错误后,代码将成功编译并运行。在 运行 之后,我将结果与 SciPy 得到的结果进行了比较。他们不同意。
import numpy
from scipy import linalg
a = numpy.array([[0.5, 0.0, 0.5],
[0.5, 0.5, 0.0]])
b = numpy.array([[1.0, 0.0, -1.0],
[1.0, -1.0, 0.0]])
zt = numpy.array([[0.004 + 0.09j, -0.004 - 0.12j],
[-0.004 - 0.12j, 0.005 + 0.11j]])
zl = numpy.array([[0.1 + 2.13j, 124.004 - 800.12j],
[124.004 - 800.12j, 0.4 + 4.08j]])
c1 = numpy.matmul(linalg.inv(zt), a)
m1 = numpy.matmul(a.T, c1)
c2 = numpy.matmul(linalg.inv(zl), b)
m2 = numpy.matmul(b.T, c2)
yn = m1 + m2
yn_file = numpy.genfromtxt('yn.csv', delimiter=',', dtype=numpy.complex128)
c1_file = numpy.genfromtxt('c1.csv', delimiter=',', dtype=numpy.complex128)
c2_file = numpy.genfromtxt('c2.csv', delimiter=',', dtype=numpy.complex128)
numpy.max(numpy.abs(yn)) #0.004958820819049211
numpy.max(numpy.abs(yn_file)) #60.4590237745794
numpy.max(numpy.abs(c1)) #25.549314567403204
numpy.max(numpy.abs(c1_file)) #41.278805716697306
numpy.max(numpy.abs(c2)) #0.0012411403762584482
numpy.max(numpy.abs(c2_file)) #0.03292682468747935
我的 C 代码或 Python 代码有问题。为什么我得到不同的结果?
编辑:根据 -124.004 - 800.12i
在 Python 代码中显示为 +124.004 - 800.12i
。更正不会改变结果
为了更容易测试,我使用了矩阵:
a = numpy.array([[1.0, 0.0],
[0.0, 1.0]])
b = numpy.array([[0.0, -1.0],
[-1.0, 0.0]])
zt = a
zl = b
结果是
yn = [[1.0, -1.0]
[-1.0, 1.0]]
Python 代码给出了那个结果,但是 C 代码给出了
yn = [[0.0 + 2.0j, 1.0 + 2.0j]
[-1.0 + 2.0j, 0.0 + 0.0j]]
这让我断定C代码是错误的,但我不知道在哪里。
来自您问题中发布的代码:
Python:
zl = numpy.array([[0.1 + 2.13j, 124.004 - 800.12j], ## <==HERE
[124.004 - 800.12j, 0.4 + 4.08j]]) ## <==HERE ALSO
C:
_Complex double zl[] = {
(0.1 + 2.13*I), (-124.004 - 800.12*I), // <==HERE
(-124.004 - 800.12*I), (0.4 + 4.08*I) // <== HERE ALSO
我注意到一个是 -124.004 - 800.12i,另一个是 124.004 - 800.12i。我不确定您要使用哪一个,但将它们都设置为同一个,看看结果是否仍然不同。如果它们仍然不同,请将它们都设置为您知道结果的单元可测试值(a=[1 0 0; 0 1 0; 0 0 1] 或易于计算的值)。这将告诉您哪一个(或两个)is/are 不正确。
该行为(很可能未定义)是由将 double
数组传递给 zgemm
而不是 _Complex double
引起的。当我将矩阵 a
和 b
更改为复数时,我得到了预期的结果。
这里是用于测试的固定C代码:
#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
int i, j;
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
fprintf(fp, "(%.6f%+.6fj)", creal(a[i*lda+j]), cimag(a[i*lda+j]) );
if (j < n - 1) fprintf(fp, ",");
}
fprintf(fp, "\n");
}
return 0;
}
int calc_yn(
_Complex double* yn, _Complex double* a, _Complex double *b,
_Complex double* zl, _Complex double* zt, int m, int n)
{
lapack_int* ipiv = (MKL_INT*) malloc(sizeof(lapack_int)*m);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv);
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv);
free(ipiv);
const double alpha = 1.0;
const double beta = 0.0;
lapack_complex_double* c = (lapack_complex_double*) malloc(
sizeof(lapack_complex_double)*(m*n));
// c <- zt*a
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zt, m, a, n,
&beta, c, n);
// yn <- aT*c
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, a, n, c, n,
&beta, yn, n);
// c <- zl*b
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zl, m, b, n,
&beta, c, n);
// yn <- bT*c + yn
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, b, n, c, n,
&alpha, yn, n);
free(c);
return 0;
}
int main()
{
int m = 2;
int n = 2;
_Complex double a[] = {
1.0, 0.0,
0.0, 1.0
};
_Complex double b[] = {
0.0, -1.0,
-1.0, 0.0
};
_Complex double zt[] = {
1.0, 0.0,
0.0, 1.0
};
_Complex double zl[] = {
0.0, -1.0,
-1.0, 0.0
};
_Complex double* yn = (_Complex double*) malloc(sizeof(_Complex double)*(n*n));
calc_yn(yn, a, b, zl, zt, m, n);
FILE* fp = fopen("yn.csv", "w");
print_zmatrix_file(n, n, yn, n, fp);
fclose(fp);
free(yn);
return 0;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl