使用 CuSolver 对 Hermitian 矩阵进行特征分解与 matlab 的结果不匹配

Eigen decomposition of Hermitian Matrix using CuSolver does not match the result with matlab

我正在按照此处的特征分解示例进行操作, https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSOLVER/syevd/cusolver_syevd_example.cu

我需要为 Hermatian 复数矩阵执行此操作。问题是特征向量与 Matlab 结果完全不匹配。

有人知道为什么会发生这种不匹配吗?

我也尝试过 cusolverdn svd 方法来获取给出另一个结果的特征值和向量。

我的代码在这里是为了方便,


#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "cusolver_utils.h"

int N = 16;
void BuildMatrix(cuComplex* input);

void main()
{
    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;
    printf("*******************\n");

    cuComplex* h_idata = (cuComplex*)malloc(sizeof(cuComplex) * N);
    cuComplex* h_eigenVector = (cuComplex*)malloc(sizeof(cuComplex) * N); // eigen vector
    float* h_eigenValue = (float*)malloc(sizeof(float) * 4);    // eigen Value
    BuildMatrix(h_idata);
    int count = 0;
    for (int i = 0; i < N / 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            printf("%f + %f\t", h_idata[count].x, h_idata[count].y);
            count++;
        }
        printf("\n");
    }
    printf("\n*****************\n");

    /* step 1: create cusolver handle, bind a stream */
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));

    // step 2: reserve memory in cuda and copy input data from host to device
    cuComplex* d_idata;
    float* d_eigenValue = nullptr;
    int* d_info = nullptr;

    CUDA_CHECK(cudaMalloc((void**)&d_idata, N * sizeof(cuComplex)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_eigenValue), N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));

    CUDA_CHECK(cudaMemcpyAsync(d_idata, h_idata, N * sizeof(cuComplex), cudaMemcpyHostToDevice, stream));

    // step 3: query working space of syevd
    cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors.
    cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;

    int lwork = 0;            /* size of workspace */
    cuComplex* d_work = nullptr; /* device workspace*/
    const int m = 4;
    const int lda = m;

    cusolverDnCheevd_bufferSize(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, &lwork);

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));

    // step 4: compute spectrum
    cusolverDnCheevd(cusolverH, jobz, uplo, m, d_idata, lda, d_eigenValue, d_work, lwork, d_info);

    CUDA_CHECK(
        cudaMemcpyAsync(h_eigenVector, d_idata, N * sizeof(cuComplex), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(
        cudaMemcpyAsync(h_eigenValue, d_eigenValue, 4 * sizeof(double), cudaMemcpyDeviceToHost, stream));

    int info = 0;
    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    std::printf("after syevd: info = %d\n", info);
    if (0 > info)
    {
        std::printf("%d-th parameter is wrong \n", -info);
        exit(1);
    }

    count = 0;
    for (int i = 0; i < N / 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            printf("%f + %f\t", h_eigenVector[count].x, h_eigenVector[count].y);
            count++;
        }
        printf("\n");
    }
    printf("\n");
    for (int i = 0; i < N / 4; i++)
    {
        std::cout << h_eigenValue[i] << std::endl;
    }
    printf("\n*****************\n");

    /* free resources */
    CUDA_CHECK(cudaFree(d_idata));
    CUDA_CHECK(cudaFree(d_eigenValue));
    CUDA_CHECK(cudaFree(d_info));
    CUDA_CHECK(cudaFree(d_work));

    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaDeviceReset());
}

//0.5560 + 0.0000i - 0.4864 + 0.0548i   0.8592 + 0.2101i - 1.5374 - 0.2069i
//- 0.4864 - 0.0548i   0.4317 + 0.0000i - 0.7318 - 0.2698i   1.3255 + 0.3344i
//0.8592 - 0.2101i - 0.7318 + 0.2698i   1.4099 + 0.0000i - 2.4578 + 0.2609i
//- 1.5374 + 0.2069i   1.3255 - 0.3344i - 2.4578 - 0.2609i   4.3333 + 0.0000i
void BuildMatrix(cuComplex* input)
{
    std::vector<float> realVector = { 0.5560, -0.4864, 0.8592, -1.5374, -0.4864, 0.4317, -0.7318, 1.3255,
                                    0.8592, -0.7318, 1.4099, -2.4578, -1.5374, 1.3255, -2.4578, 4.3333 };
    std::vector<float> imagVector = { 0, -0.0548, -0.2101, 0.2069, 0.0548, 0.0000, 0.2698, -0.3344,
                                     0.2101, -0.2698, 0, -0.2609, -0.2069, 0.3344, 0.2609, 0 };

    for (int i = 0; i < N; i++)
    {
        input[i].x = realVector.at(i) * std::pow(10, 11);
        input[i].y = imagVector.at(i) * std::pow(10, 11);
    }
}

我在他们的 git ( https://github.com/NVIDIA/CUDALibrarySamples/issues/58) 中提出了这个问题,但不幸的是没有人回答。

如果有人能帮我解决这个问题,那将非常有帮助。

明确答案请关注post, https://forums.developer.nvidia.com/t/eigen-decomposition-of-hermitian-matrix-using-cusolver-does-not-match-the-result-with-matlab/204157

理论告诉我们,A*V-lamda*V=0应该满足,但它可能不是完美的零。我的想法是它会非常非常接近 zero or e-14 这样的东西。如果等式给出的值接近于零,那么它是可以接受的。

有不同的求解特征分解的算法,比如雅可比算法,Cholesky 分解...我在 post 中提供的程序使用了基于 [=13= 的函数 cusolverDnCheevd ]. LAPACK 文档告诉它使用 divide and conquer algorithm 来求解 Hermitian 矩阵。这里是link,http://www.netlib.org/lapack/explore-html/d9/de3/group__complex_h_eeigen_ga6084b0819f9642f0db26257e8a3ebd42.html#ga6084b0819f9642f0db26257e8a3ebd42