使用 C++ CLI 时英特尔 OneAPI 视频解码内存泄漏

Intel OneAPI Video decoding memory leak when using C++ CLI

我正在尝试使用 Intel oneAPI/oneVPL 来解码我在 C# 中从 RTSP 摄像头接收到的视频流。但是当我运行代码时,出现了严重的内存泄漏:每次运行大约泄漏 1-200MB,而代码大约每秒运行一次。我从摄像头收集到一个 GoP(我知道其中第一个数据是关键帧)之后,把它作为字节数组传给我的 C++/CLI 和 C++ 代码。在这里,我希望它解码所有帧并返回解码后的图像。它接收 30 帧并返回 16 张解码图像,但存在内存泄漏。

我已经尝试使用 Visual Studio 内存分析器,但只能从中得知问题出在非托管内存上。我还尝试在 videoHandler.cpp 中重载 “new” 和 “delete” 运算符,以跟踪并比对所有的分配和释放;据我所知,那里的所有内存都已被正确释放。我没有发现任何实例化之后未被清理的类。我认为问题出在 C++/CLI 类 videoHandlerWrapper.cpp 中。我是否漏掉了什么明显的东西?

videoHandlerWrapper.cpp

// Decodes one managed buffer of encoded video and returns the decoded frames.
//
// byteArray: encoded input handed over from managed code.
// Returns a managed array with 30 slots; slot i holds a wrapper around the
// i-th decoded frame, or nullptr when the native decoder produced nothing for it.
array<imgFrameWrapper^>^ videoHandlerWrapper::decode(array<System::Byte>^ byteArray)
{
    const int maxFrames = 30; // number of output slots offered to the native decoder
    array<imgFrameWrapper^>^ returnFrames = gcnew array<imgFrameWrapper^>(maxFrames);

    // Nothing to decode: taking the address of the first element of an empty
    // vector is undefined, so return the (all-null) result right away.
    const int inputLength = byteArray->Length;
    if (inputLength == 0)
        return returnFrames;

    // Native output slots; each imgFrame frees its own buffer in its destructor
    // when this function returns.
    std::vector<imgFrame> frames(maxFrames);

    // Copy the input from managed (C#) to unmanaged (C++) memory.
    std::vector<unsigned char> bytes(inputLength);
    Marshal::Copy(byteArray, 0, IntPtr(bytes.data()), inputLength);

    // Frames decoded before a failure are still handed back, so the status is
    // intentionally not turned into an error here.
    const int status = _pVideoHandler->decode(bytes, frames);
    (void)status;

    for (int i = 0; i < maxFrames; i++)
    {
        const imgFrame& frame = frames[i];
        // The wrapper copies the pixels into a managed array, so the native
        // buffer can be freed afterwards.
        if (frame.size > 0 && frame.bytes != nullptr)
            returnFrames[i] = gcnew imgFrameWrapper(frame.size, frame.bytes);
    }
    return returnFrames;
}

videoHandler.cpp

#define BITSTREAM_BUFFER_SIZE 2000000 //TODO Maybe higher or lower bitstream buffer. Thorough testing has been done at 2000000
int videoHandler::decode(std::vector<unsigned char> bytes, std::vector<imgFrame> &frameData)
{
    int result = -1;
    bool isStillGoing = true;
    mfxBitstream bitstream = { 0 };
    mfxSession session = NULL;
    mfxStatus sts = MFX_ERR_NONE;
    mfxSurfaceArray* outSurfaces = nullptr;
    mfxU32 framenum = 0;
    mfxU32 numVPPCh = 0;
    mfxVideoChannelParam* mfxVPPChParams = nullptr;
    void* accelHandle = NULL;
    mfxVideoParam mfxDecParams = {};
    mfxVersion version = { 0, 1 };

    //variables used only in 2.x version
    mfxConfig cfg = NULL;
    mfxLoader loader = NULL;
    mfxVariant inCodec = {};
    std::vector<mfxU8> input_buffer;

    // Initialize VPL session for any implementation of HEVC/H265 decode
    loader = MFXLoad();
    VERIFY(NULL != loader, "MFXLoad failed -- is implementation in path?");

    cfg = MFXCreateConfig(loader);
    VERIFY(NULL != cfg, "MFXCreateConfig failed")

        inCodec.Type = MFX_VARIANT_TYPE_U32;
    inCodec.Data.U32 = MFX_CODEC_AVC;
    sts = MFXSetConfigFilterProperty(
        cfg,
        (mfxU8*)"mfxImplDescription.mfxDecoderDescription.decoder.CodecID",
        inCodec);
    
    VERIFY(MFX_ERR_NONE == sts, "MFXSetConfigFilterProperty failed for decoder CodecID");

    sts = MFXCreateSession(loader, 0, &session);
    VERIFY(MFX_ERR_NONE == sts, "Not able to create VPL session");
    
    // Print info about implementation loaded
    version = ShowImplInfo(session);
    //VERIFY(version.Major > 1, "Sample requires 2.x API implementation, exiting");
    if (version.Major == 1) {
        mfxVariant ImplValueSW;
        ImplValueSW.Type = MFX_VARIANT_TYPE_U32;
        ImplValueSW.Data.U32 = MFX_IMPL_TYPE_SOFTWARE;
        MFXSetConfigFilterProperty(cfg, (mfxU8*)"mfxImplDescription.Impl", ImplValueSW);
        sts = MFXCreateSession(loader, 0, &session);
        VERIFY(MFX_ERR_NONE == sts, "Not able to create VPL session");
    }
    // Convenience function to initialize available accelerator(s)
    accelHandle = InitAcceleratorHandle(session);

    bitstream.MaxLength = BITSTREAM_BUFFER_SIZE;

    bitstream.Data = (mfxU8*)calloc(bytes.size(), sizeof(mfxU8));
    VERIFY(bitstream.Data, "Not able to allocate input buffer");

    bitstream.CodecId = MFX_CODEC_AVC;

    std::copy(bytes.begin(), bytes.end(), bitstream.Data);

    bitstream.DataLength = static_cast<mfxU32>(bytes.size());

    memset(&mfxDecParams, 0, sizeof(mfxDecParams));

    mfxDecParams.mfx.CodecId = MFX_CODEC_AVC;
    mfxDecParams.IOPattern = MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
    sts = MFXVideoDECODE_DecodeHeader(session, &bitstream, &mfxDecParams);
    VERIFY(MFX_ERR_NONE == sts, "Error decoding header\n");

    numVPPCh = 1;
    mfxVPPChParams = new mfxVideoChannelParam[numVPPCh];
    for (mfxU32 i = 0; i < numVPPCh; i++) {
        mfxVPPChParams[i] = {};
    }

    //mfxVPPChParams[0].VPP.FourCC = mfxDecParams.mfx.FrameInfo.FourCC;
    mfxVPPChParams[0].VPP.FourCC = MFX_FOURCC_BGRA;
    mfxVPPChParams[0].VPP.ChromaFormat = MFX_CHROMAFORMAT_YUV420;
    mfxVPPChParams[0].VPP.PicStruct = MFX_PICSTRUCT_PROGRESSIVE;
    mfxVPPChParams[0].VPP.FrameRateExtN = 30;
    mfxVPPChParams[0].VPP.FrameRateExtD = 1;
    mfxVPPChParams[0].VPP.CropW = 1920;
    mfxVPPChParams[0].VPP.CropH = 1080;
    //Set value directly if input and output is the same.
    mfxVPPChParams[0].VPP.Width = 1920;
    mfxVPPChParams[0].VPP.Height = 1080;
    //// USED TO RESIZE. IF INPUT IS THE SAME AS OUTPUT THIS WILL MAKE IT SHIFT A BIT. 1920x1080 becomes 1920x1088.
    //mfxVPPChParams[0].VPP.Width = ALIGN16(mfxVPPChParams[0].VPP.CropW);
    //mfxVPPChParams[0].VPP.Height = ALIGN16(mfxVPPChParams[0].VPP.CropH);  
    mfxVPPChParams[0].VPP.ChannelId = 1;
    mfxVPPChParams[0].Protected = 0;
    mfxVPPChParams[0].IOPattern = MFX_IOPATTERN_IN_SYSTEM_MEMORY | MFX_IOPATTERN_OUT_SYSTEM_MEMORY;
    mfxVPPChParams[0].ExtParam = NULL;
    mfxVPPChParams[0].NumExtParam = 0;

    sts = MFXVideoDECODE_VPP_Init(session, &mfxDecParams, &mfxVPPChParams, numVPPCh); //This causes a MINOR memory leak! 
    
    outSurfaces = new mfxSurfaceArray;

    while (isStillGoing == true) {
        sts = MFXVideoDECODE_VPP_DecodeFrameAsync(session,
            &bitstream,
            NULL,
            0,
            &outSurfaces); //Big memory leak. 100MB pr run in the while loop.

        switch (sts) {
        case MFX_ERR_NONE:
            // decode output
            if (framenum >= 30)
            {
                isStillGoing = false;
                break;
            }

            sts = WriteRawFrameToByte(outSurfaces->Surfaces[1], &frameData[framenum]);
            VERIFY(MFX_ERR_NONE == sts, "Could not write 1st vpp output");

            framenum++;
            break;
        case MFX_ERR_MORE_DATA:
            // The function requires more bitstream at input before decoding can proceed           
            isStillGoing = false;
            break;
        case MFX_ERR_MORE_SURFACE:
            // The function requires more frame surface at output before decoding can proceed.
            // This applies to external memory allocations and should not be expected for
            // a simple internal allocation case like this
            break;
        case MFX_ERR_DEVICE_LOST:
            // For non-CPU implementations,
            // Cleanup if device is lost
            break;
        case MFX_WRN_DEVICE_BUSY:
            // For non-CPU implementations,
            // Wait a few milliseconds then try again
            break;
        case MFX_WRN_VIDEO_PARAM_CHANGED:
            // The decoder detected a new sequence header in the bitstream.
            // Video parameters may have changed.
            // In external memory allocation case, might need to reallocate the output surface
            break;
        case MFX_ERR_INCOMPATIBLE_VIDEO_PARAM:
            // The function detected that video parameters provided by the application
            // are incompatible with initialization parameters.
            // The application should close the component and then reinitialize it
            break;
        case MFX_ERR_REALLOC_SURFACE:
            // Bigger surface_work required. May be returned only if
            // mfxInfoMFX::EnableReallocRequest was set to ON during initialization.
            // This applies to external memory allocations and should not be expected for
            // a simple internal allocation case like this
            break;
        default:
            printf("unknown status %d\n", sts);
            isStillGoing = false;
            break;
        }
    }

    sts = MFXVideoDECODE_VPP_Close(session);  // Helps massively! Halves the memory leak speed. Closes internal structures and tables.
    VERIFY(MFX_ERR_NONE == sts, "Error closing VPP session\n");
    
    result = 0;
end:
    printf("Decode and VPP processed %d frames\n", framenum);

    // Clean up resources - It is recommended to close components first, before
    // releasing allocated surfaces, since some surfaces may still be locked by
    // internal resources.        

    if (mfxVPPChParams)
        delete[] mfxVPPChParams;

    if (outSurfaces)
        delete outSurfaces;

    if (bitstream.Data)
        free(bitstream.Data);

    if (accelHandle)
        FreeAcceleratorHandle(accelHandle);

    if (loader)
        MFXUnload(loader);

    return result;
} 

imgFrameWrapper.h

public ref class imgFrameWrapper
{
public:
    // Builds a wrapper from u_size bytes starting at u_bytes.
    imgFrameWrapper(size_t u_size, unsigned char* u_bytes);
    // Destructor and finalizer.
    ~imgFrameWrapper();
    !imgFrameWrapper();

    // Accessors for the stored byte count and the managed byte array.
    size_t get_size();
    array<System::Byte>^ get_bytes();

private:
    size_t size;                 // number of bytes held
    array<System::Byte>^ bytes;  // managed storage for the frame data
};

imgFrameWrapper.cpp

// Allocates a managed array of u_size bytes and fills it from the unmanaged
// buffer u_bytes.
imgFrameWrapper::imgFrameWrapper(size_t u_size, unsigned char* u_bytes)
{
    this->size = u_size;
    this->bytes = gcnew array<System::Byte>(u_size);

    // Unmanaged -> managed copy of the whole buffer.
    IntPtr source = (IntPtr)u_bytes;
    Marshal::Copy(source, this->bytes, 0, u_size);
}
// Destructor: intentionally empty, no work is performed.
imgFrameWrapper::~imgFrameWrapper() {}
// Finalizer: intentionally empty, no work is performed.
imgFrameWrapper::!imgFrameWrapper() {}
size_t imgFrameWrapper::get_size()
{
    return size;
}
// Returns the managed byte array held by this wrapper (the handle, not a copy).
array<System::Byte>^ imgFrameWrapper::get_bytes()
{
    return this->bytes;
}

imgFrame.h

// One decoded frame: a heap buffer allocated with new[] plus its length in bytes.
// The struct owns `bytes` and frees it with delete[] on destruction. Copies are
// deep so that two frames never free the same buffer.
struct imgFrame
{
    int size = 0;                   // number of valid bytes in `bytes`
    unsigned char* bytes = nullptr; // owned buffer (new[]), or nullptr when empty

    imgFrame() = default;

    // Takes ownership of an existing new[] buffer; keeps `imgFrame{n, p}` working.
    imgFrame(int size_, unsigned char* bytes_) : size(size_), bytes(bytes_) {}

    // Deep copy: the new frame gets its own buffer.
    imgFrame(const imgFrame& other) : size(other.size), bytes(nullptr)
    {
        if (other.bytes && other.size > 0)
        {
            bytes = new unsigned char[other.size];
            for (int i = 0; i < other.size; i++)
                bytes[i] = other.bytes[i];
        }
    }

    // Move: steal the buffer and leave the source empty.
    imgFrame(imgFrame&& other) noexcept : size(other.size), bytes(other.bytes)
    {
        other.size = 0;
        other.bytes = nullptr;
    }

    // Copy-and-swap; serves both copy and move assignment. The old buffer is
    // freed when `other` goes out of scope.
    imgFrame& operator=(imgFrame other) noexcept
    {
        const int tmpSize = size;
        unsigned char* tmpBytes = bytes;
        size = other.size;
        bytes = other.bytes;
        other.size = tmpSize;
        other.bytes = tmpBytes;
        return *this;
    }

    ~imgFrame()
    {
        delete[] bytes; // delete[] on nullptr is a no-op
    }
};

MFXVideoDECODE_VPP_DecodeFrameAsync() 函数会在内部为处理过程分配内存表面(surface)。使用完之后,你应该释放这些表面。请查看下面链接中的说明。

https://spec.oneapi.com/onevpl/latest/API_ref/VPL_structs_decode_vpp.html#_CPPv415mfxSurfaceArray

   mfxStatus (*Release)(struct mfxSurfaceArray *surface_array)
   Decrements the internal reference counter of the surface. (*Release) should be 
   called after using the (*AddRef) function to add a surface or when allocation 
   logic requires it.

另请参考此示例:https://github.com/oneapi-src/oneVPL/blob/master/examples/hello-decvpp/src/hello-decvpp.cpp

特别是 WriteRawFrame_InternalMem() 函数,它位于 https://github.com/oneapi-src/oneVPL/blob/17968d8d2299352f5a9e09388d24e81064c81c87/examples/util/util/util.h

它展示了如何释放表面。