暂停ffmpeg C ++后音频不会停止录制

audio do not stop recording after pause ffmpeg c++

我正在开发一个记录屏幕和麦克风音频的应用程序。我在条件变量上实现了停止视频和音频线程的暂停功能,并在同一条件变量上通过通知恢复它们。这是在 captureAudio() 中完成的,主要是在 while 中完成的。这种方式适用于 macOS 和 linux,其中我分别使用 avfoudation 和 alsa,但在 windows 上,使用 dshow,在线程等待条件变量时,在暂停期间继续录制音频。有人知道我该如何解决这个问题吗?

#include "ScreenRecorder.h"

using namespace std;

ScreenRecorder::ScreenRecorder() : pauseCapture(false), stopCapture(false), started(false), activeMenu(true) {
    avcodec_register_all();
    avdevice_register_all();

    width = 1920;
    height = 1200;
}

ScreenRecorder::~ScreenRecorder() {

    if (started) {
        value = av_write_trailer(outAVFormatContext);
        if (value < 0) {
            cerr << "Error in writing av trailer" << endl;
            exit(-1);
        }

        avformat_close_input(&inAudioFormatContext);
        if(inAudioFormatContext == nullptr){
            cout << "inAudioFormatContext close successfully" << endl;
        }
        else{
            cerr << "Error: unable to close the inAudioFormatContext" << endl;
            exit(-1);
            //throw "Error: unable to close the file";
        }
        avformat_free_context(inAudioFormatContext);
        if(inAudioFormatContext == nullptr){
            cout << "AudioFormat freed successfully" << endl;
        }
        else{
            cerr << "Error: unable to free AudioFormatContext" << endl;
            exit(-1);
        }
        
        avformat_close_input(&pAVFormatContext);
        if (pAVFormatContext == nullptr) {
            cout << "File close successfully" << endl;
        }
        else {
            cerr << "Error: unable to close the file" << endl;
            exit(-1);
            //throw "Error: unable to close the file";
        }

        avformat_free_context(pAVFormatContext);
        if (pAVFormatContext == nullptr) {
            cout << "VideoFormat freed successfully" << endl;
        }
        else {
            cerr << "Error: unable to free VideoFormatContext" << endl;
            exit(-1);
        }
    }
}

/*==================================== VIDEO ==============================*/

int ScreenRecorder::openVideoDevice() throw() {
    value = 0;
    options = nullptr;
    pAVFormatContext = nullptr;

    pAVFormatContext = avformat_alloc_context();

    string dimension = to_string(width) + "x" + to_string(height);
    av_dict_set(&options, "video_size", dimension.c_str(), 0);   //option to set the dimension of the screen section to record

#ifdef _WIN32
    pAVInputFormat = av_find_input_format("gdigrab");
    if (avformat_open_input(&pAVFormatContext, "desktop", pAVInputFormat, &options) != 0) {
        cerr << "Couldn't open input stream" << endl;
        exit(-1);
    }

#elif defined linux
   
    int offset_x = 0, offset_y = 0;
    string url = ":0.0+" + to_string(offset_x) + "," + to_string(offset_y);  //custom string to set the start point of the screen section
    pAVInputFormat = av_find_input_format("x11grab");
    value = avformat_open_input(&pAVFormatContext, url.c_str(), pAVInputFormat, &options);

    if (value != 0) {
        cerr << "Error in opening input device (video)" << endl;
        exit(-1);
    }
#else

    value = av_dict_set(&options, "pixel_format", "0rgb", 0);
    if (value < 0) {
        cerr << "Error in setting pixel format" << endl;
        exit(-1);
    }

    value = av_dict_set(&options, "video_device_index", "1", 0);

    if (value < 0) {
        cerr << "Error in setting video device index" << endl;
        exit(-1);
    }

    pAVInputFormat = av_find_input_format("avfoundation");

    if (avformat_open_input(&pAVFormatContext, "Capture screen 0:none", pAVInputFormat, &options) != 0) {  //TODO trovare un modo per selezionare sempre lo schermo (forse "Capture screen 0")
        cerr << "Error in opening input device" << endl;
        exit(-1);
    }



#endif
    //set frame per second

    value = av_dict_set(&options, "framerate", "30", 0);
    if (value < 0) {
        cerr << "Error in setting dictionary value (setting framerate)" << endl;
        exit(-1);
    }

    value = av_dict_set(&options, "preset", "medium", 0);
    if (value < 0) {
        cerr << "Error in setting dictionary value (setting preset value)" << endl;
        exit(-1);
    }
    /*
    value = av_dict_set(&options, "vsync", "1", 0);
    if(value < 0){
        cerr << "Error in setting dictionary value (setting vsync value)" << endl;
        exit(-1);
    }
    */

    value = av_dict_set(&options, "probesize", "60M", 0);
    if (value < 0) {
        cerr << "Error in setting probesize value" << endl;
        exit(-1);
    }

    //get video stream infos from context
    value = avformat_find_stream_info(pAVFormatContext, nullptr);
    if (value < 0) {
        cerr << "Error in retrieving the stream info" << endl;
        exit(-1);
    }

    VideoStreamIndx = -1;
    for (int i = 0; i < pAVFormatContext->nb_streams; i++) {
        if (pAVFormatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
            VideoStreamIndx = i;
            break;
        }
    }
    if (VideoStreamIndx == -1) {
        cerr << "Error: unable to find video stream index" << endl;
        exit(-2);
    }

    pAVCodecContext = pAVFormatContext->streams[VideoStreamIndx]->codec;
    pAVCodec = avcodec_find_decoder(pAVCodecContext->codec_id/*params->codec_id*/);
    if (pAVCodec == nullptr) {
        cerr << "Error: unable to find decoder video" << endl;
        exit(-1);
    }

    cout << "Insert height and width [h w]: ";   //custom screen dimension to record
    cin >> h >> w;*/


    return 0;
}

/*==========================================  AUDIO  ============================*/

int ScreenRecorder::openAudioDevice() {
    audioOptions = nullptr;
    inAudioFormatContext = nullptr;

    inAudioFormatContext = avformat_alloc_context();
    value = av_dict_set(&audioOptions, "sample_rate", "44100", 0);
    if (value < 0) {
        cerr << "Error: cannot set audio sample rate" << endl;
        exit(-1);
    }
    value = av_dict_set(&audioOptions, "async", "1", 0);
    if (value < 0) {
        cerr << "Error: cannot set audio sample rate" << endl;
        exit(-1);
    }

#if defined linux
    audioInputFormat = av_find_input_format("alsa");
    value = avformat_open_input(&inAudioFormatContext, "hw:0", audioInputFormat, &audioOptions);
    if (value != 0) {
        cerr << "Error in opening input device (audio)" << endl;
        exit(-1);
    }
#endif

#if defined _WIN32
    audioInputFormat = av_find_input_format("dshow");
    value = avformat_open_input(&inAudioFormatContext, "audio=Microfono (Realtek(R) Audio)", audioInputFormat, &audioOptions);
    if (value != 0) {
        cerr << "Error in opening input device (audio)" << endl;
        exit(-1);
    }
#endif

    value = avformat_find_stream_info(inAudioFormatContext, nullptr);
    if (value != 0) {
        cerr << "Error: cannot find the audio stream information" << endl;
        exit(-1);
    }

    audioStreamIndx = -1;
    for (int i = 0; i < inAudioFormatContext->nb_streams; i++) {
        if (inAudioFormatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            audioStreamIndx = i;
            break;
        }
    }
    if (audioStreamIndx == -1) {
        cerr << "Error: unable to find audio stream index" << endl;
        exit(-2);
    }
}

int ScreenRecorder::initOutputFile() {
    value = 0;

    outAVFormatContext = nullptr;
    outputAVFormat = av_guess_format(nullptr, "output.mp4", nullptr);
    if (outputAVFormat == nullptr) {
        cerr << "Error in guessing the video format, try with correct format" << endl;
        exit(-5);
    }
    avformat_alloc_output_context2(&outAVFormatContext, outputAVFormat, outputAVFormat->name, "..\media\output.mp4");
    if (outAVFormatContext == nullptr) {
        cerr << "Error in allocating outAVFormatContext" << endl;
        exit(-4);
    }

    /*===========================================================================*/
    this->generateVideoStream();
    this->generateAudioStream();

    //create an empty video file
    if (!(outAVFormatContext->flags & AVFMT_NOFILE)) {
        if (avio_open2(&outAVFormatContext->pb, "..\media\output.mp4", AVIO_FLAG_WRITE, nullptr, nullptr) < 0) {
            cerr << "Error in creating the video file" << endl;
            exit(-10);
        }
    }

    if (outAVFormatContext->nb_streams == 0) {
        cerr << "Output file does not contain any stream" << endl;
        exit(-11);
    }
    value = avformat_write_header(outAVFormatContext, &options);
    if (value < 0) {
        cerr << "Error in writing the header context" << endl;
        exit(-12);
    }
    return 0;
}

/*===================================  VIDEO  ==================================*/

void ScreenRecorder::generateVideoStream() {
    //Generate video stream
    videoSt = avformat_new_stream(outAVFormatContext, nullptr);
    if (videoSt == nullptr) {
        cerr << "Error in creating AVFormatStream" << endl;
        exit(-6);
    }

    outVideoCodec = avcodec_find_encoder(AV_CODEC_ID_MPEG4);  //AV_CODEC_ID_MPEG4
    if (outVideoCodec == nullptr) {
        cerr << "Error in finding the AVCodec, try again with the correct codec" << endl;
        exit(-8);
    }
avcodec_alloc_context3(outAVCodec)
    outVideoCodecContext = avcodec_alloc_context3(outVideoCodec);
    if (outVideoCodecContext == nullptr) {
        cerr << "Error in allocating the codec context" << endl;
        exit(-7);
    }

    //set properties of the video file (stream)
    outVideoCodecContext = videoSt->codec;
    outVideoCodecContext->codec_id = AV_CODEC_ID_MPEG4;
    outVideoCodecContext->codec_type = AVMEDIA_TYPE_VIDEO;
    outVideoCodecContext->pix_fmt = AV_PIX_FMT_YUV420P;
    outVideoCodecContext->bit_rate = 10000000;
    outVideoCodecContext->width = width;
    outVideoCodecContext->height = height;
    outVideoCodecContext->gop_size = 10;
    outVideoCodecContext->global_quality = 500;
    outVideoCodecContext->max_b_frames = 2;
    outVideoCodecContext->time_base.num = 1;
    outVideoCodecContext->time_base.den = 30;
    outVideoCodecContext->bit_rate_tolerance = 400000;

    if (outVideoCodecContext->codec_id == AV_CODEC_ID_H264) {
        av_opt_set(outVideoCodecContext->priv_data, "preset", "slow", 0);
    }

    if (outAVFormatContext->oformat->flags & AVFMT_GLOBALHEADER) {
        outVideoCodecContext->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    value = avcodec_open2(outVideoCodecContext, outVideoCodec, nullptr);
    if (value < 0) {
        cerr << "Error in opening the AVCodec" << endl;
        exit(-9);
    }

    outVideoStreamIndex = -1;
    for (int i = 0; i < outAVFormatContext->nb_streams; i++) {
        if (outAVFormatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_UNKNOWN) {
            outVideoStreamIndex = i;
        }
    }
    if (outVideoStreamIndex < 0) {
        cerr << "Error: cannot find a free stream index for video output" << endl;
        exit(-1);
    }
    avcodec_parameters_from_context(outAVFormatContext->streams[outVideoStreamIndex]->codecpar, outVideoCodecContext);
}

/*===============================  AUDIO  ==================================*/

void ScreenRecorder::generateAudioStream() {
    AVCodecParameters* params = inAudioFormatContext->streams[audioStreamIndx]->codecpar;
    inAudioCodec = avcodec_find_decoder(params->codec_id);
    if (inAudioCodec == nullptr) {
        cerr << "Error: cannot find the audio decoder" << endl;
        exit(-1);
    }

    inAudioCodecContext = avcodec_alloc_context3(inAudioCodec);
    if (avcodec_parameters_to_context(inAudioCodecContext, params) < 0) {
        cout << "Cannot create codec context for audio input" << endl;
    }

    value = avcodec_open2(inAudioCodecContext, inAudioCodec, nullptr);
    if (value < 0) {
        cerr << "Error: cannot open the input audio codec" << endl;
        exit(-1);
    }

    //Generate audio stream
    outAudioCodecContext = nullptr;
    outAudioCodec = nullptr;
    int i;

    AVStream* audio_st = avformat_new_stream(outAVFormatContext, nullptr);
    if (audio_st == nullptr) {
        cerr << "Error: cannot create audio stream" << endl;
        exit(1);
    }

    outAudioCodec = avcodec_find_encoder(AV_CODEC_ID_AAC);
    if (outAudioCodec == nullptr) {
        cerr << "Error: cannot find requested encoder" << endl;
        exit(1);
    }

    outAudioCodecContext = avcodec_alloc_context3(outAudioCodec);
    if (outAudioCodecContext == nullptr) {
        cerr << "Error: cannot create related VideoCodecContext" << endl;
        exit(1);
    }

    if ((outAudioCodec)->supported_samplerates) {
        outAudioCodecContext->sample_rate = (outAudioCodec)->supported_samplerates[0];
        for (i = 0; (outAudioCodec)->supported_samplerates[i]; i++) {
            if ((outAudioCodec)->supported_samplerates[i] == inAudioCodecContext->sample_rate)
                outAudioCodecContext->sample_rate = inAudioCodecContext->sample_rate;
        }
    }
    outAudioCodecContext->codec_id = AV_CODEC_ID_AAC;
    outAudioCodecContext->sample_fmt = (outAudioCodec)->sample_fmts ? (outAudioCodec)->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
    outAudioCodecContext->channels = inAudioCodecContext->channels;
    outAudioCodecContext->channel_layout = av_get_default_channel_layout(outAudioCodecContext->channels);
    outAudioCodecContext->bit_rate = 96000;
    outAudioCodecContext->time_base = { 1, inAudioCodecContext->sample_rate };

    outAudioCodecContext->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;

    if ((outAVFormatContext)->oformat->flags & AVFMT_GLOBALHEADER) {
        outAudioCodecContext->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    if (avcodec_open2(outAudioCodecContext, outAudioCodec, nullptr) < 0) {
        cerr << "error in opening the avcodec" << endl;
        exit(1);
    }

    //find a free stream index
    outAudioStreamIndex = -1;
    for (i = 0; i < outAVFormatContext->nb_streams; i++) {
        if (outAVFormatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_UNKNOWN) {
            outAudioStreamIndex = i;
        }
    }
    if (outAudioStreamIndex < 0) {
        cerr << "Error: cannot find a free stream for audio on the output" << endl;
        exit(1);
    }

    avcodec_parameters_from_context(outAVFormatContext->streams[outAudioStreamIndex]->codecpar, outAudioCodecContext);
}

int ScreenRecorder::init_fifo()
{
    /* Create the FIFO buffer based on the specified output sample format. */
    if (!(fifo = av_audio_fifo_alloc(outAudioCodecContext->sample_fmt,
        outAudioCodecContext->channels, 1))) {
        fprintf(stderr, "Could not allocate FIFO\n");
        return AVERROR(ENOMEM);
    }
    return 0;
}

int ScreenRecorder::add_samples_to_fifo(uint8_t** converted_input_samples, const int frame_size) {
    int error;
    /* Make the FIFO as large as it needs to be to hold both,
     * the old and the new samples. */
    if ((error = av_audio_fifo_realloc(fifo, av_audio_fifo_size(fifo) + frame_size)) < 0) {
        fprintf(stderr, "Could not reallocate FIFO\n");
        return error;
    }
    /* Store the new samples in the FIFO buffer. */
    if (av_audio_fifo_write(fifo, (void**)converted_input_samples, frame_size) < frame_size) {
        fprintf(stderr, "Could not write data to FIFO\n");
        return AVERROR_EXIT;
    }
    return 0;
}

int ScreenRecorder::initConvertedSamples(uint8_t*** converted_input_samples,
    AVCodecContext* output_codec_context,
    int frame_size) {
    int error;
    /* Allocate as many pointers as there are audio channels.
     * Each pointer will later point to the audio samples of the corresponding
     * channels (although it may be NULL for interleaved formats).
     */
    if (!(*converted_input_samples = (uint8_t**)calloc(output_codec_context->channels,
        sizeof(**converted_input_samples)))) {
        fprintf(stderr, "Could not allocate converted input sample pointers\n");
        return AVERROR(ENOMEM);
    }
    /* Allocate memory for the samples of all channels in one consecutive
     * block for convenience. */
    if (av_samples_alloc(*converted_input_samples, nullptr,
        output_codec_context->channels,
        frame_size,
        output_codec_context->sample_fmt, 0) < 0) {

        exit(1);
    }
    return 0;
}

static int64_t pts = 0;
void ScreenRecorder::captureAudio() {
    int ret;
    AVPacket* inPacket, * outPacket;
    AVFrame* rawFrame, * scaledFrame;
    uint8_t** resampledData;

    init_fifo();

    //allocate space for a packet
    inPacket = (AVPacket*)av_malloc(sizeof(AVPacket));
    if (!inPacket) {
        cerr << "Cannot allocate an AVPacket for encoded video" << endl;
        exit(1);
    }
    av_init_packet(inPacket);

    //allocate space for a packet
    rawFrame = av_frame_alloc();
    if (!rawFrame) {
        cerr << "Cannot allocate an AVPacket for encoded video" << endl;
        exit(1);
    }

    scaledFrame = av_frame_alloc();
    if (!scaledFrame) {
        cerr << "Cannot allocate an AVPacket for encoded video" << endl;
        exit(1);
    }

    outPacket = (AVPacket*)av_malloc(sizeof(AVPacket));
    if (!outPacket) {
        cerr << "Cannot allocate an AVPacket for encoded video" << endl;
        exit(1);
    }

    //init the resampler
    SwrContext* resampleContext = nullptr;
    resampleContext = swr_alloc_set_opts(resampleContext,
        av_get_default_channel_layout(outAudioCodecContext->channels),
        outAudioCodecContext->sample_fmt,
        outAudioCodecContext->sample_rate,
        av_get_default_channel_layout(inAudioCodecContext->channels),
        inAudioCodecContext->sample_fmt,
        inAudioCodecContext->sample_rate,
        0,
        nullptr);
    if (!resampleContext) {
        cerr << "Cannot allocate the resample context" << endl;
        exit(1);
    }
    if ((swr_init(resampleContext)) < 0) {
        fprintf(stderr, "Could not open resample context\n");
        swr_free(&resampleContext);
        exit(1);
    }

    while (true) {
        if (pauseCapture) {
            cout << "Pause audio" << endl;
        }
        cv.wait(ul, [this]() { return !pauseCapture; });

        if (stopCapture) {
            break;
        }

        ul.unlock();

        if (av_read_frame(inAudioFormatContext, inPacket) >= 0 && inPacket->stream_index == audioStreamIndx) {
            //decode audio routing
            av_packet_rescale_ts(outPacket, inAudioFormatContext->streams[audioStreamIndx]->time_base, inAudioCodecContext->time_base);
            if ((ret = avcodec_send_packet(inAudioCodecContext, inPacket)) < 0) {
                cout << "Cannot decode current audio packet " << ret << endl;
                continue;
            }
            
            while (ret >= 0) {
                ret = avcodec_receive_frame(inAudioCodecContext, rawFrame);
                if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
                    break;
                else if (ret < 0) {
                    cerr << "Error during decoding" << endl;
                    exit(1);
                }
                if (outAVFormatContext->streams[outAudioStreamIndex]->start_time <= 0) {
                    outAVFormatContext->streams[outAudioStreamIndex]->start_time = rawFrame->pts;
                }
                initConvertedSamples(&resampledData, outAudioCodecContext, rawFrame->nb_samples);

                swr_convert(resampleContext,
                    resampledData, rawFrame->nb_samples,
                    (const uint8_t**)rawFrame->extended_data, rawFrame->nb_samp

                add_samples_to_fifo(resampledData, rawFrame->nb_samples);

                //raw frame ready
                av_init_packet(outPacket);
                outPacket->data = nullptr;
                outPacket->size = 0;

                const int frame_size = FFMAX(av_audio_fifo_size(fifo), outAudioCodecContext->frame_size);

                scaledFrame = av_frame_alloc();
                if (!scaledFrame) {
                    cerr << "Cannot allocate an AVPacket for encoded video" << endl;
                    exit(1);
                }

                scaledFrame->nb_samples = outAudioCodecContext->frame_size;
                scaledFrame->channel_layout = outAudioCodecContext->channel_layout;
                scaledFrame->format = outAudioCodecContext->sample_fmt;
                scaledFrame->sample_rate = outAudioCodecContext->sample_rate;
                av_frame_get_buffer(scaledFrame, 0);

                while (av_audio_fifo_size(fifo) >= outAudioCodecContext->frame_size) {

                    ret = av_audio_fifo_read(fifo, (void**)(scaledFrame->data), outAudioCodecContext->frame_size);
                    scaledFrame->pts = pts;
                    pts += scaledFrame->nb_samples;
                    if (avcodec_send_frame(outAudioCodecContext, scaledFrame) < 0) {
                        cout << "Cannot encode current audio packet " << endl;
                        exit(1);
                    }
                    while (ret >= 0) {
                        ret = avcodec_receive_packet(outAudioCodecContext, outPacket);
                        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
                            break;
                        else if (ret < 0) {
                            cerr << "Error during encoding" << endl;
                            exit(1);
                        }
                        av_packet_rescale_ts(outPacket, outAudioCodecContext->time_base, outAVFormatContext->streams[outAudioStreamIndex]->time_base);

                        outPacket->stream_index = outAudioStreamIndex;

                        write_lock.lock();
                        
                        if (av_write_frame(outAVFormatContext, outPacket) != 0)
                        {
                            cerr << "Error in writing audio frame" << endl;
                        }
                        write_lock.unlock();
                        av_packet_unref(outPacket);
                    }
                    ret = 0;
                }
                av_frame_free(&scaledFrame);
                av_packet_unref(outPacket);
            }
        }
    }
}

int ScreenRecorder::captureVideoFrames() {
    int64_t pts = 0;
    int flag;
    int frameFinished = 0;
    bool endPause = false;
    int numPause = 0;

    ofstream outFile{ "..\media\log.txt", ios::out };

    int frameIndex = 0;
    value = 0;

    pAVPacket = (AVPacket*)av_malloc(sizeof(AVPacket));
    if (pAVPacket == nullptr) {
        cerr << "Error in allocating AVPacket" << endl;
        exit(-1);
    }

    pAVFrame = av_frame_alloc();
    if (pAVFrame == nullptr) {
        cerr << "Error: unable to alloc the AVFrame resources" << endl;
        exit(-1);
    }

    outFrame = av_frame_alloc();
    if (outFrame == nullptr) {
        cerr << "Error: unable to alloc the AVFrame resources for out frame" << endl;
        exit(-1);
    }

    int videoOutBuffSize;
    int nBytes = av_image_get_buffer_size(outVideoCodecContext->pix_fmt, outVideoCodecContext->width, outVideoCodecContext->height, 32);
    uint8_t* videoOutBuff = (uint8_t*)av_malloc(nBytes);

    if (videoOutBuff == nullptr) {
        cerr << "Error: unable to allocate memory" << endl;
        exit(-1);
    }

    value = av_image_fill_arrays(outFrame->data, outFrame->linesize, videoOutBuff, AV_PIX_FMT_YUV420P, outVideoCodecContext->width, outVideoCodecContext->height, 1);
    if (value < 0) {
        cerr << "Error in filling image array" << endl;
    }

    SwsContext* swsCtx_;
    if (avcodec_open2(pAVCodecContext, pAVCodec, nullptr) < 0) {
        cerr << "Could not open codec" << endl;
        exit(-1);
    }
    swsCtx_ = sws_getContext(pAVCodecContext->width, pAVCodecContext->height, pAVCodecContext->pix_fmt, outVideoCodecContext->width, outVideoCodecContext->height, outVideoCodecContext->pix_fmt, SWS_BICUBIC,
        nullptr, nullptr, nullptr);

    AVPacket outPacket;
    int gotPicture;

    time_t startTime;
    time(&startTime);

    while (true) {

        if (pauseCapture) {
            cout << "Pause" << endl;
            outFile << "///////////////////   Pause  ///////////////////" << endl;
            cout << "outVideoCodecContext->time_base: " << outVideoCodecContext->time_base.num << ", " << outVideoCodecContext->time_base.den << endl;
        }
        cv.wait(ul, [this]() { return !pauseCapture; });   //pause capture (not busy waiting)
        if (endPause) {
            endPause = false;
        }

        if (stopCapture)  //check if the capture has to stop
            break;
        ul.unlock();

        if (av_read_frame(pAVFormatContext, pAVPacket) >= 0 && pAVPacket->stream_index == VideoStreamIndx) {
            av_packet_rescale_ts(pAVPacket, pAVFormatContext->streams[VideoStreamIndx]->time_base, pAVCodecContext->time_base);
            value = avcodec_decode_video2(pAVCodecContext, pAVFrame, &frameFinished, pAVPacket);
            if (value < 0) {
                cout << "Unable to decode video" << endl;
            }

            if (frameFinished) { //frame successfully decoded
                //sws_scale(swsCtx_, pAVFrame->data, pAVFrame->linesize, 0, pAVCodecContext->height, outFrame->data, outFrame->linesize);
                av_init_packet(&outPacket);
                outPacket.data = nullptr;
                outPacket.size = 0;

                if (outAVFormatContext->streams[outVideoStreamIndex]->start_time <= 0) {
                    outAVFormatContext->streams[outVideoStreamIndex]->start_time = pAVFrame->pts;
                }

                //disable warning on the console
                outFrame->width = outVideoCodecContext->width;
                outFrame->height = outVideoCodecContext->height;
                outFrame->format = outVideoCodecContext->pix_fmt;

                sws_scale(swsCtx_, pAVFrame->data, pAVFrame->linesize, 0, pAVCodecContext->height, outFrame->data, outFrame->linesize);

                avcodec_encode_video2(outVideoCodecContext, &outPacket, outFrame, &gotPicture);

                if (gotPicture) {
                    if (outPacket.pts != AV_NOPTS_VALUE) {
                        outPacket.pts = av_rescale_q(outPacket.pts, videoSt->codec->time_base, videoSt->time_base);
                    }
                    if (outPacket.dts != AV_NOPTS_VALUE) {
                        outPacket.dts = av_rescale_q(outPacket.dts, videoSt->codec->time_base, videoSt->time_base);
                    }

                    //cout << "Write frame " << j++ << " (size = " << outPacket.size / 1000 << ")" << endl;
                    //cout << "(size = " << outPacket.size << ")" << endl;

                    //av_packet_rescale_ts(&outPacket, outVideoCodecContext->time_base, outAVFormatContext->streams[outVideoStreamIndex]->time_base);
                    //outPacket.stream_index = outVideoStreamIndex;

                    outFile << "outPacket->duration: " << outPacket.duration << ", " << "pAVPacket->duration: " << pAVPacket->duration << endl;
                    outFile << "outPacket->pts: " << outPacket.pts << ", " << "pAVPacket->pts: " << pAVPacket->pts << endl;
                    outFile << "outPacket.dts: " << outPacket.dts << ", " << "pAVPacket->dts: " << pAVPacket->dts << endl;

                    time_t timer;
                    double seconds;

                    mu.lock();
                    if (!activeMenu) {
                        time(&timer);
                        seconds = difftime(timer, startTime);
                        int h = (int)(seconds / 3600);
                        int m = (int)(seconds / 60) % 60;
                        int s = (int)(seconds) % 60;

                        std::cout << std::flush << "\r" << std::setw(2) << std::setfill('0') << h << ':'
                            << std::setw(2) << std::setfill('0') << m << ':'
                            << std::setw(2) << std::setfill('0') << s << std::flush;
                    }
                    mu.unlock();

                    write_lock.lock();
                    if (av_write_frame(outAVFormatContext, &outPacket) != 0) {
                        cerr << "Error in writing video frame" << endl;
                    }
                    write_lock.unlock();
                    av_packet_unref(&outPacket);
                }

                av_packet_unref(&outPacket);
                av_free_packet(pAVPacket);  //avoid memory saturation
            }
        }
    }

    outFile.close();

    av_free(videoOutBuff);

    return 0;
}

我解决了这个问题,在进入暂停前执行 avformat_close_input(&inAudioFormatContext),在恢复录制后执行 avformat_open_input(&inAudioFormatContext, "audio=Microfono (Realtek(R) Audio)", audioInputFormat, &audioOptions)。通过这种方式,最终文件似乎与视频同步得很好。