Resampling audio with FFMPEG LibAV
Well, since the FFMPEG documentation and code examples are absolute garbage, I guess my only option is to come here and ask.
All I'm trying to do is record audio from the microphone and write it to a file. I initialize my input and output formats, get an audio packet, then decode it, resample it, encode it and write it. But every time I try to play back the recorded audio, all I get is a stub of data. For some reason it seems to write only an initial packet. It gets stranger still, let me explain why:
if((response = swr_config_frame(resampleContext, audioOutputFrame, frame)) < 0) qDebug() << "can't configure frame!" << av_make_error(response);
if((response = swr_convert_frame(resampleContext, audioOutputFrame, frame)) < 0) qDebug() << "can't resample frame!" << av_make_error(response);
This is the code I use for resampling. My frame contains data, but swr_convert_frame writes empty data into audioOutputFrame. How can I fix this? FFMPEG is really driving me crazy.
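For reference, a minimal diagnostic sketch (not part of my original code; swr_get_out_samples is the stock libswresample call, and the variable names match the class below) that should show whether the converter is failing or merely buffering samples:
// swr_get_out_samples() returns an upper bound on the samples the next
// swr_convert call would output for the given number of input samples
int ready = swr_get_out_samples(resampleContext, frame->nb_samples);
if(ready < 0) qDebug() << "can't query converter!" << av_make_error(ready);
else qDebug() << "samples ready:" << ready << "needed:" << audioOutputFrame->nb_samples;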
Here is the complete code of my class:
VideoReader.h
#ifndef VIDEOREADER_H
#define VIDEOREADER_H
extern "C"
{
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavdevice/avdevice.h>
#include "libavutil/audio_fifo.h"
#include "libavformat/avio.h"
#include "libswresample/swresample.h"
#include <inttypes.h>
}
#include <QString>
#include <QElapsedTimer>
class VideoReader
{
public:
VideoReader();
bool open(const char* filename);
bool fillFrame();
bool readFrame(uint8_t *&frameData);
void close();
int width, height;
private:
bool configInput();
bool configOutput(const char *filename);
bool configResampler();
bool encode(AVFrame *frame, AVCodecContext *encoderContext, AVPacket *outputPacket, int streamIndex, QString type);
int audioStreamIndex = -1;
int videoStreamIndex = -1;
int64_t videoStartPts = 0;
int64_t audioStartPts = 0;
AVFormatContext* inputFormatContext = nullptr;
AVFormatContext* outputFormatContext = nullptr;
AVCodecContext* videoDecoderContext = nullptr;
AVCodecContext* videoEncoderContext = nullptr;
AVCodecContext* audioDecoderContext = nullptr;
AVCodecContext* audioEncoderContext = nullptr;
AVFrame* videoInputFrame = nullptr;
AVFrame* audioInputFrame = nullptr;
AVFrame* videoOutputFrame = nullptr;
AVFrame* audioOutputFrame = nullptr;
AVPacket* inputPacket = nullptr;
AVPacket* videoOutputPacket = nullptr;
AVPacket* audioOutputPacket = nullptr;
SwsContext* innerScaleContext = nullptr;
SwsContext* outerScaleContext = nullptr;
SwrContext *resampleContext = nullptr;
};
#endif // VIDEOREADER_H
VideoReader.cpp
#include "VideoReader.h"
#include <QDebug>
static const char* av_make_error(int errnum)
{
static char str[AV_ERROR_MAX_STRING_SIZE];
memset(str, 0, sizeof(str));
return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum);
}
VideoReader::VideoReader()
{
}
bool VideoReader::open(const char *filename)
{
if(!configInput()) return false;
if(!configOutput(filename)) return false;
if(!configResampler()) return false;
return true;
}
bool VideoReader::fillFrame()
{
auto convertToYUV = [=](AVFrame* frame)
{
int response = 0;
if((response = sws_scale(outerScaleContext, frame->data, frame->linesize, 0, videoEncoderContext->height, videoOutputFrame->data, videoOutputFrame->linesize)) < 0) qDebug() << "can't rescale" << av_make_error(response);
};
auto convertAudio = [this](AVFrame* frame)
{
int response = 0;
auto& out = audioOutputFrame;
qDebug() << out->linesize[0] << out->nb_samples;
if((response = swr_convert_frame(resampleContext, audioOutputFrame, frame)) < 0) qDebug() << "can't resample frame!" << av_make_error(response);
qDebug() << "poop";
};
auto decodeEncode = [=](AVPacket* inputPacket, AVFrame* inputFrame, AVCodecContext* decoderContext,
AVPacket* outputPacket, AVFrame* outputFrame, AVCodecContext* encoderContext,
std::function<void (AVFrame*)> convertFunc,
int streamIndex, int64_t startPts, QString type)
{
int response = avcodec_send_packet(decoderContext, inputPacket);
if(response < 0) { qDebug() << "failed to send" << type << "packet!" << av_make_error(response); return false; }
response = avcodec_receive_frame(decoderContext, inputFrame);
if(response == AVERROR(EAGAIN) || response == AVERROR_EOF) { av_packet_unref(inputPacket); return false; }
else if (response < 0) { qDebug() << "failed to decode" << type << "frame!" << response << av_make_error(response); return false; }
if(encoderContext)
{
outputFrame->pts = inputPacket->pts - startPts;
convertFunc(inputFrame);
if(!encode(outputFrame, encoderContext, outputPacket, streamIndex, type)) return false;
}
av_packet_unref(inputPacket);
return true;
};
while(av_read_frame(inputFormatContext, inputPacket) >= 0) //actually read packet
{
if(inputPacket->stream_index == videoStreamIndex)
{
if(!videoStartPts) videoStartPts = inputPacket->pts;
if(decodeEncode(inputPacket, videoInputFrame, videoDecoderContext, videoOutputPacket, videoOutputFrame, videoEncoderContext, convertToYUV, videoStreamIndex, videoStartPts, "video")) break;
}
else if(inputPacket->stream_index == audioStreamIndex)
{
if(!audioStartPts) audioStartPts = inputPacket->pts;
if(decodeEncode(inputPacket, audioInputFrame, audioDecoderContext, audioOutputPacket, audioOutputFrame, audioEncoderContext, convertAudio, audioStreamIndex, audioStartPts, "audio")) break;
}
}
return true;
}
bool VideoReader::readFrame(uint8_t *&frameData)
{
if(!fillFrame()) { qDebug() << "readFrame method failed!"; return false; };
const int bytesPerPixel = 4;
uint8_t* destination[bytesPerPixel] = {frameData, NULL, NULL, NULL};
int destinationLinesize[bytesPerPixel] = { videoInputFrame->width * bytesPerPixel, 0, 0, 0};
sws_scale(innerScaleContext, videoInputFrame->data, videoInputFrame->linesize, 0, videoInputFrame->height, destination, destinationLinesize);
return true;
}
void VideoReader::close()
{
encode(NULL, videoEncoderContext, videoOutputPacket, videoStreamIndex, "video");
encode(NULL, audioEncoderContext, audioOutputPacket, audioStreamIndex, "audio");
if(av_write_trailer(outputFormatContext) < 0) { qDebug() << "failed to write trailer"; };
avformat_close_input(&outputFormatContext);
avformat_free_context(outputFormatContext);
avformat_close_input(&inputFormatContext);
avformat_free_context(inputFormatContext);
av_frame_free(&videoInputFrame);
av_frame_free(&audioInputFrame);
av_frame_free(&videoOutputFrame);
av_frame_free(&audioOutputFrame);
av_packet_free(&inputPacket);
av_packet_free(&videoOutputPacket);
av_packet_free(&audioOutputPacket);
avcodec_free_context(&videoDecoderContext);
avcodec_free_context(&videoEncoderContext);
avcodec_free_context(&audioDecoderContext);
avcodec_free_context(&audioEncoderContext);
sws_freeContext(innerScaleContext);
sws_freeContext(outerScaleContext);
swr_free(&resampleContext);
}
bool VideoReader::configInput()
{
avdevice_register_all();
inputFormatContext = avformat_alloc_context();
if(!inputFormatContext) { qDebug() << "can't create context!"; return false; }
const char* inputFormatName = "dshow";/*"gdigrab"*/
AVInputFormat* inputFormat = av_find_input_format(inputFormatName);
if(!inputFormat){ qDebug() << "Can't find" << inputFormatName; return false; }
AVDictionary* options = NULL;
av_dict_set(&options, "framerate", "30", 0);
av_dict_set(&options, "video_size", "1920x1080", 0);
if(avformat_open_input(&inputFormatContext, "video=HD USB Camera:audio=Microphone (High Definition Audio Device)" /*"desktop"*/, inputFormat, &options) != 0) { qDebug() << "can't open video file!"; return false; }
AVCodecParameters* videoCodecParams = nullptr;
AVCodecParameters* audioCodecParams = nullptr;
AVCodec* videoDecoder = nullptr;
AVCodec* audioDecoder = nullptr;
for (uint i = 0; i < inputFormatContext->nb_streams; ++i)
{
auto stream = inputFormatContext->streams[i];
auto codecParams = stream->codecpar;
if(codecParams->codec_type == AVMEDIA_TYPE_AUDIO) { audioStreamIndex = i; audioDecoder = avcodec_find_decoder(codecParams->codec_id); audioCodecParams = codecParams; }
if(codecParams->codec_type == AVMEDIA_TYPE_VIDEO) { videoStreamIndex = i; videoDecoder = avcodec_find_decoder(codecParams->codec_id); videoCodecParams = codecParams; }
if(audioStreamIndex != -1 && videoStreamIndex != -1) break;
}
if(audioStreamIndex == -1) { qDebug() << "failed to find audio stream inside file"; return false; }
if(videoStreamIndex == -1) { qDebug() << "failed to find video stream inside file"; return false; }
auto configureCodecContext = [=](AVCodecContext*& context, AVCodec* decoder, AVCodecParameters* params, AVFrame*& frame, QString type)
{
context = avcodec_alloc_context3(decoder);
if(!context) { qDebug() << "failed to create" << type << "decoder context!"; return false; }
if(avcodec_parameters_to_context(context, params) < 0) { qDebug() << "can't initialize input" << type << "decoder context"; return false; }
if(avcodec_open2(context, decoder, NULL) < 0) { qDebug() << "can't open" << type << "decoder"; return false; }
frame = av_frame_alloc();
if(!frame) { qDebug() << "can't allocate" << type << "frame"; return false; }
return true;
};
if(!configureCodecContext(videoDecoderContext, videoDecoder, videoCodecParams, videoInputFrame, "video")) return false;
if(!configureCodecContext(audioDecoderContext, audioDecoder, audioCodecParams, audioInputFrame, "audio")) return false;
audioDecoderContext->channel_layout = AV_CH_LAYOUT_STEREO;
audioInputFrame->channel_layout = audioDecoderContext->channel_layout;
inputPacket = av_packet_alloc();
if(!inputPacket) { qDebug() << "can't allocate input packet!"; return false; }
//first frame, needed for initialization
if(!fillFrame()) { qDebug() << "Failed to fill frame on init!"; return false; };
width = videoDecoderContext->width;
height = videoDecoderContext->height;
innerScaleContext = sws_getContext(width, height, videoDecoderContext->pix_fmt,
width, height, AV_PIX_FMT_RGB0,
SWS_FAST_BILINEAR,
NULL,
NULL,
NULL);
outerScaleContext = sws_getContext(width, height, videoDecoderContext->pix_fmt,
width, height, AV_PIX_FMT_YUV420P,
SWS_FAST_BILINEAR,
NULL,
NULL,
NULL);
if(!innerScaleContext) { qDebug() << "failed to initialize scaler context"; return false; }
return true;
}
bool VideoReader::configOutput(const char *filename)
{
avformat_alloc_output_context2(&outputFormatContext, NULL, NULL, filename);
if(!outputFormatContext) { qDebug() << "failed to create output context"; return false; }
AVOutputFormat* outputFormat = outputFormatContext->oformat;
auto prepareOutputContext = [=](AVCodecContext*& encoderContext,
std::function<void (AVCodecContext*, AVCodec*)> configureContextFunc,
std::function<void (AVFrame*)> configureFrameFunc,
AVCodecID codecId, AVFrame*& frame, AVPacket*& packet, QString type)
{
auto stream = avformat_new_stream(outputFormatContext, NULL);
if(!stream) { qDebug() << "failed to allocate output" << type << "stream"; return false; }
AVCodec* encoder = avcodec_find_encoder(codecId);
if(!encoder) { qDebug() << "failed to find" << type << "encoder!"; return false; }
encoderContext = avcodec_alloc_context3(encoder);
if(!encoderContext) { qDebug() << "failed to create video encoder context!"; return false; }
configureContextFunc(encoderContext, encoder);
int result = avcodec_open2(encoderContext, encoder, NULL);
if(result < 0) { qDebug() << "failed to open audio encoder" << av_make_error(result); return false; }
if(avcodec_parameters_from_context(stream->codecpar, encoderContext) < 0) { qDebug() << "failed to copy parameters to audio output stream"; return false; }
packet = av_packet_alloc();
if(!packet) {qDebug() << "failed allocate output" << type << "packet"; return false;}
frame = av_frame_alloc();
if(!frame) { qDebug() << "can't allocate output" << type << "frame"; return false; }
configureFrameFunc(frame);
av_frame_get_buffer(frame, 0);
return true;
};
auto configureAudioFrame = [=](AVFrame* frame)
{
frame->nb_samples = audioEncoderContext->frame_size;
frame->format = audioEncoderContext->sample_fmt;
frame->sample_rate = audioEncoderContext->sample_rate;
frame->channel_layout = av_get_default_channel_layout(audioDecoderContext->channels);
};
auto configureAudioEncoderContext = [=](AVCodecContext* encoderContext, AVCodec* encoder)
{
encoderContext->bit_rate = 64000;
encoderContext->sample_fmt = encoder->sample_fmts[0];
encoderContext->sample_rate = 44100;
encoderContext->codec_type = AVMEDIA_TYPE_AUDIO;
encoderContext->channel_layout = AV_CH_LAYOUT_STEREO;
encoderContext->channels = av_get_channel_layout_nb_channels(encoderContext->channel_layout);
};
auto configureVideoFrame = [=](AVFrame* frame)
{
frame->format = videoEncoderContext->pix_fmt;
frame->width = videoEncoderContext->width;
frame->height = videoEncoderContext->height;
};
auto configureVideoEncoderContext = [=](AVCodecContext* encoderContext, AVCodec* encoder)
{
encoderContext->width = videoDecoderContext->width;
encoderContext->height = videoDecoderContext->height;
encoderContext->pix_fmt = encoder->pix_fmts[0];
encoderContext->gop_size = 10;
encoderContext->max_b_frames = 1;
encoderContext->framerate = AVRational{30, 1};
encoderContext->time_base = AVRational{1, 30};
av_opt_set(encoderContext->priv_data, "preset", "ultrafast", 0);
av_opt_set(encoderContext->priv_data, "tune", "zerolatency", 0);
};
if(!prepareOutputContext(videoEncoderContext, configureVideoEncoderContext, configureVideoFrame, outputFormat->video_codec, videoOutputFrame, videoOutputPacket, "video")) return false;
if(!prepareOutputContext(audioEncoderContext, configureAudioEncoderContext, configureAudioFrame, outputFormat->audio_codec, audioOutputFrame, audioOutputPacket, "audio")) return false;
if(outputFormat->flags & AVFMT_GLOBALHEADER) outputFormat->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
int result = 0;
if(!(outputFormat->flags & AVFMT_NOFILE))
if((result = avio_open(&outputFormatContext->pb, filename, AVIO_FLAG_WRITE)) < 0)
{ qDebug() << "failed to open file" << av_make_error(result); return false; }
result = avformat_write_header(outputFormatContext, NULL);
if(result < 0) {qDebug() << "failed to write header!" << av_make_error(result); return false; }
return true;
}
bool VideoReader::configResampler()
{
resampleContext = swr_alloc_set_opts(NULL,
av_get_default_channel_layout(audioEncoderContext->channels),
audioEncoderContext->sample_fmt,
audioEncoderContext->sample_rate,
av_get_default_channel_layout(audioDecoderContext->channels),
audioDecoderContext->sample_fmt,
audioDecoderContext->sample_rate,
0, NULL);
if (!resampleContext) { qDebug() << "Could not allocate resample context"; return false; }
int error;
if ((error = swr_init(resampleContext)) < 0) { qDebug() << "Could not open resample context"; swr_free(&resampleContext); return false; }
return true;
}
bool VideoReader::encode(AVFrame* frame, AVCodecContext* encoderContext, AVPacket* outputPacket, int streamIndex, QString type)
{
int response;
response = avcodec_send_frame(encoderContext, frame);
if(response < 0) { qDebug() << "failed to send" << type << "frame" << av_make_error(response); return false; }
while(response >= 0)
{
response = avcodec_receive_packet(encoderContext, outputPacket);
if(response == AVERROR(EAGAIN) || response == AVERROR_EOF) { av_packet_unref(outputPacket); continue; }
else if (response < 0) { qDebug() << "failed to encode" << type << "frame!" << response << av_make_error(response); return false; }
outputPacket->stream_index = streamIndex;
AVStream *inStream = inputFormatContext->streams[streamIndex];
AVStream *outStream = outputFormatContext->streams[streamIndex];
av_packet_rescale_ts(outputPacket, inStream->time_base, outStream->time_base);
if((response = av_interleaved_write_frame(outputFormatContext, outputPacket)) != 0) { qDebug() << "Failed to write" << type << "packet!" << av_make_error(response); av_packet_unref(outputPacket); return false; }
av_packet_unref(outputPacket);
}
return true;
}
I can try to write a shorter example if needed.
As far as I know, there are a few cases in which swr_convert_frame may write nothing:
- You didn't initialize the output frame properly. If so, check it against the following snippet:
audioFrame = av_frame_alloc();
if (audioFrame == NULL) {
// error handling
}
audioFrame->format = /* the sample format you'd like to use */;
audioFrame->channel_layout = audioCodecContext->channel_layout;
audioFrame->nb_samples = audioCodecContext->frame_size;
if (av_frame_get_buffer(audioFrame, 0) < 0) {
// error handling
}
- There aren't enough samples in the input frame to produce a complete output frame. If so, you need swr_get_delay, e.g. (see also the drain sketch after this list):
if (swr_convert(swrContext, audioFrame->data,
audioFrame->nb_samples,
(uint8_t const**)frame->data, frame->nb_samples) < 0) {
// handle error
}
// do stuff with your audioFrame
...
while (swr_get_delay(swrContext, audioCodecContext->sample_rate)
> audioFrame->nb_samples) {
if (swr_convert(swrContext, audioFrame->data,
audioFrame->nb_samples, NULL, 0) < 0) {
// handle error
}
// do stuff with your audioFrame
}
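For completeness, the same draining can be done with swr_convert_frame itself, which accepts NULL as the input frame to flush whatever is buffered. A minimal sketch, assuming the resampleContext, audioOutputFrame and audioEncoderContext from your class (the commented-out encode call stands in for your own encoding step):
while (swr_get_delay(resampleContext, audioEncoderContext->sample_rate)
       >= audioOutputFrame->nb_samples) {
    // passing NULL as input drains samples buffered inside the converter
    if (swr_convert_frame(resampleContext, audioOutputFrame, NULL) < 0) {
        // handle error
    }
    // hand the drained frame to your encoder, e.g.:
    // encode(audioOutputFrame, audioEncoderContext, audioOutputPacket, audioStreamIndex, "audio");
}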
Anyway, please provide more information, at the very least a minimal reproducible example, for further diagnosis.
I have to admit that libav's documentation is poor enough that it used to drive me crazy too. But cursing the libav authors won't help, and besides, open-source contributors don't owe you anything.