Transcoded video stream unplayable in QuickTime player

I'm currently writing software that transcodes media files using the ffmpeg libraries. The problem is that with H264 the resulting stream cannot be played in QuickTime and shows only a black screen; the audio stream works as expected. I've read that QuickTime can only handle the yuv420p pixel format, and that is indeed the format the video is encoded in.
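
For reference, here is a small sketch (not part of my code below; the helper name is made up) of how one could prefer yuv420p explicitly when the encoder advertises it, instead of blindly taking the first entry of pix_fmts:

extern "C" {
#include <libavcodec/avcodec.h>
}

// Hypothetical helper: prefer AV_PIX_FMT_YUV420P when the encoder supports it,
// otherwise fall back to the first format it lists (or a caller-supplied default).
static AVPixelFormat pickPixelFormat(const AVCodec *encoder, AVPixelFormat fallback) {
    if (!encoder->pix_fmts) {
        return fallback;
    }
    for (const AVPixelFormat *p = encoder->pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
        if (*p == AV_PIX_FMT_YUV420P) {
            return AV_PIX_FMT_YUV420P;
        }
    }
    return encoder->pix_fmts[0];
}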

I've looked through the ffmpeg examples and the ffmpeg source code, but couldn't find any clue as to what might be wrong. I'd appreciate any help.

The only thing I managed to get out of QuickTime is a SeqAndPicParamSetFromCFDictionaryRef, bad config record message in the console. AVPlayer from AVFoundation logs the same thing.

Here is the initialization of the output streams and encoders.

int status;

// avformat_alloc_output_context2()
if ((status = formatContext.open(destFilename)) < 0) {
    return status;
}

AVDictionary *fmtOptions = nullptr;
av_dict_set(&fmtOptions, "movflags", "faststart", 0);
av_dict_set(&fmtOptions, "brand", "mp42", 0);

streams.resize(input->getStreamsCount());
for (int i = 0; i < input->getStreamsCount(); ++i) {
    AVStream *inputStream = input->getStreamAtIndex(i);
    CodecContext &decoderContext = input->getDecoderAtIndex(i);

    // retrieve output codec by codec id
    auto encoderCodecId = decoderContext.getCodecID();
    if (decoderContext.getCodecType() == AVMEDIA_TYPE_VIDEO || decoderContext.getCodecType() == AVMEDIA_TYPE_AUDIO) {
        int codecIdKey = decoderContext.getCodecType() == AVMEDIA_TYPE_AUDIO ? IPROC_KEY_INT(TargetAudioCodecID) : IPROC_KEY_INT(TargetVideoCodecID);
        auto codecIdParam = static_cast<AVCodecID>(params[codecIdKey]);
        if (codecIdParam != AV_CODEC_ID_NONE) {
            encoderCodecId = codecIdParam;
        }
    }
    AVCodec *encoder = nullptr;
    if ((encoder = avcodec_find_encoder(encoderCodecId)) == nullptr) {
        status = AVERROR_ENCODER_NOT_FOUND;
        return status;
    }

    // create stream with specific codec and format
    AVStream *outputStream = nullptr;
    // avformat_new_stream()
    if ((outputStream = formatContext.newStream(encoder)) == nullptr) {
        return AVERROR(ENOMEM);
    }

    CodecContext encoderContext;
    // avcodec_alloc_context3()
    if ((status = encoderContext.init(encoder)) < 0) {
        return status;
    }

    outputStream->disposition = inputStream->disposition;
    encoderContext.getRawCtx()->chroma_sample_location = decoderContext.getRawCtx()->chroma_sample_location;

    if (encoderContext.getCodecType() == AVMEDIA_TYPE_VIDEO) {
        auto lang = av_dict_get(input->getStreamAtIndex(i)->metadata, "language", nullptr, 0);
        if (lang) {
            av_dict_set(&outputStream->metadata, "language", lang->value, 0);
        }

        // prepare encoder context
        int targetWidth = params[IPROC_KEY_INT(TargetVideoWidth)];
        int targetHeight = params[IPROC_KEY_INT(TargetVideHeight)];

        encoderContext.width() = targetWidth > 0 ? targetWidth : decoderContext.width();
        encoderContext.height() = targetHeight > 0 ? targetHeight : decoderContext.height();
        encoderContext.pixelFormat() = encoder->pix_fmts ? encoder->pix_fmts[0] : decoderContext.pixelFormat();
        encoderContext.timeBase() = decoderContext.timeBase();
        encoderContext.getRawCtx()->level = 31;
        encoderContext.getRawCtx()->gop_size = 25;

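        // derive the output sample aspect ratio from the ratio of the source
        // frame aspect ratio to the output frame aspect ratio, so the scaled
        // picture keeps its original proportions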
        double far = static_cast<double>(encoderContext.getRawCtx()->width) / encoderContext.getRawCtx()->height;
        double dar = static_cast<double>(decoderContext.width()) / decoderContext.height();
        encoderContext.sampleAspectRatio() = av_d2q(dar / far, 255);

        encoderContext.getRawCtx()->bits_per_raw_sample = FFMIN(decoderContext.getRawCtx()->bits_per_raw_sample,
                                                                av_pix_fmt_desc_get(encoderContext.pixelFormat())->comp[0].depth);
        encoderContext.getRawCtx()->framerate = inputStream->r_frame_rate;
        outputStream->avg_frame_rate = encoderContext.getRawCtx()->framerate;

        VideoFilterGraphParameters params;
        params.height = encoderContext.height();
        params.width = encoderContext.width();
        params.pixelFormat = encoderContext.pixelFormat();
        if ((status = generateGraph(decoderContext, encoderContext, params, streams[i].filterGraph)) < 0) {
            return status;
        }

    } else if (encoderContext.getCodecType() == AVMEDIA_TYPE_AUDIO) {
        auto lang = av_dict_get(input->getStreamAtIndex(i)->metadata, "language", nullptr, 0);
        if (lang) {
            av_dict_set(&outputStream->metadata, "language", lang->value, 0);
        }

        encoderContext.sampleRate() = params[IPROC_KEY_INT(TargetAudioSampleRate)] ? : decoderContext.sampleRate();
        encoderContext.channels() = params[IPROC_KEY_INT(TargetAudioChannels)] ? : decoderContext.channels();
        auto paramChannelLayout = params[IPROC_KEY_INT(TargetAudioChannelLayout)];
        if (paramChannelLayout) {
            encoderContext.channelLayout() = paramChannelLayout;
        } else {
            encoderContext.channelLayout() = av_get_default_channel_layout(encoderContext.channels());
        }

        AVSampleFormat sampleFormatParam = static_cast<AVSampleFormat>(params[IPROC_KEY_INT(TargetAudioSampleFormat)]);
        if (sampleFormatParam != AV_SAMPLE_FMT_NONE) {
            encoderContext.sampleFormat() = sampleFormatParam;
        } else if (encoder->sample_fmts) {
            encoderContext.sampleFormat() = encoder->sample_fmts[0];
        } else {
            encoderContext.sampleFormat() = decoderContext.sampleFormat();
        }

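        // audio time base: one tick per sample (1 / sample rate)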
        encoderContext.timeBase().num = 1;
        encoderContext.timeBase().den = encoderContext.sampleRate();

        AudioFilterGraphParameters params;
        params.channelLayout = encoderContext.channelLayout();
        params.channels = encoderContext.channels();
        params.format = encoderContext.sampleFormat();
        params.sampleRate = encoderContext.sampleRate();
        if ((status = generateGraph(decoderContext, encoderContext, params, streams[i].filterGraph)) < 0) {
            return status;
        }
    }

    // before using encoder, we should open it and update its parameters
    printf("Codec bits per sample %d\n", av_get_bits_per_sample(encoderCodecId));
    AVDictionary *options = nullptr;
    // avcodec_open2()
    if ((status = encoderContext.open(encoder, &options)) < 0) {
        return status;
    }
    if (streams[i].filterGraph) {
        streams[i].filterGraph.setOutputFrameSize(encoderContext.getFrameSize());
    }
    // avcodec_parameters_from_context()
    if ((status = encoderContext.fillParamters(outputStream->codecpar)) < 0) {
        return status;
    }
    outputStream->codecpar->format = encoderContext.getRawCtx()->pix_fmt;

    if (formatContext.getRawCtx()->oformat->flags & AVFMT_GLOBALHEADER) {
        encoderContext.getRawCtx()->flags |= CODEC_FLAG_GLOBAL_HEADER;
    }

    if (encoderContext.getRawCtx()->nb_coded_side_data) {
        int i;

        for (i = 0; i < encoderContext.getRawCtx()->nb_coded_side_data; i++) {
            const AVPacketSideData *sd_src = &encoderContext.getRawCtx()->coded_side_data[i];
            uint8_t *dst_data;

            dst_data = av_stream_new_side_data(outputStream, sd_src->type, sd_src->size);
            if (!dst_data)
                return AVERROR(ENOMEM);
            memcpy(dst_data, sd_src->data, sd_src->size);
        }
    }

    /*
     * Add global input side data. For now this is naive, and copies it
     * from the input stream's global side data. All side data should
     * really be funneled over AVFrame and libavfilter, then added back to
     * packet side data, and then potentially using the first packet for
     * global side data.
     */
    for (int i = 0; i < inputStream->nb_side_data; i++) {
        AVPacketSideData *sd = &inputStream->side_data[i];
        uint8_t *dst = av_stream_new_side_data(outputStream, sd->type, sd->size);
        if (!dst)
            return AVERROR(ENOMEM);
        memcpy(dst, sd->data, sd->size);
    }

    // copy timebase while removing common factors
    if (outputStream->time_base.num <= 0 || outputStream->time_base.den <= 0) {
        outputStream->time_base = av_add_q(encoderContext.timeBase(), (AVRational){0, 1});
    }

    // copy estimated duration as a hint to the muxer
    if (outputStream->duration <= 0 && inputStream->duration > 0) {
        outputStream->duration = av_rescale_q(inputStream->duration, inputStream->time_base, outputStream->time_base);
    }

    streams[i].codecType = encoderContext.getRawCtx()->codec_type;
    streams[i].codec = std::move(encoderContext);
    streams[i].streamIndex = i;
}

// avio_open() and avformat_write_header()
if ((status = formatContext.writeHeader(fmtOptions)) < 0) {
    return status;
}

formatContext.dumpFormat();

Reading from the input streams.

int InputProcessor::performStep() {
    int status;

    Packet nextPacket;
    if ((status = input->getFormatContext().readFrame(nextPacket)) < 0) {
        return status;
    }
    ++streams[nextPacket.getStreamIndex()].readPackets;
    int streamIndex = nextPacket.getStreamIndex();
    CodecContext &decoder = input->getDecoderAtIndex(streamIndex);
    AVStream *inputStream = input->getStreamAtIndex(streamIndex);

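    // on the first packet of a stream, copy the stream-level side data
    // (except the display matrix) into the packet unless it is already present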
    if (streams[nextPacket.getStreamIndex()].readPackets == 1) {
        for (int i = 0; i < inputStream->nb_side_data; ++i) {
            AVPacketSideData *src_sd = &inputStream->side_data[i];
            uint8_t *dst_data;

            if (src_sd->type == AV_PKT_DATA_DISPLAYMATRIX) {
                continue;
            }
            if (av_packet_get_side_data(nextPacket.getRawPtr(), src_sd->type, nullptr)) {
                continue;
            }
            dst_data = av_packet_new_side_data(nextPacket.getRawPtr(), src_sd->type, src_sd->size);
            if (!dst_data) {
                return AVERROR(ENOMEM);
            }
            memcpy(dst_data, src_sd->data, src_sd->size);
        }
    }

    nextPacket.rescaleTimestamps(inputStream->time_base, decoder.timeBase());

    status = decodePacket(&nextPacket, nextPacket.getStreamIndex());
    if (status < 0 && status != AVERROR(EAGAIN)) {
        return status;
    }
    return 0;
}

Here is the decoding/encoding code.

int InputProcessor::decodePacket(Packet *packet, int streamIndex) {
    int status;
    int sendStatus;

    auto &decoder = input->getDecoderAtIndex(streamIndex);

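    // keep resending the same packet while the decoder reports EAGAIN,
    // draining any decoded frames in between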
    do {
        if (packet == nullptr) {
            sendStatus = decoder.flushDecodedFrames();
        } else {
            sendStatus = decoder.sendPacket(*packet);
        }

        if (sendStatus < 0 && sendStatus != AVERROR(EAGAIN) && sendStatus != AVERROR_EOF) {
            return sendStatus;
        }
        if (sendStatus == 0 && packet) {
            ++streams[streamIndex].decodedPackets;
        }

        Frame decodedFrame;
        while (true) {
            if ((status = decoder.receiveFrame(decodedFrame)) < 0) {
                break;
            }
            ++streams[streamIndex].decodedFrames;
            if ((status = filterAndWriteFrame(&decodedFrame, streamIndex)) < 0) {
                break;
            }
            decodedFrame.unref();
        }
    } while (sendStatus == AVERROR(EAGAIN));

    return status;
}

int InputProcessor::encodeAndWriteFrame(Frame *frame, int streamIndex) {
    assert(input->isValid());
    assert(formatContext);

    int status = 0;
    int sendStatus;

    Packet packet;

    CodecContext &encoderContext = streams[streamIndex].codec;

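    // same send/receive pattern as decoding: drain encoded packets and
    // retry while the encoder reports EAGAIN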
    do {
        if (frame) {
            sendStatus = encoderContext.sendFrame(*frame);
        } else {
            sendStatus = encoderContext.flushEncodedPackets();
        }
        if (sendStatus < 0 && sendStatus != AVERROR(EAGAIN) && sendStatus != AVERROR_EOF) {
            return sendStatus;
        }
        if (sendStatus == 0 && frame) {
            ++streams[streamIndex].encodedFrames;
        }

        while (true) {
            if ((status = encoderContext.receivePacket(packet)) < 0) {
                break;
            }
            ++streams[streamIndex].encodedPackets;
            packet.setStreamIndex(streamIndex);
            auto sourceTimebase = encoderContext.timeBase();
            auto dstTimebase = formatContext.getStreams()[streamIndex]->time_base;
            packet.rescaleTimestamps(sourceTimebase, dstTimebase);
            if ((status = formatContext.writeFrameInterleaved(packet)) < 0) {
                return status;
            }
            packet.unref();
        }
    } while (sendStatus == AVERROR(EAGAIN));

    if (status != AVERROR(EAGAIN)) {
        return status;
    }

    return 0;
}

FFprobe output for the original video.

Input #0, matroska,webm, from 'testvideo':
  Metadata:
    title           : TestVideo
    encoder         : libebml v1.3.0 + libmatroska v1.4.0
    creation_time   : 2014-12-23T03:38:05.000000Z
  Duration: 00:02:29.25, start: 0.000000, bitrate: 79549 kb/s
    Stream #0:0(rus): Video: h264 (High 4:4:4 Predictive), yuv444p10le(pc, bt709, progressive), 2048x858 [SAR 1:1 DAR 1024:429], 24 fps, 24 tbr, 1k tbn, 48 tbc (default)
    Stream #0:1(rus): Audio: pcm_s24le, 48000 Hz, 6 channels, s32 (24 bit), 6912 kb/s (default)

Transcoded:

Input #0, mov,mp4,m4a,3gp,3g2,mj2, from '123.mp4':
  Metadata:
    major_brand     : mp42
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    encoder         : Lavf57.71.100
  Duration: 00:02:29.27, start: 0.000000, bitrate: 4282 kb/s
    Stream #0:0(rus): Video: h264 (High) (avc1 / 0x31637661), yuv420p, 1280x720 [SAR 192:143 DAR 1024:429], 3940 kb/s, 24.01 fps, 24 tbr, 12288 tbn, 96 tbc (default)
    Metadata:
      handler_name    : VideoHandler
    Stream #0:1(rus): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, 5.1, fltp, 336 kb/s (default)
    Metadata:
      handler_name    : SoundHandler

The problem was a wrong order of steps when initializing the encoder. In the transcoding.c example, CODEC_FLAG_GLOBAL_HEADER is assigned to the AVCodecContext.flags field after the avcodec_open2() call. I took that to be correct and did the same in my code. As a result, the extradata field was left uninitialized and QuickTime could not parse the resulting stream. Setting the flag before opening the codec solved the problem.

Resulting code:

        // should be placed before avcodec_open2
        if (formatContext.getRawCtx()->oformat->flags & AVFMT_GLOBALHEADER) {
            encoderContext.getRawCtx()->flags |= CODEC_FLAG_GLOBAL_HEADER;
        }

        // before using encoder, we should open it and update its parameters
        printf("Codec bits per sample %d\n", av_get_bits_per_sample(encoderCodecId));
        AVDictionary *options = nullptr;
        if ((status = encoderContext.open(encoder, &options)) < 0) {
            return status;
        }
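
For completeness, the same ordering written against the plain FFmpeg API (a sketch; ofmt, enc, codec and stream stand for an already configured AVFormatContext, AVCodecContext, AVCodec and AVStream and are not names from my code; AV_CODEC_FLAG_GLOBAL_HEADER is the non-deprecated spelling of CODEC_FLAG_GLOBAL_HEADER):

int status;

if (ofmt->oformat->flags & AVFMT_GLOBALHEADER) {
    // must be set before avcodec_open2(), otherwise extradata stays empty
    enc->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
}

// opening the encoder fills enc->extradata (SPS/PPS for H.264)
if ((status = avcodec_open2(enc, codec, nullptr)) < 0) {
    return status;
}

// copies extradata into codecpar, which the MP4 muxer uses to write the avcC box;
// empty extradata here is what makes QuickTime report a bad config record
if ((status = avcodec_parameters_from_context(stream->codecpar, enc)) < 0) {
    return status;
}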