Transcoding to vorbis using FFmpeg libraries, C++

3.9k Views Asked by At

I have made a test application to transcode to vorbis format (webm container).

So far, based on the FFmpeg examples, things are mostly working and the output file plays properly, but the sound in the right channel is missing. I have looked into several possible causes, but so far I have not found an answer.

For reference, this is the code I am using:

#include "stdafx.h"
#define MAX_AUDIO_PACKET_SIZE (128 * 1024)

#include <iostream>
#include <fstream>

#include <string>
#include <vector>
#include <map>

#include <deque>
#include <queue>

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>

extern "C"
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswscale/swscale.h"
#include "libavutil/dict.h"
#include "libavutil/error.h"
#include "libavutil/opt.h"
#include <libavutil/fifo.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
// ---------------------------------------------------------------------------
// File-scope state shared by the decode / resample / encode functions below.
// Everything here is a mutable global; the functions communicate through
// these variables rather than through parameters.
// ---------------------------------------------------------------------------

// Target audio codec. The code below compares this against AV_CODEC_ID_VORBIS
// wherever it chooses between AV_SAMPLE_FMT_FLTP and AV_SAMPLE_FMT_S16.
AVCodecID           outputAudioFormat = AV_CODEC_ID_VORBIS;

// Video-related settings. In this listing only add_stream() reads
// STREAM_FRAME_RATE and STREAM_PIX_FMT; sws_flags and STREAM_DURATION are not
// referenced by any visible code.
static int sws_flags = SWS_BICUBIC;
#define STREAM_DURATION   50.0
#define STREAM_FRAME_RATE 25 /* 25 images/s */
#define STREAM_PIX_FMT    AV_PIX_FMT_YUV420P /* default pix_fmt */

// --- Input (demux / decode / resample) side --------------------------------
// fmt_ctx is opened by open_audio_input(); input_packet is filled by
// av_read_frame() in decode_frame(); decoded_frame receives the decoder output
// and audio_dst_data receives the swr_convert() output in decode_packet().
AVFormatContext*    fmt_ctx= NULL;
int                    audio_stream_index = -1;
AVCodecContext *    codec_ctx_audio = NULL;
AVCodec*            codec_audio = NULL;
AVFrame*            decoded_frame = NULL;
uint8_t**            audio_dst_data = NULL;
int                    got_frame = 0;
int                    audiobufsize = 0;
AVPacket            input_packet;
int                    audio_dst_linesize = 0;
int                    audio_dst_bufsize = 0;
SwrContext *        swrContext = NULL;

// --- Output (encode / mux) side --------------------------------------------
// output_fmt_ctx / output_format are set up in open_encoder(); audio_st and
// audio_codec come from add_audio_stream(). video_st and video_codec are not
// assigned by any visible code.
AVOutputFormat *    output_format = NULL ;
AVFormatContext *    output_fmt_ctx= NULL;
AVStream *            audio_st = NULL;
AVStream*           video_st = NULL;
AVCodec *            audio_codec = NULL;
AVCodec*            video_codec = NULL;
double                audio_pts = 0.0;
// Frame handed to the encoder. It is allocated during static initialization,
// which requires C++ (a function call is not a constant initializer in C).
AVFrame *            out_frame = avcodec_alloc_frame();

// Samples per encoder frame; overwritten in open_audio() and copied into
// out_frame->nb_samples there and in write_audio_frame().
int                    audio_input_frame_size = 64;

uint8_t *            audio_data_buf = NULL;
uint8_t *            audio_out = NULL;
// Encoder parameters: main() copies them from the decoder context and
// add_audio_stream() applies them to the output codec context.
int                    audio_bit_rate;
int                    audio_sample_rate;
int                    audio_channels;
// Both are set to the decoder's sample rate in open_audio_input(), so the
// av_rescale_rnd() calls that use them rescale by a ratio of 1.
int                 sourceSampleRate=0;
int                 destSampleRate = 0;

int                 dst_nb_samples = 0;
int                 pivotIndex = 0;
int                 max_dst_nb_samples = 0;
// Running total of samples submitted to the encoder; write_audio_frame()
// derives each frame's pts from it.
int                 samples_count=0;

// Forward declarations for the input side.
int decode_packet();
int open_audio_input(char* src_filename);
int decode_frame();

// Forward declarations for the output side.
int open_encoder(char* output_filename);
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
    enum AVCodecID codec_id);
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st);
void close_audio(AVFormatContext *oc, AVStream *st);
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize);

// Video scratch state used by open_video(); frame_count is not referenced by
// any visible code.
static AVFrame *frame;
static AVPicture src_picture, dst_picture;
static int frame_count;
/* Add an output stream.
 *
 * Looks up the encoder for codec_id (stored through *codec), creates a new
 * stream on oc, fills in default codec parameters and returns the stream.
 *
 * NOTE(review): this listing contains no braces, no `case` labels, no `break`
 * statements and no error exits, and the fprintf at the "Could not find
 * encoder" line is cut off mid-call. The text appears to have lost lines when
 * it was extracted, so the control flow described below is inferred from the
 * order of the statements and should be confirmed against the real source.
 * No visible code calls this function; open_encoder() uses
 * add_audio_stream() instead. */
static AVStream *add_stream(AVFormatContext *oc, AVCodec **codec,
                            enum AVCodecID codec_id)
    AVCodecContext *c;
    AVStream *st;

    /* find the encoder */
    *codec = avcodec_find_encoder(codec_id);
    if (!(*codec)) {
        fprintf(stderr, "Could not find encoder for '%s'\n",

    st = avformat_new_stream(oc, *codec);
    if (!st) {
        fprintf(stderr, "Could not allocate stream\n");
    /* Stream id is the index of the stream that was just appended. */
    st->id = oc->nb_streams-1;
    c = st->codec;

    /* Dispatch on the encoder's media type. The first group of assignments
     * sets audio parameters and the second group sets video parameters;
     * presumably these were the AVMEDIA_TYPE_AUDIO and AVMEDIA_TYPE_VIDEO
     * cases -- TODO confirm, the labels are not visible here. */
    switch ((*codec)->type) {
        /* Audio defaults: first sample format the encoder advertises, or
         * planar float when it advertises none. */
        c->sample_fmt  = (*codec)->sample_fmts ?
            (*codec)->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
        c->bit_rate    = 64000;
        c->sample_rate = 44100;
        c->channels    = 2;

        c->codec_id = codec_id;

        c->bit_rate = 400000;
        /* Resolution must be a multiple of two. */
        c->width    = 352;
        c->height   = 288;
        /* timebase: This is the fundamental unit of time (in seconds) in terms
         * of which frame timestamps are represented. For fixed-fps content,
         * timebase should be 1/framerate and timestamp increments should be
         * identical to 1. */
        c->time_base.den = STREAM_FRAME_RATE;
        c->time_base.num = 1;
        c->gop_size      = 12; /* emit one intra frame every twelve frames at most */
        c->pix_fmt       = STREAM_PIX_FMT;
        if (c->codec_id == AV_CODEC_ID_MPEG2VIDEO) {
            /* just for testing, we also add B frames */
            c->max_b_frames = 2;
        if (c->codec_id == AV_CODEC_ID_MPEG1VIDEO) {
            /* Needed to avoid using macroblocks in which some coeffs overflow.
             * This does not happen with normal video, it just happens here as
             * the motion of the chroma plane does not match the luma plane. */
            c->mb_decision = 2;


    /* Some formats want stream headers to be separate. */
    if (oc->oformat->flags & AVFMT_GLOBALHEADER)
        c->flags |= CODEC_FLAG_GLOBAL_HEADER;

    return st;

/* Open the video encoder attached to st and allocate the picture buffers.
 *
 * oc    - output format context (not used by the visible statements)
 * codec - encoder to open on st->codec
 * st    - stream whose codec context supplies pix_fmt / width / height
 *
 * Side effects: allocates the file-scope `frame`, fills `dst_picture`, and
 * fills `src_picture` when the encoder's pixel format is not YUV420P.
 *
 * NOTE(review): braces and the bodies of the error branches are not present
 * in this listing (the only statements under the `ret < 0` checks are
 * commented-out fprintf calls), so what happens on failure cannot be
 * determined from here. No visible code calls this function. */
static void open_video(AVFormatContext *oc, AVCodec *codec, AVStream *st)
    int ret;
    AVCodecContext *c = st->codec;

    /* open the codec */
    ret = avcodec_open2(c, codec, NULL);
    if (ret < 0) {
        //fprintf(stderr, "Could not open video codec: %s\n", av_err2str(ret));

    /* allocate and init a re-usable frame */
    frame = av_frame_alloc();
    if (!frame) {
        fprintf(stderr, "Could not allocate video frame\n");
    frame->format = c->pix_fmt;
    frame->width = c->width;
    frame->height = c->height;

    /* Allocate the encoded raw picture. */
    ret = avpicture_alloc(&dst_picture, c->pix_fmt, c->width, c->height);
    if (ret < 0) {
        //fprintf(stderr, "Could not allocate picture: %s\n", av_err2str(ret));

    /* If the output format is not YUV420P, then a temporary YUV420P
     * picture is needed too. It is then converted to the required
     * output format. */
    if (c->pix_fmt != AV_PIX_FMT_YUV420P) {
        ret = avpicture_alloc(&src_picture, AV_PIX_FMT_YUV420P, c->width, c->height);
        if (ret < 0) {
            //fprintf(stderr, "Could not allocate temporary picture: %s\n",
            //        av_err2str(ret));

    /* copy data and linesize picture pointers to frame */
    /* The cast overwrites the leading members of *frame with dst_picture,
     * i.e. it relies on AVFrame starting with the same fields as AVPicture. */
    *((AVPicture *)frame) = dst_picture;

/* Open src_filename for reading, open a decoder for its audio stream and
 * prepare the resampler and the destination plane-pointer array.
 *
 * src_filename - path of the input media file
 *
 * Returns -1 on the visible failure paths (stream info not found, decoder
 * missing, decoder failed to open). No success return statement is visible.
 *
 * Side effects: sets fmt_ctx, swrContext, sourceSampleRate, destSampleRate,
 * decoded_frame and audio_dst_data.
 *
 * NOTE(review): this listing has no braces and appears to be missing lines:
 * the body of the stream-search loop (which would set audio_stream_index),
 * the assignment of codec_ctx_audio and the avcodec_find_decoder() call are
 * not visible, although the code after them depends on all three. Confirm
 * against the real source. */
int open_audio_input(char* src_filename)
    int i =0;
    /* open input file, and allocate format context */
    if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0)
        fprintf(stderr, "Could not open source file %s\n", src_filename);

    // Retrieve stream information
    if(avformat_find_stream_info(fmt_ctx, NULL)<0)
        return -1; // Couldn't find stream information

    // Dump information about file onto standard error
    av_dump_format(fmt_ctx, 0, src_filename, 0);

    // Find the first video stream
    // (The check that follows tests audio_stream_index, so this is presumably
    //  a search for the audio stream; the loop body is not visible.)
    for(i=0; i<fmt_ctx->nb_streams; i++)
    if ( audio_stream_index != -1 )
        // Get a pointer to the codec context for the audio stream

        // Find the decoder for the video stream
        if(codec_audio==NULL) {
            fprintf(stderr, "Unsupported audio codec!\n");
            return -1; // Codec not found

        // Open codec
        AVDictionary *codecDictOptions = NULL;
        if(avcodec_open2(codec_ctx_audio, codec_audio, &codecDictOptions)<0)
            return -1; // Could not open codec

        // Set up SWR context once you've got codec information
        // Input and output use the same channel layout and sample rate, both
        // taken from the decoder, so only the sample format can differ.
        swrContext = swr_alloc();
        av_opt_set_int(swrContext, "in_channel_layout",  codec_ctx_audio->channel_layout, 0);
        av_opt_set_int(swrContext, "out_channel_layout", codec_ctx_audio->channel_layout,  0);
        av_opt_set_int(swrContext, "in_sample_rate",     codec_ctx_audio->sample_rate, 0);
        av_opt_set_int(swrContext, "out_sample_rate",    codec_ctx_audio->sample_rate, 0);
        av_opt_set_sample_fmt(swrContext, "in_sample_fmt",  codec_ctx_audio->sample_fmt, 0);
        // As listed, both assignments below would run when the target is
        // Vorbis, leaving S16 as the output format; an `else` was presumably
        // lost between them -- TODO confirm.
        if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
            av_opt_set_sample_fmt(swrContext, "out_sample_fmt", AV_SAMPLE_FMT_FLTP,  0);
            av_opt_set_sample_fmt(swrContext, "out_sample_fmt", AV_SAMPLE_FMT_S16,  0);
        // The result of swr_init() is stored in rv but not checked in the
        // visible code.
        int rv = swr_init(swrContext);

        sourceSampleRate    =   destSampleRate = codec_ctx_audio->sample_rate;

        // Allocate audio frame
        if ( decoded_frame == NULL ) decoded_frame = avcodec_alloc_frame();
        int nb_planes = 0;
        AVStream* audio_stream = fmt_ctx->streams[audio_stream_index];
        // NOTE(review): the plane count is derived from the DECODER's sample
        // format, but audio_dst_data holds the resampler OUTPUT. If the input
        // is packed (one plane) and the output is planar FLTP with two
        // channels, this array has room for one pointer while the later
        // av_samples_alloc() call would presumably store one per channel --
        // confirm and size it from the output format instead.
        nb_planes = av_sample_fmt_is_planar(codec_ctx_audio->sample_fmt) ? codec_ctx_audio->channels : 1;
        int tempSize =  sizeof(uint8_t *) * nb_planes;
        audio_dst_data = (uint8_t**)av_mallocz(tempSize);
        if (!audio_dst_data)
            fprintf(stderr, "Could not allocate audio data buffers\n");
            // As listed, this loop sits under the allocation-failure check;
            // presumably it belonged to a lost `else` branch -- TODO confirm.
            for ( int i = 0 ; i < nb_planes ; i ++ )
                audio_dst_data[i] = NULL;

/* Read the next packet from the input file and pass it to decode_packet().
 *
 * Returns 0 without doing anything when no input is open (fmt_ctx is NULL),
 * the negative av_read_frame() result when reading fails (which includes end
 * of file), and otherwise whatever decode_packet() returns.
 *
 * Side effects: resets got_frame and audiobufsize, and fills the file-scope
 * input_packet. */
int decode_frame()
    int rv = 0;
    got_frame = 0;
    if ( fmt_ctx == NULL  )
        return rv;
    int ret = 0;
    audiobufsize = 0;
    rv = av_read_frame(fmt_ctx, &input_packet);
    if ( rv < 0 )
        return rv;
    rv = decode_packet();
    // Free the input_packet that was allocated by av_read_frame
    // NOTE(review): no call that frees the packet is visible after the
    // comment above; either the line was lost from this listing or the packet
    // is leaked on every call -- confirm.
    return rv;

/* Decode the audio carried by the file-scope input_packet and convert the
 * decoded samples into audio_dst_data with the resampler.
 *
 * Returns rv, which no visible statement changes from 0.
 *
 * Side effects: updates got_frame, decoded_frame, dst_nb_samples,
 * max_dst_nb_samples, audio_dst_data, audio_dst_linesize and
 * audio_dst_bufsize, and consumes input_packet.size.
 *
 * NOTE(review): this listing has no braces or `else` keywords and two
 * statements have lost their left-hand side (the lines beginning `+= result;`
 * and `=   NULL;`, presumably assignments to input_packet.data). The nesting
 * shown by the indentation is therefore a guess at the original structure. */
int decode_packet()
    int rv = 0;
    int ret = 0;

    //audio stream?
    if(input_packet.stream_index == audio_stream_index)
        // A packet may hold more than one frame; keep decoding until it is
        // fully consumed.
        while( input_packet.size > 0 )
            int result = avcodec_decode_audio4(codec_ctx_audio, decoded_frame, &got_frame, &input_packet);
            if ( result < 0)
                fprintf(stderr, "Error decoding audio frame\n");
                //return ret;
                if ( got_frame )
                    // Upper bound on the samples the resampler can emit for
                    // this frame, including samples it has buffered.
                    dst_nb_samples = (int)av_rescale_rnd(swr_get_delay(swrContext, sourceSampleRate) + decoded_frame->nb_samples, sourceSampleRate, destSampleRate, AV_ROUND_UP);
                    if ( dst_nb_samples > max_dst_nb_samples )
                        max_dst_nb_samples = dst_nb_samples;
                        // NOTE(review): the old buffer pointer is dropped
                        // here and no av_free/av_freep call is visible, so
                        // the previous allocation looks leaked -- confirm.
                        if ( audio_dst_data[0] )
                            audio_dst_data[0] = NULL;
                    if ( audio_dst_data[0] == NULL )
                        // NOTE(review): the buffer is sized for
                        // decoded_frame->nb_samples, not for dst_nb_samples /
                        // max_dst_nb_samples computed just above, and it uses
                        // align = 0 (the library's default alignment).
                        if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
                            ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, codec_ctx_audio->channels,
                                decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 0);
                            ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, codec_ctx_audio->channels,
                                decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_S16, 0);
                    /* TODO: extend return code of the av_samples_* functions so that this call is not needed */
                    // NOTE(review): the output capacity passed here is
                    // out_frame->nb_samples (the encoder frame size set in
                    // open_audio), while the buffer above holds
                    // decoded_frame->nb_samples per channel. If the former is
                    // larger, the resampler may be told it has more room than
                    // was allocated -- confirm.
                    int resampled  = swr_convert(swrContext, audio_dst_data, out_frame->nb_samples,
                        (const uint8_t **)(decoded_frame->extended_data), decoded_frame->nb_samples);
                    // str is formatted but not printed by any visible code.
                    char str[900]="";
                    sprintf(str,"out_frame->nb_samples:\t%d; decoded_frame->nb_samples:\t%d",out_frame->nb_samples,decoded_frame->nb_samples );
                    // Byte size of `resampled` samples with align = 1 (no
                    // padding). Note this differs from the align = 0 used
                    // when the buffer was allocated, and it overwrites
                    // audio_dst_linesize with the unpadded value.
                    if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
                        audio_dst_bufsize  = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 1);
                        audio_dst_bufsize  = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_S16, 1);

                    // Advance past the bytes the decoder consumed.
                    input_packet.size -= result;
           += result;
                    input_packet.size   =   0;
             =   NULL;
    return rv;

/* Create the output container for output_filename, add and open the audio
 * stream, open the output file and write the container header.
 *
 * output_filename - path of the file to create; the container format is
 *                   deduced from its extension, with "mpeg" as the fallback
 *
 * Returns rv: 0 initially, -1 on the visible failure paths, or the negative
 * result of open_audio().
 *
 * Side effects: sets output_fmt_ctx, output_format and audio_st.
 *
 * NOTE(review): braces and `else` keywords are not present in this listing,
 * so several statements (for example `output_format = ...` directly after
 * `rv = -1;`) read as if they were in a failure branch when they presumably
 * belonged to the matching `else` -- confirm against the real source. */
int open_encoder(char* output_filename )
    int rv = 0;

    /* allocate the output media context */
    AVOutputFormat *opfmt = NULL;

    avformat_alloc_output_context2(&output_fmt_ctx, opfmt, NULL, output_filename);
    if (!output_fmt_ctx) {
        printf("Could not deduce output format from file extension: using MPEG.\n");
        avformat_alloc_output_context2(&output_fmt_ctx, NULL, "mpeg", output_filename);
    if (!output_fmt_ctx) {
        rv = -1;
        output_format = output_fmt_ctx->oformat;

    /* Add the audio stream using the default format codecs
    * and initialize the codecs. */
    audio_st = NULL;

    if ( output_fmt_ctx )
        /* Use the container's default audio codec, not outputAudioFormat. */
        if (output_format->audio_codec != AV_CODEC_ID_NONE)
            audio_st = add_audio_stream(output_fmt_ctx, &audio_codec, output_format->audio_codec);

        /* Now that all the parameters are set, we can open the audio and
        * video codecs and allocate the necessary encode buffers. */

        if (audio_st)
            rv = open_audio(output_fmt_ctx, audio_codec, audio_st);
            if ( rv < 0 ) return rv;

        av_dump_format(output_fmt_ctx, 0, output_filename, 1);
        /* open the output file, if needed */
        if (!(output_format->flags & AVFMT_NOFILE))
            if (avio_open(&output_fmt_ctx->pb, output_filename, AVIO_FLAG_WRITE) < 0) {
                fprintf(stderr, "Could not open '%s'\n", output_filename);
                rv = -1;
                /* Write the stream header, if any. */
                if (avformat_write_header(output_fmt_ctx, NULL) < 0)
                    fprintf(stderr, "Error occurred when opening output file\n");
                    rv = -1;

    return rv;

/* Create an audio stream on oc for the given encoder and configure its codec
 * context from the file-scope audio_bit_rate / audio_sample_rate /
 * audio_channels values.
 *
 * oc       - output format context that receives the new stream
 * codec    - out: receives the encoder found for codec_id
 * codec_id - encoder to look up
 *
 * Returns the new stream.
 *
 * NOTE(review): no braces, error exits or `else` are present in this listing,
 * so the behaviour when the encoder or the stream cannot be created cannot be
 * determined from here. Only `channels` is set in the visible code;
 * `channel_layout` is not -- confirm whether the encoder needs it. */
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
    enum AVCodecID codec_id)
    AVCodecContext *c;
    AVStream *st;

    /* find the audio encoder */
    *codec = avcodec_find_encoder(codec_id);
    if (!(*codec)) {
        fprintf(stderr, "Could not find codec\n");

    st = avformat_new_stream(oc, *codec);
    if (!st) {
        fprintf(stderr, "Could not allocate stream\n");
    st->id = 1;

    c = st->codec;

    /* put sample parameters */
    /* As listed, both assignments run for Vorbis and S16 wins; an `else` was
     * presumably lost between them -- TODO confirm. The test is on the global
     * outputAudioFormat, not on the codec_id argument. */
    if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
        c->sample_fmt  = AV_SAMPLE_FMT_FLTP;
        c->sample_fmt  = AV_SAMPLE_FMT_S16;

    c->bit_rate    = audio_bit_rate;
    c->sample_rate = audio_sample_rate;
    c->channels    = audio_channels;

    // some formats want stream headers to be separate
    if (oc->oformat->flags & AVFMT_GLOBALHEADER)
        c->flags |= CODEC_FLAG_GLOBAL_HEADER;

    return st;

/* Open the audio encoder for st and record how many samples each encoder
 * frame should contain.
 *
 * oc    - output format context (not used by the visible statements)
 * codec - encoder to open on st->codec
 * st    - output audio stream; its duration is copied from the input file
 *
 * Returns -1 when the encoder cannot be opened, otherwise the
 * avcodec_open2() result.
 *
 * Side effects: sets audio_input_frame_size and out_frame->nb_samples.
 *
 * NOTE(review): braces and `else` are missing from this listing, and the
 * `tempSize` expression near the end is cut off after the trailing `*`. */
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st)
    int ret=0;
    AVCodecContext *c;

    st->duration = fmt_ctx->duration;
    c = st->codec;

    /* open it */
    ret = avcodec_open2(c, codec, NULL) ;
    if ( ret < 0)
        fprintf(stderr, "could not open codec\n");
        return -1;

    /* Encoders that accept any frame size get an arbitrary 10000 samples;
     * otherwise the encoder's own frame_size is used. (The `else` between the
     * two assignments is presumably missing from this listing.) */
    if (c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)
        audio_input_frame_size = 10000;
        audio_input_frame_size = c->frame_size;
    out_frame->nb_samples = audio_input_frame_size;
    int tempSize = audio_input_frame_size *
        av_get_bytes_per_sample(c->sample_fmt) *
    return ret;

/* Counterpart to open_audio(); main() calls it once after the delayed frames
 * have been flushed. Only the signature is present in this listing -- the
 * body is not visible, so what it releases cannot be stated from here. */
void close_audio(AVFormatContext *oc, AVStream *st)

/* Encode one block of resampled audio and write the resulting packet to the
 * output file.
 *
 * audio_dst_data    - plane-pointer array holding the resampled samples
 *                     (this parameter shadows the global of the same name)
 * audio_dst_bufsize - size in bytes of the sample data (shadows the global)
 *
 * Returns nothing; does nothing when the output context or the audio stream
 * is missing.
 *
 * Side effects: updates out_frame and samples_count, and writes to
 * output_fmt_ctx.
 *
 * NOTE(review): the bodies of the `ret < 0`, `!got_packet` and `ret != 0`
 * checks are not present in this listing.
 *
 * NOTE(review), likely relevant to a missing channel: only
 * audio_dst_data[0] is handed to avcodec_fill_audio_frame(), with align = 0
 * and out_frame->nb_samples fixed to the encoder frame size. As I understand
 * the FFmpeg API, for a planar format that call recomputes the second plane's
 * address from the first pointer and nb_samples, rather than using the
 * pointer the resampler actually wrote to (audio_dst_data[1]). The buffer in
 * decode_packet() was laid out for decoded_frame->nb_samples, so if the two
 * sample counts differ, the second channel would be read from the wrong
 * offset -- confirm against the avcodec_fill_audio_frame documentation. */
void write_audio_frame(uint8_t ** audio_dst_data, int audio_dst_bufsize)
    AVFormatContext *oc = output_fmt_ctx;
    AVStream *st = audio_st;
    if ( oc == NULL || st == NULL ) return;
    AVCodecContext *c;
    AVPacket pkt = { 0 }; // data and size must be 0;
    int got_packet=0, ret=0;

    c = st->codec;

    /* Always claims a full encoder frame, regardless of how many samples the
     * resampler produced for this call. */
    out_frame->nb_samples = audio_input_frame_size;

    /* pts = samples written so far, converted from 1/sample_rate units to
     * the codec time base. */
    AVRational r;
    r.num = 1;
    r.den = c->sample_rate;
    out_frame->pts = av_rescale_q(samples_count, (AVRational)r, c->time_base);
    avcodec_fill_audio_frame(out_frame, c->channels, c->sample_fmt,
                             audio_dst_data[0], audio_dst_bufsize, 0);
    samples_count += out_frame->nb_samples;

    ret = avcodec_encode_audio2(c, &pkt, out_frame, &got_packet);
    if (ret < 0) 

    if (!got_packet)

    /* rescale output packet timestamp values from codec to stream timebase */
    pkt.pts = av_rescale_q_rnd(pkt.pts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
    pkt.dts = av_rescale_q_rnd(pkt.dts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
    pkt.duration = av_rescale_q(pkt.duration, c->time_base, st->time_base);
    pkt.stream_index = st->index;

    /* str is declared but not used by any visible statement. */
    char str[999]="";
    /* Write the compressed frame to the media file. */
    ret = av_interleaved_write_frame(oc, &pkt);
    if (ret != 0) 

/* Drain the audio encoder: keep calling it with a NULL frame and write every
 * packet it still returns, until it reports no more output.
 *
 * oc - output format context to write the packets to
 * st - output audio stream whose encoder is flushed
 *
 * NOTE(review): braces are missing from this listing and the packet
 * declaration is malformed as shown (`AVPacket pkt; = NULL;`, presumably a
 * lost `pkt.data = NULL;`). No av_init_packet() call is visible, so other
 * packet fields may be uninitialized when first passed to the encoder --
 * confirm. tempPts and tempDts are declared but not used by any visible
 * statement. */
void write_delayed_frames(AVFormatContext *oc, AVStream *st)
    AVCodecContext *c = st->codec;
    int got_output = 0;
    int ret = 0;
    AVPacket pkt; = NULL;
    pkt.size = 0;
    int i = 0;
    /* Loop runs at least once and stops when the encoder yields nothing. */
    for (got_output = 1; got_output; i++)
        ret = avcodec_encode_audio2(c, &pkt, NULL, &got_output);
        if (ret < 0)
            fprintf(stderr, "error encoding frame\n");
        static int64_t tempPts = 0;
        static int64_t tempDts = 0;
        /* If size is zero, it means the image was buffered. */
        if (got_output)
            /* Convert timestamps from the codec time base to the stream's. */
            pkt.pts = av_rescale_q_rnd(pkt.pts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
            pkt.dts = av_rescale_q_rnd(pkt.dts, c->time_base, st->time_base, (AVRounding )(AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX));
            pkt.duration = av_rescale_q(pkt.duration, c->time_base, st->time_base);
            pkt.stream_index = st->index;
            if ( c && c->coded_frame && c->coded_frame->key_frame)
                pkt.flags |= AV_PKT_FLAG_KEY;
            /* Write the compressed frame to the media file. */
            ret = av_interleaved_write_frame(oc, &pkt);
            /* The write result is discarded by the assignment below. */
            ret = 0;

/* Test driver: transcode the hard-coded input file "test.mp2" to
 * "output.webm" with Vorbis audio.
 *
 * Visible sequence: copy the decoder's bit rate / sample rate / channel count
 * into the encoder globals, open the encoder, decode and encode frames, drain
 * what is left in the resampler, flush the encoder and close the audio
 * stream. argc and argv are not used. Always returns 0.
 *
 * NOTE(review): this listing has no braces, loop headers or `else` keywords
 * and appears to be missing lines. codec_ctx_audio is dereferenced before any
 * visible call to open_audio_input() (and no library registration call is
 * visible despite the first comment), the bare `if ( rv < 0 )` has no body,
 * and the `break` further down implies enclosing loops that are not shown.
 * No call that writes the container trailer or closes the output file is
 * visible either. Confirm all of this against the real source. */
int main(int argc, char **argv)
    /* register all formats and codecs */
    int i =0;
    int ret=0;
    char src_filename[90] = "test.mp2";
    char dst_filename[90] = "output.webm";
    outputAudioFormat = AV_CODEC_ID_VORBIS;
    /* Fall back to 112 kbit/s when the decoder reports no bit rate. */
    if ( codec_ctx_audio->bit_rate == 0 ) codec_ctx_audio->bit_rate = 112000;
    audio_bit_rate        = codec_ctx_audio->bit_rate;
    audio_sample_rate    = codec_ctx_audio->sample_rate;
    audio_channels        = codec_ctx_audio->channels;
    /* The return value of open_encoder() is not checked. */
    open_encoder( dst_filename );
    int frames= 0;
        /* Main transcode step: decode one packet and, when it produced a
         * frame, encode the resampled samples. */
        int rv = decode_frame();
        if ( rv < 0 )

        if (audio_st)
            audio_pts = audio_st->pts.val * av_q2d(audio_st->time_base);
            audio_pts = 0.0;
        if ( codec_ctx_audio )
            if ( got_frame )
                write_audio_frame( audio_dst_data, audio_dst_bufsize );
        printf("\naudio_pts: %f", audio_pts);
        /* Drain step: repeats the buffer sizing from decode_packet() and then
         * calls swr_convert() with no input so the resampler hands back the
         * samples it is still holding. */
        dst_nb_samples = (int)av_rescale_rnd(swr_get_delay(swrContext, sourceSampleRate) + decoded_frame->nb_samples, sourceSampleRate, destSampleRate, AV_ROUND_UP);
        if ( dst_nb_samples > max_dst_nb_samples )
            max_dst_nb_samples = dst_nb_samples;
            if ( audio_dst_data[0] )
                audio_dst_data[0] = NULL;
        if ( audio_dst_data[0] == NULL )
            if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
                ret = av_samples_alloc(audio_dst_data, NULL, codec_ctx_audio->channels,
                    decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 0);
                ret = av_samples_alloc(audio_dst_data, NULL, codec_ctx_audio->channels,
                    decoded_frame->nb_samples, (AVSampleFormat)AV_SAMPLE_FMT_S16, 0);
        int resampled = swr_convert(swrContext, audio_dst_data, out_frame->nb_samples,NULL, 0);
        if ( outputAudioFormat == AV_CODEC_ID_VORBIS )
            audio_dst_bufsize  = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_FLTP, 1);
            audio_dst_bufsize  = av_samples_get_buffer_size(&audio_dst_linesize, decoded_frame->channels, resampled, (AVSampleFormat)AV_SAMPLE_FMT_S16, 1);
        /* Stop draining once the resampler returns no more data. */
        if ( audio_dst_bufsize <= 0 ) break;
        audio_pts = audio_st->pts.val * av_q2d(audio_st->time_base);
        printf("\naudio_pts: %f", audio_pts);
        write_audio_frame( audio_dst_data, audio_dst_bufsize );
    /* Flush packets still buffered inside the encoder, then clean up. */
    write_delayed_frames( output_fmt_ctx, audio_st );
    close_audio( output_fmt_ctx, audio_st);
    return 0;

Working under Windows 7, Zeranoe FFmpeg 32 bit build:

libavutil      52. 62.100 / 52. 62.100
libavcodec     55. 47.101 / 55. 47.101
libavformat    55. 22.103 / 55. 22.103
libavdevice    55.  5.102 / 55.  5.102
libavfilter     4.  1.100 /  4.  1.100
libswscale      2.  5.101 /  2.  5.101
libswresample   0. 17.104 /  0. 17.104
libpostproc    52.  3.100 / 52.  3.100

Could anyone point to the place where I might be misunderstanding things?

Thanks for any guidance in advance!


There are 2 best solutions below


I think I finally found the solution. The resampling example that ships with FFmpeg (at least with the version I have) can be misleading and probably needs to be corrected. According to the documentation of swr_convert, audio_dst_data can even be an oversized buffer, in order to avoid buffering:

 * If more input is provided than output space then the input will be buffered.
 * You can avoid this buffering by providing more output space than input.
 * Convertion will run directly without copying whenever possible.

This statement may be incorrect: in theory it is sound and in practice it produces no obvious errors, but, as I discovered, it sometimes results in odd behavior.

My solution: do not let audio_dst_data buffer size exceed output codec's frame size - then it works perfectly.

Maybe someone would fix swresample library, or resampling example, or, at least document it more clearly.


Most likely the resampler isn't initialized or used correctly. Could you change it the way I'm using it here: ?