Start and end time of MoviePy's VideoClip not working

16 Views Asked by At

I'm trying to add captions to a video. The desired outcome is to show each word in the exact moment is being said.

I have a method that gives me the accurate time start and end per each word:

def get_words_per_time(audio_speech_file):
    model = whisper.load_model("base")
    transcribe = model.transcribe(
        audio=audio_speech_file, fp16=False, word_timestamps=True
    )
    segments = transcribe["segments"]
    words = []

    for seg in segments:
        for word in seg["words"]:
            words.append(
                {
                    "word": word["word"],
                    "start": word["start"],
                    "end": word["end"],
                    "prob": round(word["probability"], 4),
                }
            )
    return words

Then I have a code that uses MoviePy to create TextClip and assing a given start and end time per pair of words (I know there are redundant statements, srry):

def generate_captions(
    words,
    font="Komika",
    fontsize=32,
    color="White",
    align="center",
    stroke_width=3,
    stroke_color="black",
):
    text_comp = []
    for i in track(range(0, len(words), 2), description="Creating captions..."):
        word1 = words[i]
        if i + 1 < len(words):
            word2 = words[i + 1]
        text_clip = TextClip(
            f"{word1['word']} {word2['word'] if i + 1 < len(words) else ''}",
            font=font,  # Change Font if not found
            fontsize=fontsize,
            color=color,
            align=align,
            method="caption",
            size=(660, None),
            stroke_width=stroke_width,
            stroke_color=stroke_color,
        )
        text_clip = text_clip.set_start(word1["start"])
        text_clip = text_clip.set_end(
            word2["end"] if i + 1 < len(words) else word1["end"]
        )
        text_comp.append(text_clip)
    return text_comp

Finally, I concatenate the words into a single video:

vid_clip = CompositeVideoClip(
    [vid_clip, concatenate_videoclips(text_comp).set_position(("center", 860))]
)

The output is this, but you can clearly see the words are not flowing with the speech. They somehow move faster as if the start/end time did not matter. Here's the video

The words with their respective start/end time, look like this:

[
    {
        'word': 'This',
        'start': 0.0,
        'end': 0.22,
        'prob': 0.805
    },
    {
        'word': 'is',
        'start': 0.22,
        'end': 0.42,
        'prob': 0.9991
    },
    {
        'word': 'a',
        'start': 0.42,
        'end': 0.6,
        'prob': 0.999
    },
    {
        'word': 'test,
        ',
        'start': 0.6,
        'end': 1.04,
        'prob': 0.9939
    },
    {
        'word': 'to',
        'start': 1.18,
        'end': 1.3,
        'prob': 0.9847
    },
    {
        'word': 'show',
        'start': 1.3,
        'end': 1.54,
        'prob': 0.9971
    },
    {
        'word': 'words',
        'start': 1.54,
        'end': 1.9,
        'prob': 0.995
    },
    {
        'word': 'does',
        'start': 1.9,
        'end': 2.16,
        'prob': 0.997
    },
    {
        'word': 'not',
        'start': 2.16,
        'end': 2.4,
        'prob': 0.9978
    },
    {
        'word': 'appear.',
        'start': 2.4,
        'end': 2.82,
        'prob': 0.9984
    },
    {
        'word': 'At',
        'start': 3.46,
        'end': 3.6,
        'prob': 0.9793
    },
    {
        'word': 'their',
        'start': 3.6,
        'end': 3.8,
        'prob': 0.9984
    },
    {
        'word': 'proper',
        'start': 3.8,
        'end': 4.22,
        'prob': 0.9976
    },
    {
        'word': 'time.',
        'start': 4.22,
        'end': 4.72,
        'prob': 0.999
    },
    {
        'word': 'Thanks',
        'start': 5.04,
        'end': 5.4,
        'prob': 0.9662
    },
    {
        'word': 'for,
        ',
        'start': 5.4,
        'end': 5.66,
        'prob': 0.9941
    },
    {
        'word': 'watching.',
        'start': 5.94,
        'end': 6.36,
        'prob': 0.7701
    }
]

What could be causing this?

1

There are 1 best solutions below

0
ernesto casco velazquez On

The quick solution I found was just to give each word a duration of end - start in seconds. Then, for every speech pause, add a TextClip with one letter and fontSize=1 so it is not visible. This way, the captions appear at the time they should. I know it is not the most orthodox solution, but it works.

def generate_captions(
    words,
    font="Komika",
    fontsize=32,
    color="White",
    align="center",
    stroke_width=3,
    stroke_color="black",
):
    text_comp = []
    prev_end = 0

    blank_space = TextClip(
        "i",
        font=font,  # Change Font if not found
        fontsize=1,
        color="black",
        align="East",
        method="caption",
        size=(660, None),
        stroke_width=0,
    )
    for i in track(range(0, len(words), 2), description="Creating captions..."):
        word1 = words[i]
        word2 = word1
        if i + 1 < len(words):
            word2 = words[i + 1]
        text_clip = TextClip(
            f"{word1['word']} {word2['word'] if i + 1 < len(words) else ''}",
            font=font,  # Change Font if not found
            fontsize=fontsize,
            color=color,
            align=align,
            method="caption",
            size=(660, None),
            stroke_width=stroke_width,
            stroke_color=stroke_color,
        )
        text_clip = text_clip.set_duration(word2["end"] - word1["start"])
        if prev_end != word1["start"]:
            blank_space = blank_space.set_duration(word1["start"] - prev_end)
            text_comp.append(blank_space)
        prev_end = word2["end"]
        text_comp.append(text_clip)
    return text_comp