Overlay Static Text via AVMutableVideoComposition

In Swift for iOS, I have an array of AVURLAsset. I pass it through a function to stitch/merge the video assets together into one final video. For each video, my goal is to overlay text centered in the frame.

When I play the outputted video, the video assets merge correctly, but I'm unable to understand why none of the text overlays. I tried following an existing answer, but to no avail. I also tried just overlaying text to a single video following Ray Wenderlich's tutorial, but for some reason the same end result :(. Any guidance would be extremely appreciated..

func merge(videos: [AVURLAsset], completion: @escaping (_ url: URL, _ asset: AVAssetExportSession)->()) {
let videoComposition = AVMutableComposition()
var lastTime: CMTime = .zero

var maxVideoSize = CGSize.zero

guard let videoCompositionTrack = videoComposition.addMutableTrack(withMediaType: .video, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)),
      let audioCompositionTrack = videoComposition.addMutableTrack(withMediaType: .audio, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)) else { return }

let mainComposition = AVMutableVideoComposition()

let mainParentLayer = CALayer()
let mainVideoLayer = CALayer()
mainParentLayer.frame = CGRect(x: 0, y: 0, width: maxVideoSize.width, height: maxVideoSize.height)
mainVideoLayer.frame = CGRect(x: 0, y: 0, width: maxVideoSize.width, height: maxVideoSize.height)


var instructions = [AVMutableVideoCompositionInstruction]()
print("Main Parent Layer Frame: \(mainParentLayer.frame)")

for video in videos {
    if let videoTrack = video.tracks(withMediaType: .video)[safe: 0] {
        videoCompositionTrack.preferredTransform = videoTrack.preferredTransform
        print("Video Track Transform: \(videoTrack.preferredTransform)")

        do {
            try videoCompositionTrack.insertTimeRange(CMTimeRangeMake(start: .zero, duration: video.duration), of: videoTrack, at: lastTime)
            print("Video time range: Start = \(lastTime.seconds), Duration = \(video.duration.seconds) [\(lastTime.seconds + video.duration.seconds > 0 ? "Right" : "Wrong")]")

            if let audioTrack = video.tracks(withMediaType: .audio)[safe: 0] {
                try audioCompositionTrack.insertTimeRange(CMTimeRangeMake(start: .zero, duration: video.duration), of: audioTrack, at: lastTime)

            lastTime = CMTimeAdd(lastTime, video.duration)

            let videoSize = videoTrack.naturalSize.applying(videoTrack.preferredTransform)
            let videoRect = CGRect(x: 0, y: 0, width: abs(videoSize.width), height: abs(videoSize.height))
            maxVideoSize = CGSize(width: max(maxVideoSize.width, videoRect.width), height: max(maxVideoSize.height, videoRect.height))

            let textLayer = CATextLayer()
            textLayer.string = "TESTING TESTING TESTING"
            textLayer.foregroundColor = UIColor.white.cgColor
            textLayer.font = UIFont(name: "Helvetica-Bold", size: min(videoRect.height / 10, 100))
            textLayer.shadowOpacity = 0.5
            textLayer.alignmentMode = .center
            textLayer.contentsScale = UIScreen.main.scale
            textLayer.isWrapped = true

            let textHeight: CGFloat = min(videoRect.height / 10, 120)
            let textWidth: CGFloat = videoRect.width
            let xPos = (videoRect.width - textWidth) / 2
            let yPos = (videoRect.height - textHeight) / 2
            textLayer.frame = CGRect(x: xPos, y: yPos, width: textWidth, height: textHeight)
            textLayer.zPosition = 1
            print("Text Layer Frame: \(textLayer.frame) [\(textLayer.frame.width > 0 && textLayer.frame.height > 0 ? "Right" : "Wrong")]")

            let parentLayer = CALayer()
            parentLayer.backgroundColor = UIColor.green.cgColor // Temp background color for debugging
            parentLayer.frame = videoRect
            print("Video Layer zPosition: \(mainVideoLayer.zPosition), Text Layer zPosition: \(textLayer.zPosition) [\(textLayer.zPosition > mainVideoLayer.zPosition ? "Right" : "Wrong")]")

            let videoCompositionInstruction = AVMutableVideoCompositionInstruction()
            videoCompositionInstruction.timeRange = CMTimeRangeMake(start: lastTime - video.duration, duration: video.duration)
            let layerInstruction = AVMutableVideoCompositionLayerInstruction(assetTrack: videoTrack)
            videoCompositionInstruction.layerInstructions = [layerInstruction]

            parentLayer.zPosition = 0
            print("Parent Layer Frame: \(parentLayer.frame), Background Color: \(parentLayer.backgroundColor.debugDescription)")
            print("Text Layer Frame: \(textLayer.frame)")
        } catch {
            print("Failed to insert track: \(error.localizedDescription)")

mainParentLayer.frame = CGRect(x: 0, y: 0, width: maxVideoSize.width, height: maxVideoSize.height)
mainVideoLayer.frame = mainParentLayer.frame

mainComposition.renderSize = maxVideoSize
mainComposition.instructions = instructions
mainComposition.frameDuration = CMTime(value: 1, timescale: 30)
mainComposition.animationTool = AVVideoCompositionCoreAnimationTool(postProcessingAsVideoLayer: mainVideoLayer, in: mainParentLayer)

print("Final Main Parent Layer Frame: \(mainParentLayer.frame)")
print("Number of Sublayers in Main Parent Layer: \(mainParentLayer.sublayers?.count ?? 0)")

let outputUrl = NSURL.fileURL(withPath: NSTemporaryDirectory() + "merged" + ".mp4")

print("Pre-Export Main Parent Layer Frame: \(mainParentLayer.frame)")
print("Pre-Export Number of Sublayers in Main Parent Layer: \(mainParentLayer.sublayers?.count ?? 0)")
if let sublayers = mainParentLayer.sublayers {
    for (index, layer) in sublayers.enumerated() {
        print("Layer \(index): \(layer), Frame: \(layer.frame), zPosition: \(layer.zPosition)")

guard let exporter = AVAssetExportSession(asset: videoComposition, presetName: AVAssetExportPresetHighestQuality) else { return }

exporter.videoComposition = mainComposition
exporter.outputURL = outputUrl
exporter.outputFileType = .mp4
exporter.shouldOptimizeForNetworkUse = true

if let videoComposition = exporter.videoComposition {
    print("Export Video Composition Render Size: \(videoComposition.renderSize)")
    print("Export Video Composition Frame Duration: \(videoComposition.frameDuration)")
    print("Export Video Composition Instructions Count: \(videoComposition.instructions.count)")

exporter.exportAsynchronously {
    DispatchQueue.main.async {
        if let outputUrl = exporter.outputURL, exporter.status == .completed {
            completion(outputUrl, exporter)
        } else if let error = exporter.error {
            print("Export failed: \(error.localizedDescription)")
play(video: exporter.asset)

In your code, each video segment is associated with a separate parent layer containing both the video and text layers. But the AVVideoCompositionCoreAnimationTool is not correctly configured to composite these layers over the video.

You should create a single main video layer (mainVideoLayer) and add it to a main parent layer (mainParentLayer). Then, add each video's text layer to the mainParentLayer. This ensures a proper hierarchy where the text overlays are correctly positioned over the video content.
And configure the AVVideoCompositionCoreAnimationTool with the mainVideoLayer as the video layer to process and the mainParentLayer as the animation layer.

Make sure that each AVMutableVideoCompositionInstruction correctly spans the entire duration of its corresponding video segment. That should be important for seamless playback and proper overlay of text.

Your merge function would be (relevant extracts only):

func merge(videos: [AVURLAsset], completion: @escaping (_ url: URL, _ asset: AVAssetExportSession)->()) {
    // initial setup code remains the same

    // Create a main parent layer that will host all individual video layers
    let mainParentLayer = CALayer()
    mainParentLayer.frame = CGRect(x: 0, y: 0, width: maxVideoSize.width, height: maxVideoSize.height)
    let mainVideoLayer = CALayer()
    mainVideoLayer.frame = mainParentLayer.frame

    // Add each video layer and its corresponding text layer
    for video in videos {
        // video processing code remains the same

        // Add the video layer and text layer to the main parent layer

    // Set up the main composition
    mainComposition.renderSize = maxVideoSize
    mainComposition.frameDuration = CMTime(value: 1, timescale: 30) // Assuming 30 fps
    mainComposition.animationTool = AVVideoCompositionCoreAnimationTool(postProcessingAsVideoLayer: mainVideoLayer, in: mainParentLayer)

    // export setup code remains the same

I tried following and updating the relevant extracts of code, but the text overlay still doesn't appear in each/any video segment...the merged video remains to play correctly otherwise.
I edited my post with the latest function for reference.

The size and position of the mainParentLayer and mainVideoLayer are set based on maxVideoSize, which is initially zero and updated within the loop. Make sure the frame of mainParentLayer is updated after determining the maximum video size.
The time range for each videoCompositionInstruction is set incorrectly. It should start from the beginning of the last time, not from the last time itself.
You are adding each video's parentLayer to mainParentLayer within the loop. That might be causing an overlap issue or an incorrect layer hierarchy. The order in which layers are added and properties are set is important. Make sure all layer properties are correctly set before they are added to their parent layers.

The preferred transform of the video track is set for each video, but it is important to check if this is properly applied to the video layer within the composition.

Considering all of the above, the updated code might look like this:

func merge(videos: [AVURLAsset], completion: @escaping (_ url: URL, _ asset: AVAssetExportSession)->()) {
    // initial setup code remains the same

    var instructions = [AVMutableVideoCompositionInstruction]()
    for video in videos {
        // video track processing code remains the same

        // Update maxVideoSize calculation here, if necessary

        // Text layer setup remains the same

        let parentLayer = CALayer()
        parentLayer.frame = videoRect

        let videoCompositionInstruction = AVMutableVideoCompositionInstruction()
        videoCompositionInstruction.timeRange = CMTimeRangeMake(start: lastTime - video.duration, duration: video.duration)
        let layerInstruction = AVMutableVideoCompositionLayerInstruction(assetTrack: videoTrack)
        videoCompositionInstruction.layerInstructions = [layerInstruction]

        // rest of the loop remains the same

    // Update the frame of mainParentLayer after the loop
    mainParentLayer.frame = CGRect(x: 0, y: 0, width: maxVideoSize.width, height: maxVideoSize.height)
    mainVideoLayer.frame = mainParentLayer.frame

    mainComposition.instructions = instructions
    mainComposition.renderSize = maxVideoSize
    // rest of the function remains the same

i updated my code based on your latest and still am unable to see text for each/any video segment, while the videos themselves correctly play. :(

Make sure the text layers are not being obscured by other layers. That can be checked by setting different background colors for each layer temporarily. And confirm that the zPosition of the text layers is higher than the video layers.

The properties of CATextLayer such as fontSize and frame must be correctly set to make sure the text is visible and within the bounds of the video frame. Check if the font size is appropriate for the video size. A very large font size might cause the text to appear off-screen.

Review the timeRange set for each videoCompositionInstruction. It should correctly span the duration of each video segment.
Verify that the preferredTransform of each video track is correctly applied. An incorrect transform can lead to unexpected layer positioning.
Make sure mainComposition.renderSize and mainComposition.frameDuration are set appropriately. If these are incorrect, it could affect the visibility and placement of the text layers.

Debugging steps could include adding print statements to log the frame and other properties of the text layers to make sure they are being set as expected.
Try and temporarily simplify the composition to include just one video and its text layer. That can help isolate whether the issue is with individual layers or the composition of multiple layers.
Use a visual debugging tool or set distinct background colors for each layer to visually inspect the layer hierarchy and positioning.
Make sure the settings for AVAssetExportSession are correct and support the video format and composition.