Swift - AVMutableVideoCompositionLayerInstruction Misalignment when Merging Videos


I followed the Ray Wenderlich tutorial to merge videos. The finished result is one merged video where portrait videos are at the top of the screen and landscape videos are at the bottom. In the image below, the portrait video plays first and the landscape video plays after it. The landscape video is from the Photos Library.

[screenshot: portrait video at the top of the frame, landscape video at the bottom]

code:

var arrOfAssets = [AVAsset]()

func mergeVideos() {

    let mixComposition = AVMutableComposition()
            
    let videoCompositionTrack = mixComposition.addMutableTrack(withMediaType: .video, preferredTrackID: Int32(kCMPersistentTrackID_Invalid))
    let audioCompositionTrack = mixComposition.addMutableTrack(withMediaType: .audio, preferredTrackID: Int32(kCMPersistentTrackID_Invalid))
    
    var count = 0
    var insertTime = CMTime.zero
    var instructions = [AVMutableVideoCompositionInstruction]()
    
    for videoAsset in arrOfAssets {

        let audioTrack = videoAsset.tracks(withMediaType: .audio)[0]

        do {
    
            try videoCompositionTrack?.insertTimeRange(CMTimeRangeMake(start: .zero, duration: videoAsset.duration), of: videoAsset.tracks(withMediaType: .video)[0], at: insertTime)
            try audioCompositionTrack?.insertTimeRange(CMTimeRangeMake(start: .zero, duration: videoAsset.duration), of: audioTrack, at: insertTime)
    
            let layerInstruction = videoCompositionLayerInstruction(videoCompositionTrack!, asset: videoAsset, count: count)
    
            let videoCompositionInstruction = AVMutableVideoCompositionInstruction()
            videoCompositionInstruction.timeRange = CMTimeRangeMake(start: insertTime, duration: videoAsset.duration)
            videoCompositionInstruction.layerInstructions = [layerInstruction]

            instructions.append(videoCompositionInstruction)
    
            insertTime = CMTimeAdd(insertTime, videoAsset.duration)

            count += 1

        } catch { }
    }
    
    let videoComposition = AVMutableVideoComposition()
    videoComposition.instructions = instructions
    videoComposition.frameDuration = CMTimeMake(value: 1, timescale: 30)
    videoComposition.renderSize = CGSize(width: UIScreen.main.bounds.width, height: UIScreen.main.bounds.height)

    // ...
    exporter.videoComposition = videoComposition
}

Ray Wenderlich Code:

func videoCompositionLayerInstruction(_ track: AVCompositionTrack, asset: AVAsset, count: Int) -> AVMutableVideoCompositionLayerInstruction {
    
    let instruction = AVMutableVideoCompositionLayerInstruction(assetTrack: track)
    
    let assetTrack = asset.tracks(withMediaType: .video)[0]
    
    let transform = assetTrack.preferredTransform
    let assetInfo = orientationFromTransform(transform)
    
    var scaleToFitRatio = UIScreen.main.bounds.width / assetTrack.naturalSize.width
    if assetInfo.isPortrait {
        
        scaleToFitRatio = UIScreen.main.bounds.width / assetTrack.naturalSize.height
        let scaleFactor = CGAffineTransform(scaleX: scaleToFitRatio, y: scaleToFitRatio)
        instruction.setTransform(assetTrack.preferredTransform.concatenating(scaleFactor), at: .zero)
        
    } else {
        
        let scaleFactor = CGAffineTransform(scaleX: scaleToFitRatio, y: scaleToFitRatio)
        var concat = assetTrack.preferredTransform.concatenating(scaleFactor)
            .concatenating(CGAffineTransform(translationX: 0, y: UIScreen.main.bounds.width / 2))
        if assetInfo.orientation == .down {
            let fixUpsideDown = CGAffineTransform(rotationAngle: CGFloat(Double.pi))
            let windowBounds = UIScreen.main.bounds
            let yFix = assetTrack.naturalSize.height + windowBounds.height
            let centerFix = CGAffineTransform(translationX: assetTrack.naturalSize.width, y: yFix)
            concat = fixUpsideDown.concatenating(centerFix).concatenating(scaleFactor)
        }
        instruction.setTransform(concat, at: .zero)
    }
    
    if count == 0 {
        instruction.setOpacity(0.0, at: asset.duration)
    }
    
    return instruction
}

func orientationFromTransform(_ transform: CGAffineTransform) -> (orientation: UIImage.Orientation, isPortrait: Bool) {
    var assetOrientation = UIImage.Orientation.up
    var isPortrait = false
    let tfA = transform.a
    let tfB = transform.b
    let tfC = transform.c
    let tfD = transform.d

    if tfA == 0 && tfB == 1.0 && tfC == -1.0 && tfD == 0 {
        assetOrientation = .right
        isPortrait = true
    } else if tfA == 0 && tfB == -1.0 && tfC == 1.0 && tfD == 0 {
        assetOrientation = .left
        isPortrait = true
    } else if tfA == 1.0 && tfB == 0 && tfC == 0 && tfD == 1.0 {
        assetOrientation = .up
    } else if tfA == -1.0 && tfB == 0 && tfC == 0 && tfD == -1.0 {
        assetOrientation = .down
    }
    return (assetOrientation, isPortrait)
}

I also followed the code from this Medium post. It sets the render size to a default of let renderSize = CGSize(width: 1280.0, height: 720.0), as opposed to Ray's, which uses the entire screen bounds.

With the 1280x720 render size, the portrait videos are centered correctly, but for the landscape videos the sound plays while the video is nowhere on screen. I didn't add a picture of the landscape result because it's just a black screen.

[screenshot: portrait video centered at the 1280x720 render size]

1 Answer

I got it working for both portrait and landscape.

[screenshot: merged video with both orientations displayed correctly]

I tested this answer with videos recorded in portrait, landscape left/right, upside down, and with the front and back cameras. I haven't had any issues. I'm far from a CGAffineTransform expert, so if anyone has a better answer, please post it.

Ray Wenderlich's merging code works, but it doesn't handle videos with different orientations. I used this answer to check the properties of the preferredTransform for the orientation check.

I also used a deleted answer for the portrait part and a downvoted answer for the landscape part. The downvoted answer led me to its author's GitHub, where the landscape code was incorrect but close enough for me to adjust it until it worked correctly.

One thing to point out: the comments from @DonMag told me about the benefit of using 720x1280. The code below merges all of the videos together with a renderSize of 720x1280, which keeps them the same size.

code:

var arrOfAssets = [AVAsset]()

// class property
let renderSize = CGSize(width: 720, height: 1280) // for higher quality use CGSize(width: 1080, height: 1920)

func mergeVideos() {

    let mixComposition = AVMutableComposition()
            
    let videoCompositionTrack = mixComposition.addMutableTrack(withMediaType: .video, preferredTrackID: Int32(kCMPersistentTrackID_Invalid))
    let audioCompositionTrack = mixComposition.addMutableTrack(withMediaType: .audio, preferredTrackID: Int32(kCMPersistentTrackID_Invalid))
    
    var count = 0
    var insertTime = CMTime.zero
    var instructions = [AVMutableVideoCompositionInstruction]()
    
    for videoAsset in arrOfAssets {

        guard let firstTrack = videoAsset.tracks(withMediaType: .video).first else { continue }

        do {
    
            try videoCompositionTrack?.insertTimeRange(CMTimeRangeMake(start: .zero, duration: videoAsset.duration), of: firstTrack, at: insertTime)

            if let audioTrack = videoAsset.tracks(withMediaType: .audio).first {
                try audioCompositionTrack?.insertTimeRange(CMTimeRangeMake(start: .zero, duration: videoAsset.duration), of: audioTrack, at: insertTime)
            }

            let layerInstruction = videoCompositionLayerInstruction(firstTrack, asset: videoAsset, count: count)
    
            let videoCompositionInstruction = AVMutableVideoCompositionInstruction()
            videoCompositionInstruction.timeRange = CMTimeRangeMake(start: insertTime, duration: videoAsset.duration)
            videoCompositionInstruction.layerInstructions = [layerInstruction]

            instructions.append(videoCompositionInstruction)
    
            insertTime = CMTimeAdd(insertTime, videoAsset.duration)

            count += 1

        } catch {
            // Always surface the error instead of swallowing it.
            print("Failed to insert time range: \(error)")
        }
    }
    
    let videoComposition = AVMutableVideoComposition()
    videoComposition.instructions = instructions
    videoComposition.frameDuration = CMTimeMake(value: 1, timescale: 30)
    videoComposition.renderSize = self.renderSize // <--- **** IMPORTANT ****

    // ...
    exporter.videoComposition = videoComposition
}
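
The exporter setup is elided above with // .... For completeness, a minimal sketch of what it might look like (the preset, file type, and output path here are my assumptions, not part of the original code):

func exportMergedVideo(_ mixComposition: AVMutableComposition, _ videoComposition: AVMutableVideoComposition) {
    // Hypothetical export setup; adjust the preset and output location as needed.
    guard let exporter = AVAssetExportSession(asset: mixComposition, presetName: AVAssetExportPresetHighestQuality) else { return }
    let outputURL = FileManager.default.temporaryDirectory.appendingPathComponent("merged-\(UUID().uuidString).mp4")
    exporter.outputURL = outputURL
    exporter.outputFileType = .mp4
    exporter.videoComposition = videoComposition
    exporter.exportAsynchronously {
        switch exporter.status {
        case .completed:
            print("Merged video written to \(outputURL)")
        default:
            print("Export failed: \(String(describing: exporter.error))")
        }
    }
}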

The most important part of this answer, which replaces the RW code:

func videoCompositionLayerInstruction(_ firstTrack: AVAssetTrack, asset: AVAsset, count: Int) -> AVMutableVideoCompositionLayerInstruction {

    let instruction = AVMutableVideoCompositionLayerInstruction(assetTrack: firstTrack)

    let assetTrack = asset.tracks(withMediaType: .video)[0]            
    let t = assetTrack.fixedPreferredTransform // new transform fix 
    let assetInfo = orientationFromTransform(t)

    if assetInfo.isPortrait {

        let scaleToFitRatio = self.renderSize.width / assetTrack.naturalSize.height
        let scaleFactor = CGAffineTransform(scaleX: scaleToFitRatio, y: scaleToFitRatio)
        var finalTransform = assetTrack.fixedPreferredTransform.concatenating(scaleFactor)

        // This was needed in the answer I used for the portrait part. I haven't
        // tested it, but per that answer: "(if video not taking entire screen and
        // leaving some parts black - don't know when actually needed so you'll
        // have to try and see when it's needed)"
        if assetInfo.orientation == .rightMirrored || assetInfo.orientation == .leftMirrored {
            finalTransform = finalTransform.translatedBy(x: -t.ty, y: 0)
        }
        instruction.setTransform(finalTransform, at: CMTime.zero)

    } else {

        let renderRect = CGRect(x: 0, y: 0, width: self.renderSize.width, height: self.renderSize.height)
        let videoRect = CGRect(origin: .zero, size: assetTrack.naturalSize).applying(assetTrack.fixedPreferredTransform)

        let scale = renderRect.width / videoRect.width
        let transform = CGAffineTransform(scaleX: scale, y: (videoRect.height * scale) / assetTrack.naturalSize.height)
        let translate = CGAffineTransform(translationX: .zero, y: ((self.renderSize.height - (videoRect.height * scale))) / 2)

        instruction.setTransform(assetTrack.fixedPreferredTransform.concatenating(transform).concatenating(translate), at: .zero)
    }

    if count == 0 {
        instruction.setOpacity(0.0, at: asset.duration)
    }
    
    return instruction
}
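
To sanity-check the landscape branch, here's what the math works out to for a hypothetical 1920x1080 clip at this 720x1280 render size:

// Hypothetical 1920x1080 landscape clip merged at a 720x1280 render size:
let naturalSize = CGSize(width: 1920, height: 1080)
let scale = 720.0 / naturalSize.width                  // 0.375
let scaledHeight = naturalSize.height * scale          // 405
let yOffset = (1280.0 - scaledHeight) / 2              // 437.5
// The clip renders at 720x405 and is shifted down 437.5 points,
// so it sits vertically centered with black bars above and below.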

New orientation check:

func orientationFromTransform(_ transform: CGAffineTransform) -> (orientation: UIImage.Orientation, isPortrait: Bool) {
    var assetOrientation = UIImage.Orientation.up
    var isPortrait = false
    
    if transform.a == 0 && transform.b == 1.0 && transform.c == -1.0 && transform.d == 0 {
        assetOrientation = .right
        isPortrait = true
    } else if transform.a == 0 && transform.b == 1.0 && transform.c == 1.0 && transform.d == 0 {
        assetOrientation = .rightMirrored
        isPortrait = true
    } else if transform.a == 0 && transform.b == -1.0 && transform.c == 1.0 && transform.d == 0 {
        assetOrientation = .left
        isPortrait = true
    } else if transform.a == 0 && transform.b == -1.0 && transform.c == -1.0 && transform.d == 0 {
        assetOrientation = .leftMirrored
        isPortrait = true
    } else if transform.a == 1.0 && transform.b == 0 && transform.c == 0 && transform.d == 1.0 {
        assetOrientation = .up
    } else if transform.a == -1.0 && transform.b == 0 && transform.c == 0 && transform.d == -1.0 {
        assetOrientation = .down
    }
    return (assetOrientation, isPortrait)
}
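
As a quick sanity check (these matrix values are the ones iOS typically writes for a portrait recording from the back camera):

// A portrait clip from the back camera usually carries a 90-degree rotation:
let portraitTransform = CGAffineTransform(a: 0, b: 1, c: -1, d: 0, tx: 0, ty: 0)
let info = orientationFromTransform(portraitTransform)
// info.orientation == .right, info.isPortrait == true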

preferredTransform fix:

extension AVAssetTrack {
    
    var fixedPreferredTransform: CGAffineTransform {
        var t = preferredTransform
        switch(t.a, t.b, t.c, t.d) {
        case (1, 0, 0, 1):
            t.tx = 0
            t.ty = 0
        case (1, 0, 0, -1):
            t.tx = 0
            t.ty = naturalSize.height
        case (-1, 0, 0, 1):
            t.tx = naturalSize.width
            t.ty = 0
        case (-1, 0, 0, -1):
            t.tx = naturalSize.width
            t.ty = naturalSize.height
        case (0, -1, 1, 0):
            t.tx = 0
            t.ty = naturalSize.width
        case (0, 1, -1, 0):
            t.tx = naturalSize.height
            t.ty = 0
        case (0, 1, 1, 0):
            t.tx = 0
            t.ty = 0
        case (0, -1, -1, 0):
            t.tx = naturalSize.height
            t.ty = naturalSize.width
        default:
            break
        }
        return t
    }
}
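
The extension rebuilds the translation (tx/ty) from naturalSize for each rotation case, since some assets come back with translation values that don't match their rotation and throw the placement off. A quick way to see what it changes (videoAsset here is a stand-in for one of the assets being merged):

// Compare the raw and fixed transforms for a track:
if let track = videoAsset.tracks(withMediaType: .video).first {
    print("raw: \(track.preferredTransform)")
    print("fixed: \(track.fixedPreferredTransform)")
}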