How do I transcribe a big audio file in iOS app?

137 Views Asked by At

I've successfully used SFSpeechRecognizer to transcribe small audio files. But I want to transcribe speech to text from 30-minutes audio-file. I've come up with this solution, but it doesn't work at all.

I get the error:

[Utility] +[AFAggregator logDictationFailedWithError:] Error Domain=kAFAssistantErrorDomain Code=203 "Corrupt" UserInfo={NSLocalizedDescription=Corrupt, NSUnderlyingError=0x28315ca80 {Error Domain=SiriSpeechErrorDomain Code=102 "(null)"}}

Recognition error: Corrupt

The code:

let url = Bundle.main.url(forResource: "1-2-3", withExtension: "m4a")!
let transcriber = AudioTranscriber()
transcriber.transcribeAudio(at: url)
import Speech
import AVFoundation

class AudioTranscriber {
    
    private let speechRecognizer: SFSpeechRecognizer?
    private let audioEngine = AVAudioEngine()
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    
    init(locale: Locale? = nil) {
        speechRecognizer = SFSpeechRecognizer(locale: locale ?? Locale(identifier: "uk"))
    }
    
    func transcribeAudio(at url: URL) {
        if let recognitionTask = recognitionTask {
            recognitionTask.cancel()
            self.recognitionTask = nil
        }
        
        guard let audioFile = try? AVAudioFile(forReading: url) else {
            print("Failed to open audio file")
            return
        }
        
        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        guard let recognitionRequest = recognitionRequest else {
            print("Unable to create recognition request")
            return
        }
        
        recognitionTask = speechRecognizer?.recognitionTask(with: recognitionRequest) { result, error in
            if let error = error {
                print("Recognition error: \(error.localizedDescription)")
                return
            }
            
            if let result = result {
                let transcription = result.bestTranscription.formattedString
                print("Transcription: \(transcription)")
            }
        }
        
        recognitionRequest.shouldReportPartialResults = true
        
        let audioPlayerNode = AVAudioPlayerNode()
        audioEngine.attach(audioPlayerNode)
        
        let format = audioEngine.inputNode.outputFormat(forBus: 0)
        
        audioEngine.inputNode.removeTap(onBus: 0)
        
        audioEngine.inputNode.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, time in
            self.recognitionRequest?.append(buffer)
        }
        
        audioEngine.connect(audioPlayerNode, to: audioEngine.mainMixerNode, format: audioFile.processingFormat)
        
        let bufferSize: AVAudioFrameCount = 1024
        let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: bufferSize)!
        
        do {
            try audioFile.read(into: buffer)
        } catch {
            print("Error reading audio file: \(error.localizedDescription)")
            return
        }
        
        audioPlayerNode.scheduleBuffer(buffer) {
            print("audioPlayerNode.scheduleBuffer(buffer) completion()")
            self.audioEngine.stop()
            self.recognitionRequest?.endAudio()
        }
        
        audioEngine.prepare()
        
        do {
            try audioEngine.start()
        } catch {
            print("Audio engine failed to start: \(error.localizedDescription)")
            return
        }
        
        audioPlayerNode.play()
    }
    
}
0

There are 0 best solutions below