I've successfully used SFSpeechRecognizer to transcribe short audio files, but now I want to transcribe speech to text from a 30-minute audio file. I've come up with the solution below, but it doesn't work at all.
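For context, this is roughly the URL-based request that worked for me on the short files (a simplified sketch; it assumes speech-recognition authorization has already been granted):

import Speech

func transcribeShortFile(at url: URL) {
    // Hand the file URL straight to the recognizer; this was fine for short clips.
    let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "uk"))
    let request = SFSpeechURLRecognitionRequest(url: url)
    _ = recognizer?.recognitionTask(with: request) { result, error in
        if let error = error {
            print("Recognition error: \(error.localizedDescription)")
            return
        }
        if let result = result, result.isFinal {
            print(result.bestTranscription.formattedString)
        }
    }
}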
With the 30-minute file, I get this error:
[Utility] +[AFAggregator logDictationFailedWithError:] Error Domain=kAFAssistantErrorDomain Code=203 "Corrupt" UserInfo={NSLocalizedDescription=Corrupt, NSUnderlyingError=0x28315ca80 {Error Domain=SiriSpeechErrorDomain Code=102 "(null)"}}
Recognition error: Corrupt
I call it like this:
let url = Bundle.main.url(forResource: "1-2-3", withExtension: "m4a")!
let transcriber = AudioTranscriber()
transcriber.transcribeAudio(at: url)
And the class itself:

import Speech
import AVFoundation
class AudioTranscriber {
    private let speechRecognizer: SFSpeechRecognizer?
    private let audioEngine = AVAudioEngine()
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?

    init(locale: Locale? = nil) {
        speechRecognizer = SFSpeechRecognizer(locale: locale ?? Locale(identifier: "uk"))
    }
    func transcribeAudio(at url: URL) {
        // Cancel any in-flight recognition task before starting a new one.
        if let recognitionTask = recognitionTask {
            recognitionTask.cancel()
            self.recognitionTask = nil
        }

        guard let audioFile = try? AVAudioFile(forReading: url) else {
            print("Failed to open audio file")
            return
        }

        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        guard let recognitionRequest = recognitionRequest else {
            print("Unable to create recognition request")
            return
        }
        recognitionTask = speechRecognizer?.recognitionTask(with: recognitionRequest) { result, error in
            if let error = error {
                print("Recognition error: \(error.localizedDescription)")
                return
            }
            if let result = result {
                let transcription = result.bestTranscription.formattedString
                print("Transcription: \(transcription)")
            }
        }
        recognitionRequest.shouldReportPartialResults = true
        let audioPlayerNode = AVAudioPlayerNode()
        audioEngine.attach(audioPlayerNode)

        // Tap the engine's input node and forward its buffers to the request.
        let format = audioEngine.inputNode.outputFormat(forBus: 0)
        audioEngine.inputNode.removeTap(onBus: 0)
        audioEngine.inputNode.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, time in
            self.recognitionRequest?.append(buffer)
        }

        audioEngine.connect(audioPlayerNode, to: audioEngine.mainMixerNode, format: audioFile.processingFormat)
        // Read the file into a single 1024-frame buffer and schedule it for playback.
        let bufferSize: AVAudioFrameCount = 1024
        let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: bufferSize)!
        do {
            try audioFile.read(into: buffer)
        } catch {
            print("Error reading audio file: \(error.localizedDescription)")
            return
        }
        audioPlayerNode.scheduleBuffer(buffer) {
            print("audioPlayerNode.scheduleBuffer(buffer) completion()")
            self.audioEngine.stop()
            self.recognitionRequest?.endAudio()
        }
        audioEngine.prepare()
        do {
            try audioEngine.start()
        } catch {
            print("Audio engine failed to start: \(error.localizedDescription)")
            return
        }
        audioPlayerNode.play()
    }
}
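My suspicion is that the tap on audioEngine.inputNode feeds the microphone (i.e. silence) to the request rather than the file's audio, and that reading the file into a single 1024-frame buffer schedules only a fraction of a second of it. Here is a sketch of what I think the feeding should look like instead, reading the whole file into chunks and appending them to the request directly (untested; requiresOnDeviceRecognition is my guess for getting past the roughly one-minute limit that server-based dictation seems to impose):

import Speech
import AVFoundation

func transcribeLongFile(at url: URL) {
    guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "uk")),
          recognizer.isAvailable else {
        print("Recognizer unavailable")
        return
    }
    guard let file = try? AVAudioFile(forReading: url) else {
        print("Failed to open audio file")
        return
    }

    let request = SFSpeechAudioBufferRecognitionRequest()
    request.shouldReportPartialResults = false
    // Server-based dictation rejects long audio; on-device recognition
    // (iOS 13+) should not have that limit.
    if recognizer.supportsOnDeviceRecognition {
        request.requiresOnDeviceRecognition = true
    }

    // In real code, keep a reference to this task so it can be cancelled.
    _ = recognizer.recognitionTask(with: request) { result, error in
        if let error = error {
            print("Recognition error: \(error.localizedDescription)")
            return
        }
        if let result = result, result.isFinal {
            print("Transcription: \(result.bestTranscription.formattedString)")
        }
    }

    // Feed the whole file to the request in chunks; no AVAudioEngine needed.
    let chunkFrames: AVAudioFrameCount = 4096
    while file.framePosition < file.length {
        guard let buffer = AVAudioPCMBuffer(pcmFormat: file.processingFormat,
                                            frameCapacity: chunkFrames) else { break }
        do {
            try file.read(into: buffer, frameCount: chunkFrames)
        } catch {
            print("Error reading audio file: \(error.localizedDescription)")
            break
        }
        if buffer.frameLength == 0 { break }
        request.append(buffer)
    }
    request.endAudio()
}

Is chunked appending like this the right way to handle a 30-minute file, or is there a better approach?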