Executing text-to-speech in order

765 Views Asked by At

I want to synthesize text. I have an array of sentences and array of pauses, that I wish between these sentences.

What was the thought Synthesize -> start the timer, timer fires after provided time -> Synthesize -> start the timer -> Synt...

By chance, I've noticed that timer fires the lesser time first, instead of executing and setting up timers in sequence. The loop doesn't wait till synthesizer finished to pronounce, it continues to run.

How to work out that synthesizer pronounces sentences with provided pauses, and in order?

import SwiftUI

struct KingsSpeechView: View {
    @ObservedObject var speaker = Speaker()
    @State private var subtitles = ""

    @State private var currentStepIndex = 0

    let kingsSpeech = [
        "Hello. Let's start the Game! Let the hunger Games Begin...Whoa-Whoa. Here're are the rules on the screen.",
        "Okey, now that you know the rules, chill out. Let's play another game.",
        "You say Hi, I say Ho.",
        "Hooo",
        "Hooo"
     ]
     var pauses = [0.0, 20.0, 90.0, 40.0, 40.0]
     // try to change into this
     // var pauses = [0.0, 20.0, 10.0, 5.0, 5.0]
     // the sequence of execution is completely different
     // the ones that has less value, will execute first
     // While I expected it to execute in order it is in array, instead it runs as it runs (wants)
     // (or maybe it's the case it's just one timer for all)
     // How to prevent loop from continuing to new iteration until the speech is not pronounced?

    var body: some View {
        VStack {
            Text(subtitles)
                .padding(.bottom, 50)
                .padding(.horizontal, 20)
        
        
            Button("Play") {
                playSound()
            }
        }
    }

    func playSound() {

        for step in 0..<kingsSpeech.count {
            let timer = Timer.scheduledTimer(withTimeInterval: pauses[step], repeats: false) { timer in

                subtitles = kingsSpeech[step]
                speaker.speak("\(kingsSpeech[step])")
                print("I am out")
                currentStepIndex += 1


                // I've tried to stop a loop from moving on, before the speech had finished to pronounce 
                // with some sort of a condition maybe; by index or by identifying if the synthesizer is speaking
                // but it even turned out that timer executes completely different, look in time arrays above
                // while speaker.semaphoreIndex == step {
                //     print("still waiting")
                // }
                // while speaker.synth.isSpeaking {
                //
                // }

            }
        }
    }
}

...

import AVFoundation
import Combine

class Speaker: NSObject, ObservableObject, AVSpeechSynthesizerDelegate {
    let synth = AVSpeechSynthesizer()
    // started to try something with simophore, but didn't understand how to implement it
    var semaphore = DispatchSemaphore(value: 0)
    var semaphoreIndex = 0
    

    override init() {
        super.init()
        synth.delegate = self
    }

    func speak(_ string: String) {
        let utterance = AVSpeechUtterance(string: string)
        utterance.voice = AVSpeechSynthesisVoice(language: "en-GB")
        utterance.rate = 0.4
        synth.speak(utterance)
    }
    
}

extension Speaker {
    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        print("all done")
        semaphore.signal()
        semaphoreIndex += 1
    }
}
2

There are 2 best solutions below

1
On BEST ANSWER

Just speak an utterance, receive the delegate method, and in that method wait the desired interval and go on to the next utterance and interval.

Here's a complete example. It uses a Cocoa project, not SwiftUI, but you can easily adapt it.

import UIKit
import AVFoundation

func delay(_ delay:Double, closure:@escaping ()->()) {
    let when = DispatchTime.now() + delay
    DispatchQueue.main.asyncAfter(deadline: when, execute: closure)
}

class Speaker : NSObject, AVSpeechSynthesizerDelegate {
    var synth : AVSpeechSynthesizer!
    var sentences = [String]()
    var intervals = [Double]()
    func start(_ sentences: [String], _ intervals: [Double]) {
        self.sentences = sentences
        self.intervals = intervals
        self.synth = AVSpeechSynthesizer()
        synth.delegate = self
        self.sayOne()
    }
    func sayOne() {
        if let sentence = sentences.first {
            sentences.removeFirst()
            let utter = AVSpeechUtterance(string: sentence)
            self.synth.speak(utter)
        }
    }
    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        if let interval = intervals.first {
            intervals.removeFirst()
            delay(interval) {
                self.sayOne()
            }
        }
    }
}

class ViewController: UIViewController {
    let speaker = Speaker()
    override func viewDidLoad() {
        super.viewDidLoad()
        let sentences = [
            "I will speak again in one second",
            "I will speak again in five seconds",
            "I will speak again in one second",
            "Done"]
        let intervals = [1.0, 5.0, 1.0]
        self.speaker.start(sentences, intervals)
    }
}

0
On

Trying to answer the question that I asked in comments to solution: For now, it can play/ pause

TODO: Now I have to discover how to jump backward/ forward between sentences. So, for this I firstly need to stop the current speech task. speaker.synth.stopSpeaking(at: .word)

Then, I maybe should have some index tracking what's the current stage is. Then, when I stopped the task, I remember the index. And I can go backward/ forward. Now start from index-1 or index+1 place, rather than from the beginning.

  @State private var isPlaying = false
      ...
        // play button
        Button(action: {
            
            if isPlaying {
                isPlaying.toggle()
                speaker.synth.pauseSpeaking(at: .word)
            } else {
                isPlaying.toggle()
                // continue playing here if it was paused before, else ignite speech utterance
                if speaker.synth.isPaused {
                    speaker.synth.continueSpeaking()
                } else {
                    speaker.speak()
                }
                
            }
        }, label: {
            Image(systemName: (isPlaying ? "pause.fill" : "play.fill"))
                .resizable()
                .scaledToFit()
                .frame(width: 50, height: 50)
            
        })