How to implemented concurrency in two function with while loop in Speech Recognition?

36 Views Asked by At

I am trying to implement real time live speech recognition. My approach is two make two functions which run concurrently :

  1. Read audio using pyaudio
  2. Use Whisper model for inference

However, the inference time taken by the model is very long and the flow of program is really weird (maybe because I am newbie). Please help

Here is summary of my code :

feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v2")

FRAMES_PER_BUFFER = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
RECORD_SECOND = 1

p = pyaudio.PyAudio()

recordings = Queue()
record = True

async def record_microphone():
     global record
     p = pyaudio.PyAudio()

     stream = p.open(format=FORMAT,
                         channels=CHANNELS,
                         rate=RATE,
                         input=True,
                         input_device_index=0,
                         frames_per_buffer=FRAMES_PER_BUFFER)

     frames = []

     try:
          while record:
               try:
                    print("Recording Audio")
                    data = stream.read(FRAMES_PER_BUFFER,exception_on_overflow=False)
                    frames.append(np.frombuffer(data, np.int16))
                    print(np.frombuffer(data, np.int16))

                    if len(frames) >= (RATE * RECORD_SECOND) / FRAMES_PER_BUFFER:
                         recordings.put(frames.copy())
                         frames = []

                         await asyncio.sleep(0.1)

               except OSError as e:
                    print(f"OSError: {e}")
                    continue

     except KeyboardInterrupt:
          record = False

     finally:
          if stream.is_active():
               stream.stop_stream()
               stream.close()
          p.terminate()

async def speech_recognition():
     while record:
          print("Transcripting")
          frames = recordings.get()
          print(frames)
          if len(frames) != 0:
               print("Preprocessing1")
               logMelRepresentation = feature_extractor(frames, return_tensors="pt")['input_features'][0]
               print("Preprocessing2")
               input_tensor = logMelRepresentation.unsqueeze(0)
               print("Preprocessing3")
               output_ids = model.generate(input_tensor, decoder_input_ids=decoder_input_ids)[0]
               print("Preprocessing4")
               decoded_text = tokenizer.decode(output_ids, skip_special_tokens=True)
               print("Decoded Text")
               print(decoded_text) 
               #transcribe_done.set()

async def main():
    await asyncio.gather(record_microphone(), speech_recognition())

if __name__ == "__main__":
    asyncio.run(main())


What is printed in the terminal :

Recording Audio
[    0     0  -146 ... -2118 -1885 -1747]
Recording Audio
[-1446  -996  -544 ...   385   347   267]
Recording Audio
[ 200  155   96 ... 3609 4063 4451]
Recording Audio
[ 4756  5034  5237 ... -1066  -955  -850]
Transcripting
[array([    0,     0,  -146, ..., -2118, -1885, -1747], dtype=int16), array([-1446,  -996,  -544, ...,   385,   347,   267], dtype=int16), array([ 200,  155,   96, ..., 3609, 4063, 4451], dtype=int16), array([ 4756,  5034,  5237, ..., -1066,  -955,  -850], dtype=int16)]
Preprocessing1
Preprocessing2
Preprocessing3

# Take very long time

Preprocessing4
Decoded Text
 
Transcripting

# Never continue any task, doing nothing
0

There are 0 best solutions below