I am trying to implement real time live speech recognition. My approach is two make two functions which run concurrently :
- Read audio using pyaudio
- Use Whisper model for inference
However, the inference time taken by the model is very long and the flow of program is really weird (maybe because I am newbie). Please help
Here is summary of my code :
feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v2")
FRAMES_PER_BUFFER = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
RECORD_SECOND = 1
p = pyaudio.PyAudio()
recordings = Queue()
record = True
async def record_microphone():
global record
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
input_device_index=0,
frames_per_buffer=FRAMES_PER_BUFFER)
frames = []
try:
while record:
try:
print("Recording Audio")
data = stream.read(FRAMES_PER_BUFFER,exception_on_overflow=False)
frames.append(np.frombuffer(data, np.int16))
print(np.frombuffer(data, np.int16))
if len(frames) >= (RATE * RECORD_SECOND) / FRAMES_PER_BUFFER:
recordings.put(frames.copy())
frames = []
await asyncio.sleep(0.1)
except OSError as e:
print(f"OSError: {e}")
continue
except KeyboardInterrupt:
record = False
finally:
if stream.is_active():
stream.stop_stream()
stream.close()
p.terminate()
async def speech_recognition():
while record:
print("Transcripting")
frames = recordings.get()
print(frames)
if len(frames) != 0:
print("Preprocessing1")
logMelRepresentation = feature_extractor(frames, return_tensors="pt")['input_features'][0]
print("Preprocessing2")
input_tensor = logMelRepresentation.unsqueeze(0)
print("Preprocessing3")
output_ids = model.generate(input_tensor, decoder_input_ids=decoder_input_ids)[0]
print("Preprocessing4")
decoded_text = tokenizer.decode(output_ids, skip_special_tokens=True)
print("Decoded Text")
print(decoded_text)
#transcribe_done.set()
async def main():
await asyncio.gather(record_microphone(), speech_recognition())
if __name__ == "__main__":
asyncio.run(main())
What is printed in the terminal :
Recording Audio
[ 0 0 -146 ... -2118 -1885 -1747]
Recording Audio
[-1446 -996 -544 ... 385 347 267]
Recording Audio
[ 200 155 96 ... 3609 4063 4451]
Recording Audio
[ 4756 5034 5237 ... -1066 -955 -850]
Transcripting
[array([ 0, 0, -146, ..., -2118, -1885, -1747], dtype=int16), array([-1446, -996, -544, ..., 385, 347, 267], dtype=int16), array([ 200, 155, 96, ..., 3609, 4063, 4451], dtype=int16), array([ 4756, 5034, 5237, ..., -1066, -955, -850], dtype=int16)]
Preprocessing1
Preprocessing2
Preprocessing3
# Take very long time
Preprocessing4
Decoded Text
Transcripting
# Never continue any task, doing nothing