I am new to the Google Speech-to-Text API. I followed the tutorial on how to transcribe audio from a microphone stream, and it works fine.
My problem arises when I want to use the 'single_utterance' mode, that is, stopping the streaming when the server has detected the end of the user's speech utterance and expects no additional speech.
The code below manages to retrieve the command from the user most of the time. Sometimes, however, no utterance is recognized and the code gets stuck in the main `for` loop:
self.get_logger().debug('GSTTService Incoming request')
timer_period=30 # seconds
self.timer = self.create_timer(timer_period, self.timer_callback(sResponse))
self.timer.cancel()
if sRequest.data == True:
with MicrophoneStream(RATE, CHUNK) as stream:
audio_generator = stream.generator()
requests = (
speech.StreamingRecognizeRequest(audio_content=content)
for content in audio_generator
)
self.get_logger().debug('requests created')
responses = self.client.streaming_recognize(self.streaming_config, requests)
self.get_logger().debug('responses created')
# Now, put the transcription responses to use.
#listen_print_loop(responses)
#sResponse.message = self.__retrieve_text(responses)
#sResponse.success = True
#start_time = time.time()
timer.reset()
self.get_logger().info('start_time')
timeout_seconds = 30
final_transcript_received = False
try:
for response in responses:
self.get_logger().info('responses loop')
if time.time() - start_time > timeout_seconds:
self.get_logger().error("Timeout: No final result after %d seconds." % timeout_seconds)
sResponse.success = False
sResponse.message = "timeout"
break # Exit the loop if we've reached the timeout without a final result
# Check if there are any results in this response
if not response.results:
continue
# The first result is the most relevant for single utterance mode
result = response.results[0]
# Check if the result is final
if result.is_final:
final_transcript_received = True
# Extract the top alternative of the final result
top_transcript = result.alternatives[0].transcript
self.get_logger().info(f"Final transcript: {top_transcript}")
sResponse.success = True
sResponse.message = top_transcript
break # Exit the loop since we've received a final transcript
if not final_transcript_received:
self.get_logger().error("No final transcript received.")
sResponse.success = False
except Exception as e:
self.get_logger().error(f"Error during speech recognition: {e}")
sResponse.success = False
return sResponse
self.get_logger().debug('GSTTService complete request')
return sResponse
sResponse.success = False
sResponse.message = "No final result was obtained."
self.get_logger().info('No final result was obtained.')
return sResponse
else:
sResponse.success = False
sResponse.message = "The service must be called with 'True' to start recognition."
return sResponse
I am struggling to set a time limit for the server in order to recover from the case where no user utterance is detected. The streaming config is the following:
# Streaming options for the recognizer. single_utterance asks the service to
# stop once it has detected the end of the user's utterance.
# NOTE(review): the voice-activity options tried earlier
# (enable_voice_activity_events, voice_activity_timeout, speech_end_timeout)
# stay disabled. speech_end_timeout looks like a field of a nested
# voice-activity-timeout message rather than of this config -- confirm against
# the API reference before enabling.
streaming_options = {
    "config": self.config,
    "single_utterance": True,
}
self.streaming_config = speech.StreamingRecognitionConfig(**streaming_options)
I also tried to uncomment the last line, but it seems that speech_end_timeout is not recognized as a parameter.
Thank you everyone!
I tried to use this code to stop the streaming after some timeout, but it doesn't work.