I am using the Google Speech-to-Text API to transcribe audio files (WAV files) stored in a GCS bucket. The audio files are phone recordings with 3 speakers (IVR, Customer, and Engineer), but the transcripts I get, whether I use diarization or not, contain only the IVR and Engineer parts of the conversation. The Customer's side of the conversation is missing entirely, which suggests this is not a transcription error but rather a configuration issue in my script or in the way I use the API.
Below is the script with the payload I am using in the POST request:
import pprint
import time

import requests
from google.oauth2 import service_account
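
# A minimal sketch of how the bearer token used below can be minted with the
# imported service_account module; the key-file path is a placeholder, and the
# cloud-platform scope is an assumption about how the project is set up.
from google.auth.transport.requests import Request

credentials = service_account.Credentials.from_service_account_file(
    "service-account-key.json",  # hypothetical path to the key file
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
credentials.refresh(Request())  # populates credentials.token
access_token = credentials.token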
endpoint = "https://speech.googleapis.com/v1/speech:longrunningrecognize"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + access_token,  # token minted above
}
data = {
    "config": {
        "encoding": "LINEAR16",
        "languageCode": "en-US",
        "audioChannelCount": 2,
        "diarizationConfig": {
            "enableSpeakerDiarization": True,
            "minSpeakerCount": 3,
            "maxSpeakerCount": 3
        },
        "model": "phone_call"
    },
    "audio": {
        "uri": "gs://store_media_files/record5.wav"
    }
}
response = requests.post(
    endpoint,
    headers=headers,
    json=data
)
json_response = response.json()
operation_name = json_response["name"]
while True:
    # Poll the long-running operation until it reports completion.
    response = requests.get(
        f"https://speech.googleapis.com/v1/operations/{operation_name}",
        headers=headers
    )
    json_response = response.json()
    if json_response.get("done"):
        results = json_response["response"]["results"]
        pprint.pprint(results)
        break  # stop polling once the transcript is ready
    time.sleep(30)  # long-running recognition can take a while
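For reference, this is how I read the diarized words out of the result. With diarization enabled, the last result's first alternative aggregates every word of the call with its speaker tag; a minimal sketch, assuming the standard v1 response shape:

# The last result carries the full word list when diarization is on,
# each word annotated with the speakerTag the API assigned.
for w in results[-1]["alternatives"][0]["words"]:
    print(w.get("speakerTag"), w["word"])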
Any suggestions on what is missing or wrong in this configuration that causes it to skip one of the speakers during transcription?
I was able to fix it by adding one line to the data payload.
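With audioChannelCount set to 2, the v1 API recognizes only the first channel by default, which matches the symptom exactly; assuming that is the line in question, it is enableSeparateRecognitionPerChannel, and the config data is now:

data = {
    "config": {
        "encoding": "LINEAR16",
        "languageCode": "en-US",
        "audioChannelCount": 2,
        # Without this flag only the first channel is transcribed,
        # which is why the Customer's side never showed up.
        "enableSeparateRecognitionPerChannel": True,
        "diarizationConfig": {
            "enableSpeakerDiarization": True,
            "minSpeakerCount": 3,
            "maxSpeakerCount": 3
        },
        "model": "phone_call"
    },
    "audio": {
        "uri": "gs://store_media_files/record5.wav"
    }
}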