Google Speech-to-Text v2 doesn't accept audio in Node.js

236 Views Asked by At

I've been trying for a couple of days now to migrate to Google Speech-to-Text (STT) v2 using Node.js. In v1 everything worked perfectly. I have created a recognizer and written a script based on https://github.com/GoogleCloudPlatform/nodejs-docs-samples/blob/main/speech/transcribeStreaming.v2.js

My goal is to transcribe audio coming from a Twilio phone call. Twilio connects to my secure WebSocket (WSS) endpoint and streams the audio data, which I pass to Google's streaming recognition. My code looks like this:

const speech = require('@google-cloud/speech').v2;
const fs = require('fs');

// Speech-to-Text v2 client pointed at the eu-speech.googleapis.com endpoint,
// with credentials read from ./googlecreds.json.
const clientOptions = {
  apiEndpoint: 'eu-speech.googleapis.com',
  keyFilename: './googlecreds.json',
};
const client = new speech.SpeechClient(clientOptions);

// Resource name of the recognizer that the streaming requests are addressed to.
const recognizerName = 'projects/12345678910/locations/eu/recognizers/name';

// Recognition settings embedded in the streaming config below.
// NOTE(review): the key is spelled `audoDecodingConfig`. The Speech-to-Text v2
// RecognitionConfig reference lists `autoDecodingConfig` and `explicitDecodingConfig`;
// confirm which one is intended, since a misspelled key presumably selects neither.
const recognitionConfig = {
  audoDecodingConfig: {},
};

// Streaming-specific wrapper around the recognition settings.
const streamingConfig = {
  config: recognitionConfig,
};

// Initial request written to the recognize stream before any audio: it names the
// recognizer and carries the streaming config.
const configRequest = {
  recognizer: recognizerName,
  streamingConfig: streamingConfig,
};

const express = require('express');
const bodyParser = require('body-parser');
// Express application with URL-encoded body parsing (extended mode) installed.
const app = express();
const urlencodedParser = bodyParser.urlencoded({ extended: true });
app.use(urlencodedParser);

// Read the key, certificate and CA files as UTF-8 text.
const readUtf8 = (path) => fs.readFileSync(path, 'utf8');
const privateKey = readUtf8('location');
const certificate = readUtf8('location');
const ca = readUtf8('location');

// Options object passed to https.createServer.
const credentials = { key: privateKey, cert: certificate, ca };

//wss
const WebSocket = require('ws');
const https = require('https');
// HTTPS server in front of the Express app; the WebSocket server is attached to
// it and configured with the '/stream' path.
const server = https.createServer(credentials, app);
const wssOptions = { server, path: '/stream' };
const wss = new WebSocket.Server(wssOptions);

// Handles each WebSocket connection: every incoming message is parsed as JSON and
// its `event` field ("start" / "media" / "stop") drives one recognize stream per connection.
wss.on("connection", async function connection(ws) {
    // Recognize stream for this connection; stays null until a "start" event arrives.
    // NOTE(review): a "media" or "stop" event received before "start" would call a
    // method on null here.
    let recognizeStream = null;
    ws.on("message", function incoming(message) {
        const msg = JSON.parse(message);
        switch (msg.event) {
            case "start":
                // Open the streaming call. Each 'data' response logs the transcript of the
                // first alternative of the first result; errors are logged by message only.
                recognizeStream = client
                ._streamingRecognize()
                .on('data', response => {
                  const {results} = response;
                  console.log(results[0].alternatives[0].transcript);
                })
                .on('error', err => {
                  console.error(err.message);
                })
                // The config request goes out first, ahead of any audio.
                recognizeStream.write(configRequest);
                break;
            case "media":
                // Write the raw media data to the recognize stream
                // NOTE(review): the payload is forwarded exactly as it appears in the parsed
                // message; the accepted answer below decodes it from base64 into a Buffer first.
                recognizeStream.write({audio: msg.media.payload});
                break;
            case "stop":
                // Stop the recognize stream
                recognizeStream.end();
                break;
        }
    });
});

/**
 * POST /voice handler. Responds with an XML document (content type text/xml)
 * that contains a <Say> prompt, a <Connect><Stream> pointing at the WebSocket
 * endpoint, and a trailing <Pause length="60"/>.
 *
 * @param {object} req - Incoming Express request (unused).
 * @param {object} res - Express response used to send the XML.
 */
app.post('/voice', (req, res) => {
  // Keep the markup in a local const so it never leaks onto the global object;
  // an undeclared assignment also throws a ReferenceError in strict mode / ES modules.
  const twiml = `
<Response>
    <Say>talk now</Say>
    <Connect>
        <Stream url="wss://my.domain.com/stream"/>
    </Connect>
    <Pause length="60"/>
</Response>
`;
  res.type('text/xml');
  res.send(twiml);
});


// Listen on 0.0.0.0, using PORT from the environment when it is set (and truthy)
// and 8080 otherwise; log the chosen port once the server is up.
const port = process.env.PORT || 8080;
const logListening = () => {
  console.log(`Server running on port ${port}`);
};
server.listen(port, '0.0.0.0', logListening);

The stream connects and the config request is written without an error. I can log the received msg.media.payload from Twilio in my "media" case, but writing it to recognizeStream does nothing: I never get any transcription results back. I'm not sure what else to try.

1

There is 1 best solution below

0
zag2art On BEST ANSWER

I was working on the same feature and was able to resolve it. Two fixes were needed:

1. Config: specify the audio decoding explicitly

// Explicit decoding parameters: MULAW encoding, 8000 Hz sample rate, one channel.
// NOTE(review): these presumably mirror the audio format of Twilio's media stream;
// confirm against the Twilio Media Streams documentation.
const explicitDecodingConfig = {
  encoding: 'MULAW',
  sampleRateHertz: 8000,
  audioChannelCount: 1,
};

// Recognition settings that carry only the explicit decoding config.
const recognitionConfig = { explicitDecodingConfig };

2. Buffer conversion: decode the base64 payload before writing it

// Decode the base64 payload into a Buffer, then write it as the audio chunk
// (skipped when no recognize stream exists).
const audioBytes = Buffer.from(msg.media.payload, 'base64');
recognizeStream?.write({ audio: audioBytes });