How should I encode an audio stream for Google Voice recognition in golang?

1.8k Views Asked by At

I'm building a discord bot that uses an Opus stream. I've tried various things such as sending the Opus packets up directly, decoding the OPUS stream to a PCM and encoding it into a byte array, and converting the PCM to a byte array directly. In all cases I get:

Could not recognize: code:11 message:"Audio data is being streamed too slow. Please stream audio data approximately at real time."

I've tried 8kHz-48kHz frequencies at 20 ms frames. I've also tried to encode the converted PCM with the max bitrate. I have run the sample code successfully, so there is no connection issue on my end. Where should I look for a solution?

package main

import (
    "fmt"
    //"io"
    "log"
    "os"
    "flag"

    speech "cloud.google.com/go/speech/apiv1"
    "golang.org/x/net/context"
    speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"

    "github.com/bwmarrin/discordgo"
    "layeh.com/gopus"
)

// OnError gets called by dgvoice when an error is encountered.
// By default logs to STDERR
var OnError = func(str string, err error) {
    prefix := "dgVoice: " + str

    if err != nil {
        os.Stderr.WriteString(prefix + ": " + err.Error())
    } else {
        os.Stderr.WriteString(prefix)
    }
}

var stream speechpb.Speech_StreamingRecognizeClient

func main() {
    var (
        Token     = flag.String("t", "", "Discord bot token.")
//      Email     = flag.String("e", "", "Discord account email.")
//      Password  = flag.String("p", "", "Discord account password.")
        GuildID   = flag.String("g", "", "Guild ID")
        ChannelID = flag.String("c", "", "Channel ID")
    )
    flag.Parse()


    fmt.Println("Connecting to Discord...")
    // Connect to Discord
    discord, err := discordgo.New(*Token)
    if err != nil {
        fmt.Println(err)
        return
    }

    fmt.Println("Opening Socket...")
    // Open Websocket
    err = discord.Open()
    if err != nil {
        fmt.Println(err)
        return
    }

    fmt.Println("Joining Channel...")
    // Connect to voice channel.
    // NOTE: Setting mute to false, deaf to true.
    dgv, err := discord.ChannelVoiceJoin(*GuildID, *ChannelID, false, false)
    if err != nil {
        fmt.Println(err)
        return
    }

    fmt.Println("Connecting to Google Speech Recognition API...")
    ctx := context.Background()

    // [START speech_streaming_mic_recognize]
    client, err := speech.NewClient(ctx)
    if err != nil {
        log.Fatal(err)
    }
    stream, err = client.StreamingRecognize(ctx)
    if err != nil {
        log.Fatal(err)
    }
    // Send the initial configuration message.
    if err := stream.Send(&speechpb.StreamingRecognizeRequest{
        StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
            StreamingConfig: &speechpb.StreamingRecognitionConfig{
                Config: &speechpb.RecognitionConfig{
                    Encoding:        speechpb.RecognitionConfig_LINEAR16,
                    SampleRateHertz: 16000,
                    LanguageCode:    "en-US",
                },
                //InterimResults: true,
                SingleUtterance: true,
            },

        },
    }); err != nil {
        log.Fatal(err)
    }


    recv := make(chan *discordgo.Packet, 2)
    go Receive(dgv, recv)

    send := make(chan []int16, 2)
    go Send(dgv, send)

    // dgv.Speaking(true)
 //     defer dgv.Speaking(false)

    go func() {
        for {

            p, ok := <-recv
            if !ok {
                fmt.Println("Not OK")
                return
            }

            send <- p.PCM
        }

    } ()

    for {
        resp, err := stream.Recv()
        //fmt.Printf("%+v\n",resp)
        if err != nil {
            log.Fatalf("Cannot stream results: %v", err)
        }
        if err := resp.Error; err != nil {
            log.Fatalf("Could not recognize: %v", err)
        }
        for _, result := range resp.Results {
            fmt.Printf("Result: %+v\n", result)
        }
    }

    // Close connections
    dgv.Close()
    discord.Close()

    return
}

func Receive(v *discordgo.VoiceConnection, c chan *discordgo.Packet) {
    var speakers    map[uint32]*gopus.Decoder
    if c == nil {
        return
    }

    var err error
    for {
        p, ok := <-v.OpusRecv
        if !ok {
            return
        }

        if speakers == nil {
            speakers = make(map[uint32]*gopus.Decoder)
        }

        _, ok = speakers[p.SSRC]
        if !ok {
            speakers[p.SSRC], err = gopus.NewDecoder(16000, 1)
            if err != nil {
                OnError("error creating opus decoder", err)
                continue
            }
        }
        p.PCM, err = speakers[p.SSRC].Decode(p.Opus, 320, false)
        if err != nil {
            OnError("Error decoding opus data", err)
            continue
        }

        // try encoding pcm frame with Opus

        c <- p
    }
}

func Send(v *discordgo.VoiceConnection,  pcm <- chan []int16) {
    for {

        // read pcm from chan, exit if channel is closed.
        recv, ok := <-pcm
        if !ok {
            OnError("PCM Channel closed", nil)
            return
        }


        buf := make([]byte,2*len(recv))

        for i := 0; i < len(recv); i+=2 {
            var h, l uint8 = uint8(i>>8), uint8(i&0xff)
            buf[i] = h
            buf[i+1] = l
        }


        stream.Send(&speechpb.StreamingRecognizeRequest{
            StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
                AudioContent: buf,
            },
        });
    }

}

1

There are 1 best solutions below

0
On

The Google Speech-to-Text documentation has a fully working example of streaming speech recognition in Go.

"Audio data is being streamed too slow" is sent by the server when it is not receiving audio in realtime. In this case, the above code contains a bug that results in only half a PCM frame being sent on each iteration of the Send loop:

        for i := 0; i < len(recv); i+=2 {
            var h, l uint8 = uint8(i>>8), uint8(i&0xff)
            buf[i] = h
            buf[i+1] = l
        }

recv is a slice of int16 values, so it should be iterated over one value at a time, not i+=2, which skips every other value. buf is a uint8 slice, so the indexing for that is valid.