I'm building a discord bot that uses an Opus stream. I've tried various things such as sending the Opus packets up directly, decoding the OPUS stream to a PCM and encoding it into a byte array, and converting the PCM to a byte array directly. In all cases I get:
Could not recognize: code:11 message:"Audio data is being streamed too slow. Please stream audio data approximately at real time."
I've tried 8kHz-48kHz frequencies at 20 ms frames. I've also tried to encode the converted PCM with the max bitrate. I have run the sample code successfully, so there is no connection issue on my end. Where should I look for a solution?
package main
import (
"fmt"
//"io"
"log"
"os"
"flag"
speech "cloud.google.com/go/speech/apiv1"
"golang.org/x/net/context"
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
"github.com/bwmarrin/discordgo"
"layeh.com/gopus"
)
// OnError gets called by dgvoice when an error is encountered.
// By default logs to STDERR
var OnError = func(str string, err error) {
prefix := "dgVoice: " + str
if err != nil {
os.Stderr.WriteString(prefix + ": " + err.Error())
} else {
os.Stderr.WriteString(prefix)
}
}
var stream speechpb.Speech_StreamingRecognizeClient
func main() {
var (
Token = flag.String("t", "", "Discord bot token.")
// Email = flag.String("e", "", "Discord account email.")
// Password = flag.String("p", "", "Discord account password.")
GuildID = flag.String("g", "", "Guild ID")
ChannelID = flag.String("c", "", "Channel ID")
)
flag.Parse()
fmt.Println("Connecting to Discord...")
// Connect to Discord
discord, err := discordgo.New(*Token)
if err != nil {
fmt.Println(err)
return
}
fmt.Println("Opening Socket...")
// Open Websocket
err = discord.Open()
if err != nil {
fmt.Println(err)
return
}
fmt.Println("Joining Channel...")
// Connect to voice channel.
// NOTE: Setting mute to false, deaf to true.
dgv, err := discord.ChannelVoiceJoin(*GuildID, *ChannelID, false, false)
if err != nil {
fmt.Println(err)
return
}
fmt.Println("Connecting to Google Speech Recognition API...")
ctx := context.Background()
// [START speech_streaming_mic_recognize]
client, err := speech.NewClient(ctx)
if err != nil {
log.Fatal(err)
}
stream, err = client.StreamingRecognize(ctx)
if err != nil {
log.Fatal(err)
}
// Send the initial configuration message.
if err := stream.Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
StreamingConfig: &speechpb.StreamingRecognitionConfig{
Config: &speechpb.RecognitionConfig{
Encoding: speechpb.RecognitionConfig_LINEAR16,
SampleRateHertz: 16000,
LanguageCode: "en-US",
},
//InterimResults: true,
SingleUtterance: true,
},
},
}); err != nil {
log.Fatal(err)
}
recv := make(chan *discordgo.Packet, 2)
go Receive(dgv, recv)
send := make(chan []int16, 2)
go Send(dgv, send)
// dgv.Speaking(true)
// defer dgv.Speaking(false)
go func() {
for {
p, ok := <-recv
if !ok {
fmt.Println("Not OK")
return
}
send <- p.PCM
}
} ()
for {
resp, err := stream.Recv()
//fmt.Printf("%+v\n",resp)
if err != nil {
log.Fatalf("Cannot stream results: %v", err)
}
if err := resp.Error; err != nil {
log.Fatalf("Could not recognize: %v", err)
}
for _, result := range resp.Results {
fmt.Printf("Result: %+v\n", result)
}
}
// Close connections
dgv.Close()
discord.Close()
return
}
func Receive(v *discordgo.VoiceConnection, c chan *discordgo.Packet) {
var speakers map[uint32]*gopus.Decoder
if c == nil {
return
}
var err error
for {
p, ok := <-v.OpusRecv
if !ok {
return
}
if speakers == nil {
speakers = make(map[uint32]*gopus.Decoder)
}
_, ok = speakers[p.SSRC]
if !ok {
speakers[p.SSRC], err = gopus.NewDecoder(16000, 1)
if err != nil {
OnError("error creating opus decoder", err)
continue
}
}
p.PCM, err = speakers[p.SSRC].Decode(p.Opus, 320, false)
if err != nil {
OnError("Error decoding opus data", err)
continue
}
// try encoding pcm frame with Opus
c <- p
}
}
func Send(v *discordgo.VoiceConnection, pcm <- chan []int16) {
for {
// read pcm from chan, exit if channel is closed.
recv, ok := <-pcm
if !ok {
OnError("PCM Channel closed", nil)
return
}
buf := make([]byte,2*len(recv))
for i := 0; i < len(recv); i+=2 {
var h, l uint8 = uint8(i>>8), uint8(i&0xff)
buf[i] = h
buf[i+1] = l
}
stream.Send(&speechpb.StreamingRecognizeRequest{
StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
AudioContent: buf,
},
});
}
}
The Google Speech-to-Text documentation has a fully working example of streaming speech recognition in Go.
"Audio data is being streamed too slow" is sent by the server when it is not receiving audio in realtime. In this case, the above code contains a bug that results in only half a PCM frame being sent on each iteration of the Send loop:
recv is a slice of int16 values, so it should be iterated over one value at a time, not
i+=2
, which skips every other value.buf
is a uint8 slice, so the indexing for that is valid.