Bad types trying to use audio recognition loopback

375 Views Asked by At

The Microsoft Speech system has a good example code, but I have a problem when adding loopback to record what it is playing not what is coming through mic. To give text description of a video for example while not playing it on the speaker. It seems like this is the library to do it but I'm getting type errors pushing it to the audio stream of the recognizer:

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;

using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;

namespace SpeechRecognitionApp
{
    class Program
    {
        static void Main(string[] args)
        {

            // Create an in-process speech recognizer for the en-US locale.  
            using (
            SpeechRecognitionEngine recognizer =
              new SpeechRecognitionEngine(
                new System.Globalization.CultureInfo("en-US")))
            {

                // Create and load a dictation grammar.  
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.  
                recognizer.SpeechRecognized +=
                  new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Configure input to the speech recognizer.  
                //recognizer.SetInputToDefaultAudioDevice();  
                WasapiLoopbackCapture capture = new WasapiLoopbackCapture();
                Stream captureStream = new System.IO.MemoryStream();
                capture.DataAvailable += (s, a) =>
                {
                    captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                    captureStream.Flush();
                };
                capture.StartRecording();
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);
                recognizer.SetInputToAudioStream(captureStream, new SpeechAudioFormatInfo(
                    capture.WaveFormat.AverageBytesPerSecond, AudioBitsPerSample.Sixteen, AudioChannel.Stereo));

                // Start asynchronous, continuous speech recognition.  
                recognizer.RecognizeAsync(RecognizeMode.Multiple);

                // Keep the console window open.  
                while (true)
                {
                    Console.ReadLine();
                }
            }
        }

        // Handle the SpeechRecognized event.  
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}

Updated As you can see in the revised code, this is compiling at least now, but does not recognize any speech internal or external. In fact it outputs:

384000
32

So since there is no "thirtytwo" on AudioBitsPerSample perhaps I can't even use NAudio class to get system audio??

Update This seems to work somewhat, based on another answer but it doesn't pick up very much, I'm thinking it's sending slooow or fast audio in perhaps?

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;

using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;

namespace SpeechRecognitionApp
{

    class FakeStreamer : Stream
    {
        public bool bExit = false;
        Stream stream;
        Stream client;
        public FakeStreamer(Stream client)
        {
            this.client = client;
            this.stream = client;
        }
        public override bool CanRead
        {
            get { return stream.CanRead; }
        }

        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return stream.CanWrite; }
        }

        public override long Length
        {
            get { return -1L; }
        }

        public override long Position
        {
            get { return 0L; }
            set { }
        }
        public override long Seek(long offset, SeekOrigin origin)
        {
            return 0L;
        }

        public override void SetLength(long value)
        {
            stream.SetLength(value);
        }
        public override int Read(byte[] buffer, int offset, int count)
        {
            int len = 0, c = count;
            while (c > 0 && !bExit)
            {
                try
                {
                    len = stream.Read(buffer, offset, c);
                }
                catch (Exception e)
                {
                    Console.WriteLine("ouch");
                }
                /*if (!client.Connected || len == 0)
                {
                    //Exit read loop
                    return 0;
                }*/
                offset += len;
                c -= len;
            }
            return count;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            stream.Write(buffer, offset, count);
        }

        public override void Close()
        {
            stream.Close();
            base.Close();
        }

        public override void Flush()
        {
            stream.Flush();
        }
    }
    class Program
    {
        static void Main(string[] args)
        {

            // Create an in-process speech recognizer for the en-US locale.  
            using (
            SpeechRecognitionEngine recognizer =
              new SpeechRecognitionEngine(
                new System.Globalization.CultureInfo("en-US")))
            {

                // Create and load a dictation grammar.  
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.  
                recognizer.SpeechRecognized +=
                  new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Configure input to the speech recognizer.  
                //recognizer.SetInputToDefaultAudioDevice();  
                WasapiLoopbackCapture capture = new WasapiLoopbackCapture();
                Stream captureStream = new System.IO.MemoryStream();
                Stream buffStream = new FakeStreamer(captureStream);
                capture.DataAvailable += (s, a) =>
                {
                    captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                };
                capture.StartRecording();
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);

                //recognizer.SetInputToDefaultAudioDevice();
                recognizer.SetInputToAudioStream(buffStream, new SpeechAudioFormatInfo(
                    capture.WaveFormat.AverageBytesPerSecond/4, AudioBitsPerSample.Eight, AudioChannel.Stereo));

                // Start asynchronous, continuous speech recognition.  
                recognizer.RecognizeAsync(RecognizeMode.Multiple);

                // Keep the console window open.  
                while (true)
                {
                    Console.ReadLine();
                }
            }
        }

        // Handle the SpeechRecognized event.  
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}

Update 3 - Trying to recode the audio stream to what voice recognition will do: unfortunately it does not get to the capture recoded audio as you can see...

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;

using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;

namespace SpeechRecognitionApp
{

    class FakeStreamer : Stream
    {
        public bool bExit = false;
        Stream stream;
        Stream client;
        public FakeStreamer(Stream client)
        {
            this.client = client;
            this.stream = client;
        }
        public override bool CanRead
        {
            get { return stream.CanRead; }
        }

        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return stream.CanWrite; }
        }

        public override long Length
        {
            get { return -1L; }
        }

        public override long Position
        {
            get { return 0L; }
            set { }
        }
        public override long Seek(long offset, SeekOrigin origin)
        {
            return 0L;
        }

        public override void SetLength(long value)
        {
            stream.SetLength(value);
        }
        public override int Read(byte[] buffer, int offset, int count)
        {
            int len = 0, c = count;
            while (c > 0 && !bExit)
            {
                try
                {
                    len = stream.Read(buffer, offset, c);
                }
                catch (Exception e)
                {
                    Console.WriteLine("ouch");
                }
                /*if (!client.Connected || len == 0)
                {
                    //Exit read loop
                    return 0;
                }*/
                offset += len;
                c -= len;
            }
            return count;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            stream.Write(buffer, offset, count);
        }

        public override void Close()
        {
            stream.Close();
            base.Close();
        }

        public override void Flush()
        {
            stream.Flush();
        }
    }
    class Program
    {
        static void Main(string[] args)
        {

            // Create an in-process speech recognizer for the en-US locale.  
            using (
            SpeechRecognitionEngine recognizer =
              new SpeechRecognitionEngine(
                new System.Globalization.CultureInfo("en-US")))
            {

                // Create and load a dictation grammar.  
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.  
                recognizer.SpeechRecognized +=
                  new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Configure input to the speech recognizer.  
                //recognizer.SetInputToDefaultAudioDevice();  
                WasapiLoopbackCapture capture = new WasapiLoopbackCapture();
                Stream captureStream = new System.IO.MemoryStream();
                //Stream buffStream = new FakeStreamer(captureStream);
                capture.DataAvailable += (s, a) =>
                {
                    captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                };
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);
                var newFormat = new WaveFormat(8000, 16, 1);
                //using (var conversionStream = new WaveFormatConversionStream(newFormat, capture)

                var resampler = new MediaFoundationResampler(new NAudio.Wave.RawSourceWaveStream(captureStream,capture.WaveFormat), newFormat);
                Stream captureConvertStream = new System.IO.MemoryStream();
                resampler.ResamplerQuality = 60;
                    //WaveFileWriter.WriteWavFileToStream(captureConvertStream, resampler);
                    //recognizer.SetInputToDefaultAudioDevice();
                    Stream buffStream = new FakeStreamer(captureConvertStream);

                recognizer.SetInputToAudioStream(buffStream, new SpeechAudioFormatInfo(
                    8000, AudioBitsPerSample.Sixteen, AudioChannel.Mono));

                // Start asynchronous, continuous speech recognition.  
                recognizer.RecognizeAsync(RecognizeMode.Multiple);

                capture.StartRecording();
                var arr = new byte[128];
                while (resampler.Read(arr, 0, arr.Length) > 0)
                {
                    captureConvertStream.Write(arr, 0, arr.Length);
                    Console.WriteLine("Never getting here");
                }
                // Keep the console window open.  
                while (true)
                {
                    Console.ReadLine();
                }
            }
        }

        // Handle the SpeechRecognized event.  
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}
0

There are 0 best solutions below