Client-server implementation for speech recognition with sphinx4

1k Views Asked by At

I try to recognize the speech, which was received by microphone from Android device(client side). After that, I send the DatagramPacket with the speech to server, which realizes speech recognition using Sphinx 4. But on the server side i don't get any result. I always get result==null. What is the problem?

client side

import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder;


import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.DatagramPacket;
import java.net.DatagramSocket;
import java.net.InetAddress;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;

public class Recording {

    public byte[] buffer;
    public DatagramSocket socket;
    private int port =  8080;
    AudioRecord audioRecord;

    private int sampleRate = 16000;
    private int channelConfig = AudioFormat.CHANNEL_IN_MONO;
    private int audioFormat = AudioFormat.ENCODING_PCM_16BIT;

    private boolean status  = true;

    public void stopListener() {
        status = false;
        //audioRecord.stop();
        audioRecord.release();
    }

    public void startListener() {
        status = true;
        startStreaming();
    }

    private void startStreaming() {

        Thread streamThread =  new Thread(new Runnable() {

            @Override
            public void run() {
                try {
                    int minBufSize =  AudioRecord.getMinBufferSize(sampleRate, channelConfig, audioFormat);
                    System.out.println("minBufSize = " + minBufSize);
                    socket = new DatagramSocket();
                    System.out.println("Socket Created");

                    byte[] buffer =  new byte[minBufSize];

                    DatagramPacket packet;
                    final InetAddress destination =  InetAddress.getByName("192.168.0.74");
                    System.out.println("Ip address recieved");

                    socket.connect(destination,port);
                    System.out.println("Socket connected");

                    audioRecord = new AudioRecord(MediaRecorder.AudioSource.MIC,sampleRate,channelConfig,audioFormat,minBufSize);
                    System.out.println("Record initialized");
                    audioRecord.startRecording();


                    while (status)
                    {
                        //reading data from MIC into buffer
                        minBufSize = audioRecord.read(buffer,0,buffer.length);
                       // System.out.println("minBufSze were read " + minBufSize);


                        //putting buffer in the packet
                        packet = new DatagramPacket(buffer,buffer.length,destination,port);

                        socket.send(packet);
                    }

                    }

                } catch (SocketException e) {
                    e.printStackTrace();
                } catch (UnknownHostException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }

            }
        });
        streamThread.start();
    }
}

server side

import edu.cmu.sphinx.tools.audio.AudioData;

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import java.io.*;
import java.net.DatagramPacket;
import java.net.DatagramSocket;
import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.SpeechResult;
import edu.cmu.sphinx.api.StreamSpeechRecognizer;
import edu.cmu.sphinx.frontend.util.AudioFileDataSource;
import edu.cmu.sphinx.frontend.util.Microphone;
import edu.cmu.sphinx.frontend.util.StreamDataSource;
import edu.cmu.sphinx.recognizer.Recognizer;
import edu.cmu.sphinx.result.Lattice;
import edu.cmu.sphinx.result.LatticeOptimizer;
import edu.cmu.sphinx.result.Result;
import edu.cmu.sphinx.tools.audio.AudioData;
import edu.cmu.sphinx.util.props.ConfigurationManager;

import javax.sound.sampled.AudioInputStream;
import java.io.IOException;

public class Server {

    public static void main(String[] args) throws IOException {
        int port = 8080;
        int sampleRate = 16000;
        boolean status = true;
        AudioInputStream ais;
        AudioFormat format;
        boolean flag = false;
        Recognition recognition =  new Recognition();


        System.out.println("Welcome to server side.");
        DatagramSocket serverSocket =  new DatagramSocket(port);


        format =  new AudioFormat(sampleRate,16,1,true,false);
        byte[] receiveData = new byte[4096];

        while (status) {
            DatagramPacket receivePacket = new DatagramPacket(receiveData,receiveData.length);
            ByteArrayInputStream bais = new ByteArrayInputStream(receivePacket.getData());
            serverSocket.receive(receivePacket);
            for (int i=0; i<receiveData.length; i++)
            {
                out.println(receiveData[i]);
            }
            ais = new AudioInputStream(bais,format, receivePacket.getLength());
            System.out.println("ServerSocket is " + serverSocket.isConnected());
            recognition.Recognize(ais);

        }
    }
}


public class Recognition {

    ConfigurationManager cm;
    Recognizer recognizer;
    Configuration configuration;
    AudioInputStream audioInputStream;

    public Recognition() {
        configuration = new Configuration();

        cm = new ConfigurationManager(Recognition.class.getResource("/src/config.xml"));
        recognizer = (Recognizer) cm.lookup("recognizer");
        recognizer.allocate();

    }

    public void Recognize(AudioInputStream ais) {


            audioInputStream = ais;

            AudioFileDataSource dataSource = (AudioFileDataSource) cm.lookup("audioFileDataSource");
            dataSource.setInputStream(audioInputStream,"stream");

            Result result = recognizer.recognize();
            System.out.println("Say: (Good morning | Hello) ( Bhiksha | Evandro | Paul | Philip | Rita | Will )");
            if (result != null) {
                Lattice lattice = new Lattice(result);
                LatticeOptimizer optimizer = new LatticeOptimizer(lattice);
                optimizer.optimize();
                lattice.dumpAllPaths();
                String resultText = result.getBestResultNoFiller();
                System.out.println("I heard: " + resultText + '\n');
            } else {
                System.out.println("I could't hear you!");
            }
    }
}

config.xml

<?xml version="1.0" encoding="UTF-8"?>

<!--
   Sphinx-4 Configuration file
-->

<!-- ******************************************************** -->
<!--  biship  configuration file                              -->
<!-- ******************************************************** -->

<config>        
    <!-- ******************************************************** -->
    <!-- frequently tuned properties                              -->
    <!-- ******************************************************** --> 
    <property name="absoluteBeamWidth"  value="500"/>
    <property name="relativeBeamWidth"  value="1E-80"/>
    <property name="absoluteWordBeamWidth" value="20"/>
    <property name="relativeWordBeamWidth" value="1E-60"/>
    <property name="wordInsertionProbability" value="1E-16"/>
    <property name="languageWeight" value="7.0"/>
    <property name="silenceInsertionProbability" value=".1"/>
    <property name="frontend" value="epFrontEnd"/>
    <property name="recognizer" value="recognizer"/>
    <property name="showCreations" value="false"/>


    <!-- ******************************************************** -->
    <!-- word recognizer configuration                            -->
    <!-- ******************************************************** --> 

    <component name="recognizer" 
                          type="edu.cmu.sphinx.recognizer.Recognizer">
        <property name="decoder" value="decoder"/>
        <propertylist name="monitors">
            <item>accuracyTracker </item>
            <item>speedTracker </item>
            <item>memoryTracker </item>
            <item>recognizerMonitor </item>
        </propertylist>
    </component>

    <!-- ******************************************************** -->
    <!-- The Decoder   configuration                              -->
    <!-- ******************************************************** --> 

    <component name="decoder" type="edu.cmu.sphinx.decoder.Decoder">
        <property name="searchManager" value="wordPruningSearchManager"/>
        <property name="featureBlockSize" value="50"/>
    </component>

    <!-- ******************************************************** -->
    <!-- The Search Manager                                       -->
    <!-- ******************************************************** --> 

    <component name="wordPruningSearchManager" 
    type="edu.cmu.sphinx.decoder.search.WordPruningBreadthFirstSearchManager">
        <property name="logMath" value="logMath"/>
        <property name="linguist" value="lexTreeLinguist"/>
        <property name="pruner" value="trivialPruner"/>
        <property name="scorer" value="threadedScorer"/>
        <property name="activeListManager" value="activeListManager"/>
        <property name="growSkipInterval" value="0"/>
        <property name="checkStateOrder" value="false"/>
        <property name="buildWordLattice" value="true"/>
        <property name="acousticLookaheadFrames" value="1.7"/>
        <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
    </component>


    <!-- ******************************************************** -->
    <!-- The Active Lists                                         -->
    <!-- ******************************************************** --> 

    <component name="activeListManager" 
             type="edu.cmu.sphinx.decoder.search.SimpleActiveListManager">
        <propertylist name="activeListFactories">
        <item>standardActiveListFactory</item>
        <item>wordActiveListFactory</item>
        <item>wordActiveListFactory</item>
        <item>standardActiveListFactory</item>
        <item>standardActiveListFactory</item>
        <item>standardActiveListFactory</item>
    </propertylist>
    </component>

    <component name="standardActiveListFactory" 
             type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
        <property name="logMath" value="logMath"/>
        <property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
        <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
    </component>

    <component name="wordActiveListFactory" 
             type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
        <property name="logMath" value="logMath"/>
        <property name="absoluteBeamWidth" value="${absoluteWordBeamWidth}"/>
        <property name="relativeBeamWidth" value="${relativeWordBeamWidth}"/>
    </component>

    <!-- ******************************************************** -->
    <!-- The Pruner                                               -->
    <!-- ******************************************************** --> 
    <component name="trivialPruner" 
                type="edu.cmu.sphinx.decoder.pruner.SimplePruner"/>

    <!-- ******************************************************** -->
    <!-- TheScorer                                                -->
    <!-- ******************************************************** --> 
    <component name="threadedScorer" 
                type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer">
        <property name="frontend" value="${frontend}"/>
    </component>

    <!-- ******************************************************** -->
    <!-- The linguist  configuration                              -->
    <!-- ******************************************************** -->

    <component name="lexTreeLinguist" 
                type="edu.cmu.sphinx.linguist.lextree.LexTreeLinguist">
        <property name="logMath" value="logMath"/>
        <property name="acousticModel" value="wsj"/>
        <property name="languageModel" value="trigramModel"/>
        <property name="dictionary" value="dictionary"/>
        <property name="addFillerWords" value="false"/>
        <property name="fillerInsertionProbability" value="1E-10"/>
        <property name="generateUnitStates" value="false"/>
        <property name="wantUnigramSmear" value="true"/>
        <property name="unigramSmearWeight" value="1"/>
        <property name="wordInsertionProbability" 
                value="${wordInsertionProbability}"/>
        <property name="silenceInsertionProbability" 
                value="${silenceInsertionProbability}"/>
        <property name="languageWeight" value="${languageWeight}"/>
        <property name="unitManager" value="unitManager"/>
    </component>    


    <!-- ******************************************************** -->
    <!-- The Dictionary configuration                            -->
    <!-- ******************************************************** -->
    <component name="dictionary" 
        type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">
        <property name="dictionaryPath"
                  value="resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/cmudict.0.6d"/>
        <property name="fillerPath" 
              value="resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/noisedict"/>
        <property name="addSilEndingPronunciation" value="false"/>
        <property name="wordReplacement" value="&lt;sil&gt;"/>
        <property name="unitManager" value="unitManager"/>
    </component>


    <!-- ******************************************************** -->
    <!-- The Language Model configuration                         -->
    <!-- ******************************************************** -->
    <component name="trigramModel" 
          type="edu.cmu.sphinx.linguist.language.ngram.large.LargeTrigramModel">
        <property name="unigramWeight" value=".5"/>
        <property name="maxDepth" value="3"/>
        <property name="logMath" value="logMath"/>
        <property name="dictionary" value="dictionary"/>
        <property name="location"
         value="./models/language/en-us.lm.dmp"/>
    </component>


    <!-- ******************************************************** -->
    <!-- The acoustic model configuration                         -->
    <!-- ******************************************************** -->
    <component name="wsj"
               type="edu.cmu.sphinx.linguist.acoustic.tiedstate.TiedStateAcousticModel">
        <property name="loader" value="wsjLoader"/>
        <property name="unitManager" value="unitManager"/>
    </component>

    <component name="wsjLoader" type="edu.cmu.sphinx.linguist.acoustic.tiedstate.Sphinx3Loader">
        <property name="logMath" value="logMath"/>
        <property name="unitManager" value="unitManager"/>
        <property name="location" value="resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz"/>
    </component>

    <!-- ******************************************************** -->
    <!-- The unit manager configuration                           -->
    <!-- ******************************************************** -->

    <component name="unitManager" 
        type="edu.cmu.sphinx.linguist.acoustic.UnitManager"/>


    <!-- ******************************************************** -->
    <!-- The frontend configuration                               -->
    <!-- ******************************************************** -->

    <component name="epFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd">
        <propertylist name="pipeline">
            <item>audioFileDataSource </item>
            <item>dataBlocker </item>
            <item>speechClassifier </item>
            <item>speechMarker </item>
            <item>nonSpeechDataFilter </item>
            <item>preemphasizer </item>
            <item>windower </item>
            <item>fft </item>
            <item>melFilterBank </item>
            <item>dct </item>
            <item>liveCMN </item>
            <item>featureExtraction </item>
        </propertylist>
    </component>

    <component name="audioFileDataSource" type="edu.cmu.sphinx.frontend.util.AudioFileDataSource"/>


    <component name="microphone" 
                type="edu.cmu.sphinx.frontend.util.Microphone">
        <property name="closeBetweenUtterances" value="false"/>
    </component>

    <component name="dataBlocker" type="edu.cmu.sphinx.frontend.DataBlocker"/>

    <component name="speechClassifier"
                type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier">
        <property name="threshold" value="13"/>
    </component>

    <component name="nonSpeechDataFilter" 
                type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"/>

    <component name="speechMarker" 
                type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker">
        <property name="speechTrailer" value="50"/>
    </component>

    <component name="preemphasizer"
        type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>

    <component name="windower" 
    type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower"/>

    <component name="fft" 
        type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform"/>

    <component name="melFilterBank" 
        type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank"/>

    <component name="dct" 
            type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>

    <component name="liveCMN" 
                type="edu.cmu.sphinx.frontend.feature.LiveCMN"/>

    <component name="featureExtraction" 
        type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>

    <!-- ******************************************************* -->
    <!--  monitors                                               -->
    <!-- ******************************************************* -->

    <component name="accuracyTracker" 
                type="edu.cmu.sphinx.instrumentation.BestPathAccuracyTracker">
        <property name="recognizer" value="${recognizer}"/>
        <property name="showRawResults" value="false"/>
        <property name="showAlignedResults" value="false"/>
    </component>

    <component name="memoryTracker" 
                type="edu.cmu.sphinx.instrumentation.MemoryTracker">
        <property name="recognizer" value="${recognizer}"/>
    <property name="showDetails" value="false"/>
    <property name="showSummary" value="false"/>
    </component>

    <component name="speedTracker" 
                type="edu.cmu.sphinx.instrumentation.SpeedTracker">
        <property name="recognizer" value="${recognizer}"/>
        <property name="frontend" value="${frontend}"/>
    <property name="showDetails" value="false"/>
    </component>

    <component name="recognizerMonitor" 
                type="edu.cmu.sphinx.instrumentation.RecognizerMonitor">
        <property name="recognizer" value="${recognizer}"/>
        <propertylist name="allocatedMonitors">
            <item>configMonitor </item>
        </propertylist>
    </component>

    <component name="configMonitor" 
                type="edu.cmu.sphinx.instrumentation.ConfigMonitor">
        <property name="showConfig" value="false"/>
    </component>


    <!-- ******************************************************* -->
    <!--  Miscellaneous components                               -->
    <!-- ******************************************************* -->

    <component name="logMath" type="edu.cmu.sphinx.util.LogMath">
        <property name="logBase" value="1.0001"/>
        <property name="useAddTable" value="true"/>
    </component>
</config>
2

There are 2 best solutions below

4
On BEST ANSWER

Configuration file is wrong. Proper config is in latest sphinx sources named default.config.xml. It's better to use high-level API without configuration files as described in the tutorial

http://cmusphinx.sourceforge.net/wiki/tutorialsphinx4

The code to receive the data is also wrong, you need to just pass socket stream with audio data to recognizer API and recognizer will retrieve packets by itself. Otherwise you have to implement a special sphinx4 data source to receive packets from the server.

To recognize raw audio from a socket stream:

StreamSpeechRecognizer recognizer = new StreamSpeechRecognizer(configuration);
recognizer.startRecognition(clientSocket.getInputStream()); // Here we tell recognizer to read data from socket stream
SpeechResult result = recognizer.getResult();
recognizer.stopRecognition();
0
On

I tried to use StreamSpeechRecognition and DataInputStream and also I got result==null. I really don't understand what is the problem.

server side

import edu.cmu.sphinx.frontend.util.StreamDataSource;
import edu.cmu.sphinx.tools.audio.AudioData;

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import java.io.*;
import java.net.DatagramPacket;
import java.net.DatagramSocket;
import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.SpeechResult;
import edu.cmu.sphinx.api.StreamSpeechRecognizer;
import edu.cmu.sphinx.frontend.util.AudioFileDataSource;
import edu.cmu.sphinx.frontend.util.Microphone;
import edu.cmu.sphinx.frontend.util.StreamDataSource;
import edu.cmu.sphinx.recognizer.Recognizer;
import edu.cmu.sphinx.result.Lattice;
import edu.cmu.sphinx.result.LatticeOptimizer;
import edu.cmu.sphinx.result.Result;
import edu.cmu.sphinx.tools.audio.AudioData;
import edu.cmu.sphinx.util.props.ConfigurationManager;

import javax.sound.sampled.AudioInputStream;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;


public class Server {

    public static void main(String[] args) throws IOException {
        int port = 8080;
        int sampleRate = 16000;
        boolean status = true;
        Recognition recognition =  new Recognition();

        System.out.println("Welcome to server side.");
        DatagramSocket serverSocket =  new DatagramSocket(port);

        byte[] receiveData = new byte[4096];

        while (status) {
            DatagramPacket receivePacket = new DatagramPacket(receiveData,receiveData.length);
            ByteArrayInputStream bais = new ByteArrayInputStream(receivePacket.getData());

            serverSocket.receive(receivePacket);
            recognition.Recognize(bais);
        }
    }
}



public class Recognition {

    Configuration configuration;

    public Recognition() {
        configuration = new Configuration();
        configuration.setAcousticModelPath("resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz");
        configuration.setDictionaryPath("resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/cmudict.0.6d");
        configuration.setLanguageModelPath("models/language/en-us.lm.dmp");
    }

    public void Recognize(ByteArrayInputStream bais) {

        try {
            StreamSpeechRecognizer streamSpeechRecognizer = new StreamSpeechRecognizer(configuration);
            streamSpeechRecognizer.startRecognition(bais);

            System.out.println("Say: (Good morning | Hello) ( Bhiksha | Evandro | Paul | Philip | Rita | Will )");

                SpeechResult result = streamSpeechRecognizer.getResult();

                if (result != null) {
                    String resultText = result.getHypothesis();
                    System.out.println("You said: " + resultText + '\n');
                }

        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }
}