pocketsphinx hotword detection not working

1k Views Asked by At

I'm trying to build a small piece of software that detects a hotword using the CMU Sphinx Speech Recognition Toolkit (pocketsphinx).

  1. I created a corpus file with 2 words, one of which is HELP.
  2. Using the model tool, I acquired the model (http://www.speech.cs.cmu.edu/tools/lmtool-new.html).
  3. I get too many hotword detections, even when no one says the hotword.

What am I missing?

Here is the code:

#include "stdafx.h"
#include <windows.h>
#include <sphinxbase/err.h>
#include <sphinxbase/ad.h>
#include <pocketsphinx.h>

using namespace System;

/* Names under which the two searches are registered with the decoder. */
#define HOTWORD_KEY "hotwordsearch"
#define LM_KEY "lmsearch"

/*
 * Options accepted in the configuration file: the standard pocketsphinx
 * options plus the audio-device option used by this program.
 */
static const arg_t args_def[] = {
    POCKETSPHINX_OPTIONS,
    /* Read by open_recording_device(). Without this entry the lookup is
     * reported as "Unknown argument: -adcdev". The default of NULL is
     * passed straight through to ad_open_dev(). */
    { "-adcdev", ARG_STRING, NULL, "Name of audio device to use for input." },
    CMDLN_EMPTY_OPTION
};

/* Keyphrase of the hotword search; filled in lazily by wait_for_hotword(). */
const char *keyphrase = NULL;

/**
 * Open the audio capture device and start recording on it.
 *
 * The sample rate is taken from the decoder's configuration ("-samprate");
 * the device name is taken from `config` ("-adcdev").
 *
 * @param ps      Initialized decoder, used to look up the sample rate.
 * @param config  Configuration object holding the "-adcdev" option.
 * @return        A recording device that has been started, or NULL if the
 *                device could not be opened or recording could not start.
 *                The caller is responsible for closing a non-NULL result.
 */
ad_rec_t* open_recording_device(ps_decoder_t *ps, cmd_ln_t *config)
{
    const int samprate =
        static_cast<int>(cmd_ln_float32_r(ps_get_config(ps), "-samprate"));

    ad_rec_t *ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), samprate);
    if (ad == NULL) {
        E_ERROR("Failed to open audio device\n");
        return NULL;
    }

    if (ad_start_rec(ad) < 0) {
        E_ERROR("Failed to start recording\n");
        /* The device was opened successfully; release it before bailing
         * out so the handle is not leaked. */
        ad_close(ad);
        return NULL;
    }

    return ad;
}

/**
 * Feed microphone audio to the decoder until an utterance yields a
 * hypothesis.
 *
 * An utterance is started, audio is read in chunks and passed to the
 * decoder, and each time the decoder's in-speech flag goes from speech back
 * to silence the utterance is ended and the hypothesis is queried. If there
 * is one, it is logged and returned; otherwise a new utterance is started
 * and listening continues.
 *
 * @param ps          Decoder to feed.
 * @param ad          Recording device to read from.
 * @param need_final  Not used by this function.
 * @return            The hypothesis string obtained from ps_get_hyp(), or
 *                    NULL if an utterance could not be started. A failed
 *                    audio read is reported through E_FATAL.
 */
char const *acquire_from_mic(ps_decoder_t *ps, ad_rec_t *ad, int need_final)
{
    int16 samples[4096];
    int32 hyp_score = 0;

    if (ps_start_utt(ps) < 0) {
        E_ERROR("Failed to start utterance\n");
        return NULL;
    }
    uint8 heard_speech = FALSE;
    E_INFO("Ready....\n");

    while (true) {
        int32 n_read = ad_read(ad, samples, 4096);
        if (n_read < 0)
            E_FATAL("Failed to read audio\n");
        ps_process_raw(ps, samples, n_read, FALSE, FALSE);

        uint8 speaking = ps_get_in_speech(ps);
        if (speaking) {
            if (!heard_speech) {
                /* silence -> speech transition */
                heard_speech = TRUE;
                E_INFO("Listening...\n");
            }
        }
        else if (heard_speech) {
            /* speech -> silence transition: close the utterance and see
             * whether it produced anything */
            ps_end_utt(ps);

            char const *result = ps_get_hyp(ps, &hyp_score);
            if (result != NULL) {
                E_INFO("---> score = %d\n", hyp_score);
                E_INFO("---> hyp   = %s \n", result);
                return result;
            }

            /* nothing recognized: begin a fresh utterance and keep going */
            if (ps_start_utt(ps) < 0) {
                E_ERROR("Failed to start utterance\n");
                return NULL;
            }
            heard_speech = FALSE;
            E_INFO("Ready again....\n");
        }

        Sleep(10);
    }
}

/**
 * Block until the hotword is recognized.
 *
 * Activates the hotword search, looks up its keyphrase once (cached in the
 * global `keyphrase`), and then keeps acquiring hypotheses from the
 * microphone until one is exactly equal to the keyphrase.
 *
 * @param ps  Decoder on which the HOTWORD_KEY search has been registered.
 * @param ad  Recording device to read from.
 * @return    1 once a hypothesis equal to the keyphrase is obtained;
 *            0 if the hotword search could not be activated or its
 *            keyphrase could not be retrieved.
 */
int wait_for_hotword(ps_decoder_t *ps, ad_rec_t *ad)
{
    if (ps_set_search(ps, HOTWORD_KEY) < 0) {
        E_ERROR("Couldn't set hotwordsearch\n");
        return 0;
    }

    if (keyphrase == NULL) {
        keyphrase = ps_get_kws(ps, HOTWORD_KEY);
        if (keyphrase == NULL) {
            /* Passing NULL to a "%s" format or to strcmp() is undefined
             * behaviour, so stop here instead. */
            E_ERROR("Couldn't get keyphrase for hotwordsearch\n");
            return 0;
        }
        E_INFO("keyphrase is:  %s \n", keyphrase);
    }

    /* NOTE(review): a hypothesis can contain the keyphrase several times
     * (e.g. "HELP HELP HELP"); the exact comparison below does not treat
     * that as a match - confirm this is the intended behaviour. */
    for (;;) {
        const char *hyp = acquire_from_mic(ps, ad, FALSE);
        if (hyp != NULL && strcmp(keyphrase, hyp) == 0) {
            return 1;
        }
    }
}


int main(int argc, char *argv[])
{
    ps_decoder_t *ps;
    cmd_ln_t *config;

    config = cmd_ln_parse_file_r(NULL, args_def, "pocketsphinx.conf", 1);
    if (config == NULL) {
        fprintf(stderr, "Failed to create config object, see log for details\n");
        return -1;
    }

    ps = ps_init(config);
    if (ps == NULL) {
        fprintf(stderr, "Failed to create recognizer, see log for details\n");
        return -1;
    }

    ps_set_lm_file(ps, LM_KEY, "0806.lm");
    ps_set_keyphrase(ps, HOTWORD_KEY,  "HELP");

    ad_rec_t* ad = open_recording_device(ps, config);
    if (ad == NULL) {
        fprintf(stderr, "Failed to open_recording_device\n");
        return -1;
    }

    while (true) {
        if (wait_for_hotword(ps, ad)==1)
        {
            fprintf(stderr, "\n\n****************\nGot hotword\n");
        }
    }

    ps_free(ps);
    cmd_ln_free_r(config);

    Console::WriteLine(L"Hello World");
    return 0;
}

The config file:

-dict 0806.dic
-kws_threshold 1e-40
-samprate 16000
-lm 0806.lm
-hmm model/en-us/en-us/

The output is below. Two problems:

  • Bad detection: the hotword is detected all the time, even though no one says the word "HELP".
  • I don't understand why I get ---> hyp = HELP HELP HELP HELP HELP

output:

INFO: pocketsphinx.c(152): Parsed model-specific feature parameters from model/en-us/en-us//feat.params
Current configuration:
[NAME]                  [DEFLT]         [VALUE]
-agc                    none            none
-agcthresh              2.0             2.000000e+000
-allphone
-allphone_ci            no              no
-alpha                  0.97            9.700000e-001
-ascale                 20.0            2.000000e+001
-aw                     1               1
-backtrace              no              no
-beam                   1e-48           1.000000e-048
-bestpath               yes             yes
-bestpathlw             9.5             9.500000e+000
-ceplen                 13              13
-cmn                    current         current
-cmninit                8.0             40,3,-1
-compallsen             no              no
-debug                                  0
-dict                                   0806.dic
-dictcase               no              no
-dither                 no              no
-doublebw               no              no
-ds                     1               1
-fdict
-feat                   1s_c_d_dd       1s_c_d_dd
-featparams
-fillprob               1e-8            1.000000e-008
-frate                  100             100
-fsg
-fsgusealtpron          yes             yes
-fsgusefiller           yes             yes
-fwdflat                yes             yes
-fwdflatbeam            1e-64           1.000000e-064
-fwdflatefwid           4               4
-fwdflatlw              8.5             8.500000e+000
-fwdflatsfwin           25              25
-fwdflatwbeam           7e-29           7.000000e-029
-fwdtree                yes             yes
-hmm                                    model/en-us/en-us/
-input_endian           little          little
-jsgf
-keyphrase
-kws
-kws_delay              10              10
-kws_plp                1e-1            1.000000e-001
-kws_threshold          1               1.000000e-040
-latsize                5000            5000
-lda
-ldadim                 0               0
-lifter                 0               22
-lm                                     0806.lm
-lmctl
-lmname
-logbase                1.0001          1.000100e+000
-logfn
-logspec                no              no
-lowerf                 133.33334       1.300000e+002
-lpbeam                 1e-40           1.000000e-040
-lponlybeam             7e-29           7.000000e-029
-lw                     6.5             6.500000e+000
-maxhmmpf               30000           30000
-maxwpf                 -1              -1
-mdef
-mean
-mfclogdir
-min_endfr              0               0
-mixw
-mixwfloor              0.0000001       1.000000e-007
-mllr
-mmap                   yes             yes
-ncep                   13              13
-nfft                   512             512
-nfilt                  40              25
-nwpen                  1.0             1.000000e+000
-pbeam                  1e-48           1.000000e-048
-pip                    1.0             1.000000e+000
-pl_beam                1e-10           1.000000e-010
-pl_pbeam               1e-10           1.000000e-010
-pl_pip                 1.0             1.000000e+000
-pl_weight              3.0             3.000000e+000
-pl_window              5               5
-rawlogdir
-remove_dc              no              no
-remove_noise           yes             yes
-remove_silence         yes             yes
-round_filters          yes             yes
-samprate               16000           1.600000e+004
-seed                   -1              -1
-sendump
-senlogdir
-senmgau
-silprob                0.005           5.000000e-003
-smoothspec             no              no
-svspec                                 0-12/13-25/26-38
-tmat
-tmatfloor              0.0001          1.000000e-004
-topn                   4               4
-topn_beam              0               0
-toprule
-transform              legacy          dct
-unit_area              yes             yes
-upperf                 6855.4976       6.800000e+003
-uw                     1.0             1.000000e+000
-vad_postspeech         50              50
-vad_prespeech          20              20
-vad_startspeech        10              10
-vad_threshold          2.0             3.000000e+000
-var
-varfloor               0.0001          1.000000e-004
-varnorm                no              no
-verbose                no              no
-warp_params
-warp_type              inverse_linear  inverse_linear
-wbeam                  7e-29           7.000000e-029
-wip                    0.65            6.500000e-001
-wlen                   0.025625        2.562500e-002

INFO: feat.c(715): Initializing feature stream to type: '1s_c_d_dd', ceplen=13, CMN='current', VARNORM='no', AGC='none'
INFO: cmn.c(143): mean[0]= 12.00, mean[1..12]= 0.0
INFO: acmod.c(164): Using subvector specification 0-12/13-25/26-38
INFO: mdef.c(518): Reading model definition: model/en-us/en-us//mdef
INFO: mdef.c(531): Found byte-order mark BMDF, assuming this is a binary mdef file
INFO: bin_mdef.c(336): Reading binary model definition: model/en-us/en-us//mdef
INFO: bin_mdef.c(516): 42 CI-phone, 137053 CD-phone, 3 emitstate/phone, 126 CI-sen, 5126 Sen, 29324 Sen-Seq
INFO: tmat.c(206): Reading HMM transition probability matrices: model/en-us/en-us//transition_matrices
INFO: acmod.c(117): Attempting to use PTM computation module
INFO: ms_gauden.c(198): Reading mixture gaussian parameter: model/en-us/en-us//means
INFO: ms_gauden.c(292): 42 codebook, 3 feature, size:
INFO: ms_gauden.c(294):  128x13
INFO: ms_gauden.c(294):  128x13
INFO: ms_gauden.c(294):  128x13
INFO: ms_gauden.c(198): Reading mixture gaussian parameter: model/en-us/en-us//variances
INFO: ms_gauden.c(292): 42 codebook, 3 feature, size:
INFO: ms_gauden.c(294):  128x13
INFO: ms_gauden.c(294):  128x13
INFO: ms_gauden.c(294):  128x13
INFO: ms_gauden.c(354): 222 variance values floored
INFO: ptm_mgau.c(476): Loading senones from dump file model/en-us/en-us//sendump
INFO: ptm_mgau.c(500): BEGIN FILE FORMAT DESCRIPTION
INFO: ptm_mgau.c(563): Rows: 128, Columns: 5126
INFO: ptm_mgau.c(595): Using memory-mapped I/O for senones
INFO: ptm_mgau.c(835): Maximum top-N: 4
INFO: phone_loop_search.c(114): State beam -225 Phone exit beam -225 Insertion penalty 0
INFO: dict.c(320): Allocating 4104 * 20 bytes (80 KiB) for word entries
INFO: dict.c(333): Reading main dictionary: 0806.dic
INFO: dict.c(213): Allocated 0 KiB for strings, 0 KiB for phones
INFO: dict.c(336): 3 words read
INFO: dict.c(358): Reading filler dictionary: model/en-us/en-us//noisedict
INFO: dict.c(213): Allocated 0 KiB for strings, 0 KiB for phones
INFO: dict.c(361): 5 words read
INFO: dict2pid.c(396): Building PID tables for dictionary
INFO: dict2pid.c(406): Allocating 42^3 * 2 bytes (144 KiB) for word-initial triphones
INFO: dict2pid.c(132): Allocated 21336 bytes (20 KiB) for word-final triphones
INFO: dict2pid.c(196): Allocated 21336 bytes (20 KiB) for single-phone word triphones
INFO: ngram_model_trie.c(347): Trying to read LM in trie binary format
INFO: ngram_model_trie.c(358): Header doesn't match
INFO: ngram_model_trie.c(176): Trying to read LM in arpa format
INFO: ngram_model_trie.c(192): LM of order 3
INFO: ngram_model_trie.c(194): #1-grams: 5
INFO: ngram_model_trie.c(194): #2-grams: 6
INFO: ngram_model_trie.c(194): #3-grams: 3
INFO: lm_trie.c(473): Training quantizer
INFO: lm_trie.c(481): Building LM trie
INFO: ngram_search_fwdtree.c(99): 3 unique initial diphones
INFO: ngram_search_fwdtree.c(148): 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(186): Creating search tree
INFO: ngram_search_fwdtree.c(192): before: 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(326): after: max nonroot chan increased to 139
INFO: ngram_search_fwdtree.c(339): after: 3 root, 11 non-root channels, 5 single-phone words
INFO: ngram_search_fwdflat.c(157): fwdflat: min_ef_width = 4, max_sf_win = 25
INFO: ngram_model_trie.c(347): Trying to read LM in trie binary format
INFO: ngram_model_trie.c(358): Header doesn't match
INFO: ngram_model_trie.c(176): Trying to read LM in arpa format
INFO: ngram_model_trie.c(192): LM of order 3
INFO: ngram_model_trie.c(194): #1-grams: 5
INFO: ngram_model_trie.c(194): #2-grams: 6
INFO: ngram_model_trie.c(194): #3-grams: 3
INFO: lm_trie.c(473): Training quantizer
INFO: lm_trie.c(481): Building LM trie
INFO: ngram_search_fwdtree.c(99): 3 unique initial diphones
INFO: ngram_search_fwdtree.c(148): 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(186): Creating search tree
INFO: ngram_search_fwdtree.c(192): before: 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(326): after: max nonroot chan increased to 139
INFO: ngram_search_fwdtree.c(339): after: 3 root, 11 non-root channels, 5 single-phone words
INFO: ngram_search_fwdflat.c(157): fwdflat: min_ef_width = 4, max_sf_win = 25
INFO: kws_search.c(420): KWS(beam: -1080, plp: -23, default threshold -900, delay 10)
ERROR: "cmd_ln.c", line 938: Unknown argument: -adcdev
Allocating 32 buffers of 2500 samples each
INFO: cppTest.cpp(103): keyphrase is:  HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 40.00  3.00 -1.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 >
INFO: cmn_prior.c(149): cmn_prior_update: to   < 38.01 18.02 -0.60 -3.94  3.44  2.67 -2.47 -0.60  2.43 -5.32 -2.22 -7.04  2.46 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 38.01 18.02 -0.60 -3.94  3.44  2.67 -2.47 -0.60  2.43 -5.32 -2.22 -7.04  2.46 >
INFO: cmn_prior.c(149): cmn_prior_update: to   < 39.34 14.99 -7.16 -2.53 -4.98 -8.98 -3.15  1.60  4.95 -9.76  2.29 -5.59  4.39 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 39.34 14.99 -7.16 -2.53 -4.98 -8.98 -3.15  1.60  4.95 -9.76  2.29 -5.59  4.39 >
INFO: cmn_prior.c(149): cmn_prior_update: to   < 39.29 13.66 -5.37 -1.40 -4.89 -9.12 -5.23  0.36  4.09 -10.30  3.82 -4.18  4.00 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(99): cmn_prior_update: from < 39.29 13.66 -5.37 -1.40 -4.89 -9.12 -5.23  0.36  4.09 -10.30  3.82 -4.18  4.00 >
INFO: cmn_prior.c(116): cmn_prior_update: to   < 39.24 13.36 -4.74 -0.99 -5.23 -9.40 -5.08 -0.40  3.95 -10.70  3.53 -4.19  3.83 >
INFO: cmn_prior.c(99): cmn_prior_update: from < 39.24 13.36 -4.74 -0.99 -5.23 -9.40 -5.08 -0.40  3.95 -10.70  3.53 -4.19  3.83 >
INFO: cmn_prior.c(116): cmn_prior_update: to   < 39.12 11.92 -5.07 -1.94 -6.63 -7.68 -3.63 -2.29  0.49 -11.79  3.25 -3.04  2.44 >
INFO: cmn_prior.c(131): cmn_prior_update: from < 39.12 11.92 -5.07 -1.94 -6.63 -7.68 -3.63 -2.29  0.49 -11.79  3.25 -3.04  2.44 >
INFO: cmn_prior.c(149): cmn_prior_update: to   < 38.03 11.47 -5.09 -2.07 -6.22 -6.86 -3.23 -2.30  0.27 -11.47  2.86 -3.17  2.37 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(99): cmn_prior_update: from < 38.03 11.47 -5.09 -2.07 -6.22 -6.86 -3.23 -2.30  0.27 -11.47  2.86 -3.17  2.37 >
INFO: cmn_prior.c(116): cmn_prior_update: to   < 37.17 10.53 -3.43  1.18 -7.71 -8.15 -2.56 -5.45 -0.81 -12.44  0.63 -2.53  2.86 >
INFO: cmn_prior.c(131): cmn_prior_update: from < 37.17 10.53 -3.43  1.18 -7.71 -8.15 -2.56 -5.45 -0.81 -12.44  0.63 -2.53  2.86 >
INFO: cmn_prior.c(149): cmn_prior_update: to   < 36.30 10.14 -2.66  2.23 -7.40 -7.74 -2.55 -5.77 -0.57 -11.16  0.96 -2.47  2.95 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 36.30 10.14 -2.66  2.23 -7.40 -7.74 -2.55 -5.77 -0.57 -11.16  0.96 -2.47  2.95 >
INFO: cmn_prior.c(149): cmn_prior_update: to   < 36.39 10.85 -2.80  2.29 -5.72 -6.97 -2.79 -3.21 -0.31 -10.98  0.31 -3.93  2.68 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp   = HELP


****************
Got hotword

INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
0

There are 0 best solutions below