程式師世界 >> 編程語言 >> C語言 >> C++ >> C++入門知識 >> C++實現語音識別詞典內存存儲模型

C++實現語音識別詞典內存存儲模型

編輯：C++入門知識

對於給定的詞典，如下：

[html]
一 ii i1
一一 ii i1 ii i1
一一一 ii i1 ii i1 ii i1
一一一一 ii i4 ii i1 ii i4 ii i1
一一一七 ii i1 ii i1 ii i1 q i1
一一一三 ii i1 ii i1 ii i1 s an1
一一一九 ii i1 ii i1 ii i1 j iu3
一一一二 ii i1 ii i1 ii i1 ee er4
一一一五 ii i1 ii i1 ii i1 uu u3
一一一八 ii i1 ii i1 ii i1 b a1
一一一六 ii i1 ii i1 ii i1 l iu4
一一一四 ii i1 ii i1 ii i1 s iy4
一一一零 ii i1 ii i1 ii i1 l ing2
一一七 ii i1 ii i1 q i1
（此處略去很多個字）

語音識別在訓練和解碼時都要用到詞典，本文說明一個詞典處理時的內存存儲模型。

一，模型需求

模型包括如下信息：

1. 詞典共有多少個詞，這包括正常的詞和特殊的詞。
2. 句子或文法的起始和結束用詞。
3. 停頓詞，就是靜音詞，用於靜音建模。
4. 詞的發音個數。可能有些詞有多個發音，如“和”，就是一個常見的多音詞。
5. 建立語言模型時，是否依賴詞。

二，模型實現

[cpp]
class Vocabulary
{
public:
    int nWords ; // 總共多少個詞
    char **words ; // 所有詞的數組，下標為詞的序數，包括正常詞和特殊詞
    int nNormWords ; // 正常詞個數
    int *normWordInds ; // 正常詞的序數
    char specWordChar ; // 特殊詞的標識，就是標識一個特殊詞，如!一,表示一為特殊詞
    int nSpecWords ; // 特殊詞的個數
    int *specWordInds ; //所有特殊詞的序數數組
    int sentStartIndex ; // 句子或文法開始的詞序號
    int sentEndIndex ;// 句子或文法結束的詞序號
    int silIndex ;//靜音詞
    bool fromBinFile ;

    // 構造函數
    Vocabulary() ;
    Vocabulary(const char *lexFName , char specWordChar_='\0' , const char *sentStartWord=NULL , const char *sentEndWord=NULL , const char *silWord=NULL) ;
    virtual ~Vocabulary() ;

    char *getWord( int index ) ; // 根據給定的序號，獲取詞，這個從詞數組中獲取。
    int getNumPronuns( int index ) ; // 根據給定的序號，獲取該詞的發音個數，如果不是多音詞，就返回1。
    bool isSpecial( int index ) ; //序號對應的詞是否是特殊詞
    bool getIgnoreLM( int index ); // 標記是否用於語言模型建模，一般都是依賴的，這個提高識別率。
    int getIndex( const char *word , int guess=-1 ) ;//根據詞獲取序號，可以指定起始位置開始查找

private:
    int nWordsAlloc ; // nWords記錄詞典實際包含多少個詞，這個值記錄已為內存詞典分配的容量。
    bool *special ; // 指示詞典中的詞是否為特殊詞
    int *nPronuns ; // 每個詞對應多少種不同的發音

    /*** 添加一個詞至內存詞典中 ,並指示是否需要更新發音**/
    int addWord( const char *word , bool registerPronun=true ) ;
};

三，構造函數過程

1. 打開構造參數的詞典文件，參數名lexFName，FILE *fd。
2. 調用while( fgets(line,1000,fd)!=NULL )從fd中一行一行讀取，然後分割取第一個域，並調用成員函數addWord往詞典中加入詞。
3. 將開始和結束詞加入詞典中。
4. 根據specWordChar來處理特殊詞，就是判斷第一個字節是否為specWordChar。
5. 統計特殊詞和正常詞的個數，並存入相應的內存中（見上文的類定義）。

四，加詞實現

代碼如下：

[cpp]
int Vocabulary::addWord( const char *word , bool registerPronun )
{
    int cmpResult=0 , ind=-1 ;

    //分配足夠的空間存儲
    if ( nWords == nWordsAlloc ){
        nWordsAlloc += 100 ;
        words = (char **)realloc( words , nWordsAlloc*sizeof(char *) ) ;
        nPronuns = (int *)realloc( nPronuns , nWordsAlloc*sizeof(int) ) ;
        for ( int i=nWords ; i<nWordsAlloc ; i++ ){
            words[i] = NULL ;
            nPronuns[i] = 0 ;
        }
    }

    if ( (word == NULL) || (word[0] == '\0') )
        return -1 ;

    if ( nWords > 0 )
        cmpResult = strcasecmp( words[nWords-1] , word ) ;

    //確保新詞在適當的位置
    if ( (cmpResult < 0) || (nWords == 0) ){
        // The new word belongs at the end of the list
        words[nWords] = new char[strlen(word)+1] ;
        nPronuns[nWords] = 0 ;
        strcpy( words[nWords] , word ) ;
        ind = nWords ;
        nWords++ ;
    }else if ( cmpResult > 0 ){
        for ( int i=0 ; i<nWords ; i++ ){
            cmpResult = strcasecmp( words[i] , word ) ;
            if ( cmpResult > 0 ){
                nWords++ ;
                for ( int j=(nWords-1) ; j>i ; j-- ){
                    words[j] = words[j-1] ;
                    nPronuns[j] = nPronuns[j-1] ;
                }
                words[i] = new char[strlen(word)+1] ;
                strcpy( words[i] , word ) ;
                nPronuns[i] = 0 ;
                ind = i ;
                break ;
            }else if ( cmpResult == 0 ){
                //詞已經存在
                ind = i ;
                break ;
            }
        }
    }

    if ( ind < 0 )
        error("添加詞失敗 < 0") ;

    if ( registerPronun ){
        (nPronuns[ind])++ ; //注冊詞的發音
    }

    return ind ; //返回序號
}

註：按上面的代碼，當新詞與詞表最後一個詞相同時（cmpResult == 0，例如多音詞在詞典中連續出現的第二行），兩個分支都不會進入，ind 保持為 -1 並觸發 error；這種情況應補上 else 分支，令 ind = nWords-1。