萌新筆記——C++裡創立 Trie字典樹(中文詞典)(三)(聯想)。本站提示廣大學習愛好者:(萌新筆記——C++裡創立 Trie字典樹(中文詞典)(三)(聯想))文章只能為提供參考,不一定能成為您想要的結果。以下是萌新筆記——C++裡創立 Trie字典樹(中文詞典)(三)(聯想)正文
萌新做詞典第三篇,做得不好,還請指正,謝謝大佬!
明天把詞典的聯想做好了,也是比擬low的,還改了之前的查詢、遍歷等代碼。 Orz
一樣地先放上運轉後果:
1 test1 2 ID : 2 char : 件 word : 編程軟件 3 ID : 3 char : 習 word : 編程學習 4 ID : 4 char : 站 word : 編程學習網站 5 ID : 1 char : 門 word : 編程入門 6 7 test2 8 ID : 5 char : 練 word : 編程訓練 9 ID : 1 char : 門 word : 編程入門 10 ID : 3 char : 習 word : 編程學習 11 ID : 4 char : 站 word : 編程學習網站 12 ID : 2 char : 件 word : 編程軟件 13 find ID : 3 word : 編程學習 14 15 associate "編程" : 16 find! 17 訓練 18 入門 19 學習 20 學習網站 21 軟件
測試用的test.cc
1 #include "Dictionary.h" 2 #include <iostream> 3 #include <string> 4 #include <vector> 5 using std::cout; 6 using std::endl; 7 using std::string; 8 using std::vector; 9 10 int test1() 11 { 12 ccx::Dictionary words; 13 string word1 = "編程入門"; 14 string word2 = "編程軟件"; 15 string word3 = "編程學習"; 16 string word4 = "編程學習網站"; 17 18 words.push(word1); 19 words.push(word2); 20 words.push(word3); 21 words.push(word4); 22 23 words.resetIt(); 24 25 while(!words.isEnd()) 26 { 27 cout << "ID : " << words.getCurWordId() 28 << "\tchar : " << words.getCurChar() 29 << "\tword : " << words.getCurWord() << endl; 30 words.next(); 31 } 32 33 words.leading_out(); 34 return 0; 35 } 36 37 38 int test2() 39 { 40 ccx::Dictionary words; 41 words.leading_in(); 42 43 44 string word("編程訓練"); 45 words.push(word); 46 words.resetIt(); 47 48 while(!words.isEnd()) 49 { 50 cout << "ID : " << words.getCurWordId() 51 << "\tchar : " << words.getCurChar() 52 << "\tword : " << words.getCurWord() << endl; 53 words.next(); 54 } 55 string tmp = "編程學習"; 56 int id = words.search(tmp); 57 if(-1 == id) 58 { 59 cout << "no such word like \"" << tmp << "\"" << endl; 60 }else{ 61 cout << "find ID : " << id 62 << "\tword : " << tmp << endl; 63 } 64 65 cout << endl; 66 cout << "associate \"編程\" : " << endl; 67 68 vector<string> data; 69 string temp = "編程"; 70 71 if(words.associate(temp, data)) 72 { 73 cout << "find!" << endl; 74 for(auto & elem : data) 75 { 76 cout << elem << endl; 77 } 78 }else{ 79 cout << "can't find" << endl; 80 } 81 82 83 return 0; 84 } 85 86 int main() 87 { 88 cout << "test1" << endl; 89 test1(); 90 cout << endl; 91 cout << "test2" << endl; 92 test2(); 93 cout << endl; 94 }View Code
test1不變,test2 在導入後再拔出一個詞“編程訓練”,發現ID是正常的。
然後在test2最後調用聯想函數,傳入“編程”,可以正常傳出一切的字符串。
在做這個的時分,一開端想的很復雜,就是拿傳入的詞去樹中查找,找到最後一人字對應的節點,然後以那個節點為根停止遍歷。然後就開開心心腸去寫了,後果寫一局部就要對之前的代碼停止更改,於是,這個接口越來越“肥”了:
Dictionary.h
1 #ifndef __DICTIONARY_H__ 2 #define __DICTIONARY_H__ 3 4 #include "DictionaryData.h" 5 #include "DictionaryConf.h" 6 7 #include <memory> 8 #include <vector> 9 #include <list> 10 11 namespace ccx{ 12 13 using std::shared_ptr; 14 using std::vector; 15 using std::list; 16 17 class Dictionary 18 { 19 typedef unordered_map<string, pDictElem>::iterator WordIt; 20 public: 21 Dictionary(); 22 void push(const string & word); 23 void push(vector<string> & words); 24 int search(const string & word); 25 bool associate(const string & word, vector<string> & data); 26 private: 27 void AddWord(const string & word, int wordId); 28 void splitWord(const string & word, vector<string> & characters);//把詞拆成字 29 int search(vector<string> & data, pDictElem & pcur); 30 pDictElem _dictionary; 31 DictionaryConf _conf; 32 33 //遍歷 34 public: 35 string getCurChar(); 36 string getCurWord(); 37 int getCurWordId(); 38 bool isEnd(); 39 void resetIt(); 40 void next(); 41 private: 42 void resetPoint(pDictElem pcur); 43 void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict); 44 void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict); 45 string getCurWord(list<WordIt> & stackWord); 46 47 pDictElem _pcur; 48 WordIt _itcur; 49 50 //用list完成棧,遍歷時方便 51 list<WordIt> _stackWord; 52 list<pDictElem> _stackDict; 53 54 //導入導出 55 public: 56 void leading_in(); 57 void leading_out(); 58 }; 59 60 } 61 62 #endif
對幾個原有的函數停止了重載,次要是為了可以復用一些代碼,但是又想不到適宜的新的函數名(英語不太好Orz)。
首先,是要可以查找並前往新的根結點,於是對search停止修正:
1 int Dictionary::search(vector<string> & characters, pDictElem & root) 2 { 3 vector<string>::iterator it_char; 4 it_char = characters.begin(); 5 root = _dictionary; 6 int i = 0; 7 for(; it_char != characters.end(); ++it_char, ++i) 8 { 9 WordIt it_word; 10 it_word = root->_words.find(*it_char); 11 12 if(it_word == root->_words.end()) 13 { 14 break; 15 }else{ 16 root = it_word->second; 17 } 18 } 19 return i; 20 }
形參第一項是分解後的字集,第二項是一個智能指針,指向某個節點。這裡前往值改為了字集的第幾項,有兩個目的:
1、拔出函數中可以方便地知道下一個要拔出的是哪個字符
2、聯想函數中可以判別字集中的字能否都存在於詞典中
3、好吧,我沒想到其它好方法,而且事先是想到下面兩點就這麼做了,後來發現,拔出局部的代碼基本就不必改
然後是重載search:
1 int Dictionary::search(const string & word) 2 { 3 pDictElem root = _dictionary; 4 vector<string> temp; 5 splitWord(word, temp); 6 7 int ret = search(temp, root); 8 int size = temp.size(); 9 if(ret != size) 10 { 11 return -1; 12 } 13 return root->_wordId; 14 }
在這裡對字停止分解,並定義一個暫時的根結點,這樣做的目的是為了維護private中的根結點,並且可以在多線程環境中互不攪擾。
可以找到“新的根”後,就要對它停止遍歷了。假如只要單一線程或進程來運用它,這裡可以直接把resetPoint(原來的)修正一下,設置指定結點就可以了:
1 void Dictionary::resetPoint(pDictElem pcur) 2 { 3 _pcur = pcur; 4 if(_stackDict.size()) 5 { 6 _stackDict.clear(); 7 } 8 if(_stackWord.size()) 9 { 10 _stackWord.clear(); 11 } 12 next(); 13 }
假如是這樣,那後面也完全不必修正。由於這個詞典最後是要使用到miniSearchEngin中,於是我對遍歷局部的函數停止了修正:
1 void Dictionary::next() 2 { 3 next(_pcur, _stackWord, _stackDict); 4 } 5 6 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 7 { 8 while(pcur) 9 { 10 nextWord(pcur, stackWord, stackDict); 11 if(!pcur || pcur->_wordId) 12 { 13 break; 14 } 15 } 16 } 17 18 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 19 { 20 if(pcur) 21 { 22 if(pcur->_words.size()) 23 { 24 stackDict.push_back(pcur); 25 stackWord.push_back(pcur->_words.begin()); 26 pcur = stackWord.back()->second; 27 }else{ 28 ++(stackWord.back()); 29 } 30 while(stackWord.back() == stackDict.back()->_words.end()) 31 { 32 stackDict.pop_back(); 33 stackWord.pop_back(); 34 if(!stackDict.size()) 35 { 36 pcur = NULL; 37 } 38 ++(stackWord.back()); 39 } 40 if(pcur) 41 { 42 pcur = stackWord.back()->second; 43 } 44 } 45 }
next局部,改為傳入參數,這樣就可以在associate裡定義暫時的棧和智能指針等,遍歷的時分與其它任務並沒有任何關系。
異樣地,getWord也要做相反的更改:
1 string Dictionary::getCurWord() 2 { 3 return getCurWord(_stackWord); 4 } 5 6 string Dictionary::getCurWord(list<WordIt> & stackWord) 7 { 8 string temp; 9 list<WordIt>::iterator it_word; 10 it_word = stackWord.begin(); 11 12 for(; it_word != stackWord.end(); ++it_word) 13 { 14 temp += (*it_word)->first; 15 } 16 return temp; 17 }
當然了,對外提供的接口都是不要傳參的,其它的只能在外部運用,於是放入了private區。
終於可以開端寫聯想了0.0
1 bool Dictionary::associate(const string & word, vector<string> & data) 2 { 3 pDictElem root = _dictionary; 4 vector<string> temp; 5 splitWord(word, temp); 6 7 int ret = search(temp, root); 8 int size = temp.size(); 9 if(ret != size) 10 { 11 return false; 12 } 13 14 list<WordIt> stackWord; 15 list<pDictElem> stackDict; 16 next(root, stackWord, stackDict); 17 while(root) 18 { 19 string temp = getCurWord(stackWord); 20 data.push_back(temp); 21 next(root, stackWord, stackDict); 22 } 23 24 if(!data.size()) 25 { 26 return false; 27 } 28 return true; 29 }
前往bool類型,可以方便地判別能否聯想成功,即以傳入的詞做為前綴,能否找到剩余局部(詞典裡有存)。於是乎,一個渣渣型號的詞典就做好啦~~~
Dictionary.cc
1 #include "Dictionary.h" 2 #include <iostream> 3 #include <fstream> 4 #include <string> 5 #include <json/json.h> 6 7 namespace ccx{ 8 9 using std::endl; 10 using std::cout; 11 using std::pair; 12 using std::ofstream; 13 using std::ifstream; 14 15 Dictionary::Dictionary() 16 : _dictionary(new DictElem) 17 , _conf() 18 { 19 _dictionary->_wordId = 0; 20 _pcur = _dictionary; 21 } 22 23 void Dictionary::splitWord(const string & word, vector<string> & characters) 24 { 25 int num = word.size(); 26 int i = 0; 27 while(i < num) 28 { 29 int size = 1; 30 if(word[i] & 0x80) 31 { 32 char temp = word[i]; 33 temp <<= 1; 34 do{ 35 temp <<= 1; 36 ++size; 37 }while(temp & 0x80); 38 } 39 string subWord; 40 subWord = word.substr(i, size); 41 characters.push_back(subWord); 42 i += size; 43 } 44 } 45 46 void Dictionary::AddWord(const string & word, int wordId) 47 { 48 vector<string> characters; 49 splitWord(word, characters); 50 51 vector<string>::iterator it_char; 52 it_char = characters.begin(); 53 pDictElem root; 54 root = _dictionary; 55 for(; it_char != characters.end(); ++it_char) 56 { 57 WordIt it_word; 58 it_word = root->_words.find(*it_char); 59 60 if(it_word == root->_words.end()) 61 { 62 pair<string, pDictElem> temp; 63 temp.first = *it_char; 64 pDictElem dictemp(new DictElem); 65 dictemp->_word = *it_char; 66 dictemp->_wordId = 0; 67 temp.second = dictemp; 68 root->_words.insert(temp); 69 root = dictemp; 70 }else{ 71 root = it_word->second; 72 } 73 } 74 if(!root->_wordId) 75 { 76 root->_wordId = wordId; 77 } 78 } 79 80 void Dictionary::push(const string & word) 81 { 82 ++(_dictionary->_wordId); 83 AddWord(word, _dictionary->_wordId); 84 } 85 86 void Dictionary::push(vector<string> & words) 87 { 88 int size = words.size(); 89 for(int i = 0; i < size; ++i) 90 { 91 push(words[i]); 92 } 93 } 94 95 int Dictionary::search(const string & word) 96 { 97 pDictElem root = _dictionary; 98 vector<string> temp; 99 splitWord(word, temp); 100 101 int ret = search(temp, root); 102 int size = temp.size(); 103 if(ret != size) 104 { 105 return -1; 106 } 107 return root->_wordId; 108 } 109 110 int Dictionary::search(vector<string> & characters, pDictElem & root) 111 { 112 vector<string>::iterator it_char; 113 it_char = characters.begin(); 114 root = _dictionary; 115 int i = 0; 116 for(; it_char != characters.end(); ++it_char, ++i) 117 { 118 WordIt it_word; 119 it_word = root->_words.find(*it_char); 120 121 if(it_word == root->_words.end()) 122 { 123 break; 124 }else{ 125 root = it_word->second; 126 } 127 } 128 return i; 129 } 130 131 bool Dictionary::associate(const string & word, vector<string> & data) 132 { 133 pDictElem root = _dictionary; 134 vector<string> temp; 135 splitWord(word, temp); 136 137 int ret = search(temp, root); 138 int size = temp.size(); 139 if(ret != size) 140 { 141 return false; 142 } 143 144 list<WordIt> stackWord; 145 list<pDictElem> stackDict; 146 next(root, stackWord, stackDict); 147 while(root) 148 { 149 string temp = getCurWord(stackWord); 150 data.push_back(temp); 151 next(root, stackWord, stackDict); 152 } 153 154 if(!data.size()) 155 { 156 return false; 157 } 158 return true; 159 } 160 161 //遍歷用 162 163 void Dictionary::resetPoint(pDictElem pcur) 164 { 165 _pcur = pcur; 166 if(_stackDict.size()) 167 { 168 _stackDict.clear(); 169 } 170 if(_stackWord.size()) 171 { 172 _stackWord.clear(); 173 } 174 next(); 175 } 176 177 void Dictionary::resetIt() 178 { 179 resetPoint(_dictionary); 180 } 181 182 void Dictionary::next() 183 { 184 next(_pcur, _stackWord, _stackDict); 185 } 186 187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 188 { 189 while(pcur) 190 { 191 nextWord(pcur, stackWord, stackDict); 192 if(!pcur || pcur->_wordId) 193 { 194 break; 195 } 196 } 197 } 198 199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict) 200 { 201 if(pcur) 202 { 203 if(pcur->_words.size()) 204 { 205 stackDict.push_back(pcur); 206 stackWord.push_back(pcur->_words.begin()); 207 pcur = stackWord.back()->second; 208 }else{ 209 ++(stackWord.back()); 210 } 211 while(stackWord.back() == stackDict.back()->_words.end()) 212 { 213 stackDict.pop_back(); 214 stackWord.pop_back(); 215 if(!stackDict.size()) 216 { 217 pcur = NULL; 218 } 219 ++(stackWord.back()); 220 } 221 if(pcur) 222 { 223 pcur = stackWord.back()->second; 224 } 225 } 226 } 227 228 string Dictionary::getCurChar() 229 { 230 return _pcur->_word; 231 } 232 233 int Dictionary::getCurWordId() 234 { 235 return _pcur->_wordId; 236 } 237 238 string Dictionary::getCurWord() 239 { 240 return getCurWord(_stackWord); 241 } 242 243 string Dictionary::getCurWord(list<WordIt> & stackWord) 244 { 245 string temp; 246 list<WordIt>::iterator it_word; 247 it_word = stackWord.begin(); 248 249 for(; it_word != stackWord.end(); ++it_word) 250 { 251 temp += (*it_word)->first; 252 } 253 return temp; 254 } 255 256 bool Dictionary::isEnd() 257 { 258 return _pcur == NULL; 259 } 260 261 void Dictionary::leading_in()//導入,失敗沒必要加入順序 262 { 263 ifstream ifs; 264 const char * path = _conf.getDictionaryPath().c_str(); 265 ifs.open(path); 266 if(!ifs.good()) 267 { 268 cout << "open Dictionary.json error(leading_in)" << endl; 269 }else{ 270 Json::Value root; 271 Json::Reader reader; 272 273 if(!reader.parse(ifs, root, false)) 274 { 275 cout << "json read Dictionary.json error" << endl; 276 }else{ 277 int size = root.size(); 278 for(int i = 0; i < size; ++i) 279 { 280 string word = root[i]["Word"].asString(); 281 int wordId = root[i]["WordId"].asInt(); 282 AddWord(word, wordId); 283 ++(_dictionary->_wordId); 284 } 285 } 286 } 287 } 288 289 void Dictionary::leading_out() 290 { 291 Json::Value root; 292 Json::FastWriter writer; 293 294 resetIt(); 295 296 while(!isEnd()) 297 { 298 Json::Value elem; 299 elem["Word"] = getCurWord(); 300 elem["WordId"] = getCurWordId(); 301 root.append(elem); 302 next(); 303 } 304 305 string words; 306 words = writer.write(root); 307 308 ofstream ofs; 309 const char * path = _conf.getDictionaryPath().c_str(); 310 ofs.open(path); 311 if(!ofs.good()) 312 { 313 cout << "open Dictionary.json error(leading_out)" << endl; 314 ofs.open("Dictionary.tmp"); 315 if(!ofs.good()) 316 { 317 exit(EXIT_FAILURE); 318 } 319 } 320 321 ofs << words; 322 ofs.close(); 323 } 324 325 }View Code