程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> C語言 >> C++ >> C++入門知識 >> 萌新筆記——C++裡創建 Trie字典樹(中文詞典)(三)(聯想),trie字典

萌新筆記——C++裡創建 Trie字典樹(中文詞典)(三)(聯想),trie字典

編輯:C++入門知識

萌新筆記——C++裡創建 Trie字典樹(中文詞典)(三)(聯想),trie字典


  萌新做詞典第三篇,做得不好,還請指正,謝謝大佬!

  今天把詞典的聯想做好了,也是比較low的,還改了之前的查詢、遍歷等代碼。  Orz

  一樣地先放上運行結果:

 1 test1
 2 ID : 2    char : 件    word : 編程軟件
 3 ID : 3    char : 習    word : 編程學習
 4 ID : 4    char : 站    word : 編程學習網站
 5 ID : 1    char : 門    word : 編程入門
 6 
 7 test2
 8 ID : 5    char : 練    word : 編程訓練
 9 ID : 1    char : 門    word : 編程入門
10 ID : 3    char : 習    word : 編程學習
11 ID : 4    char : 站    word : 編程學習網站
12 ID : 2    char : 件    word : 編程軟件
13 find ID : 3    word : 編程學習
14 
15 associate "編程" : 
16 find!
17 訓練
18 入門
19 學習
20 學習網站
21 軟件

  測試用的test.cc

 1 #include "Dictionary.h"
 2 #include <iostream>
 3 #include <string>
 4 #include <vector>
 5 using std::cout;
 6 using std::endl;
 7 using std::string;
 8 using std::vector;
 9 
10 int test1()
11 {
12     ccx::Dictionary words;
13     string word1 = "編程入門";    
14     string word2 = "編程軟件";    
15     string word3 = "編程學習";    
16     string word4 = "編程學習網站";    
17     
18     words.push(word1);    
19     words.push(word2);    
20     words.push(word3);    
21     words.push(word4);    
22 
23     words.resetIt();
24     
25     while(!words.isEnd())
26     {
27         cout << "ID : " << words.getCurWordId() 
28             << "\tchar : " << words.getCurChar() 
29             << "\tword : " << words.getCurWord() << endl;
30         words.next();
31     }
32     
33     words.leading_out();
34     return 0;
35 }
36 
37 
38 int test2()
39 {
40     ccx::Dictionary words;
41     words.leading_in();
42 
43 
44     string word("編程訓練");
45     words.push(word);
46     words.resetIt();
47 
48     while(!words.isEnd())
49     {
50         cout << "ID : " << words.getCurWordId() 
51             << "\tchar : " << words.getCurChar() 
52             << "\tword : " << words.getCurWord() << endl;
53         words.next();
54     }
55     string tmp = "編程學習";    
56     int id = words.search(tmp);
57     if(-1 == id)
58     {
59         cout << "no such word like \"" << tmp << "\"" << endl;
60     }else{
61         cout << "find ID : " << id
62             << "\tword : " << tmp << endl;
63     }
64 
65     cout << endl;
66     cout << "associate \"編程\" : " << endl;
67 
68     vector<string> data;    
69     string temp = "編程";
70     
71     if(words.associate(temp, data))
72     {
73         cout << "find!" << endl;
74         for(auto & elem : data)
75         {
76             cout << elem << endl;
77         }
78     }else{
79         cout << "can't find" << endl;
80     }
81 
82 
83     return 0;
84 }
85 
86 int main()
87 {
88     cout << "test1" << endl;
89     test1();
90     cout << endl;
91     cout << "test2" << endl;
92     test2();
93     cout << endl;
94 }
View Code

 

  test1不變,test2 在導入後再插入一個詞“編程訓練”,發現ID是正常的。

  然後在test2最後調用聯想函數,傳入“編程”,能夠正常傳出所有的字符串。

 

  在做這個的時候,一開始想的很簡單,就是拿傳入的詞去樹中查找,找到最後一人字對應的節點,然後以那個節點為根進行遍歷。然後就開開心心地去寫了,結果寫一部分就要對之前的代碼進行更改,於是,這個接口越來越“肥”了:

 

Dictionary.h

 1 #ifndef __DICTIONARY_H__
 2 #define __DICTIONARY_H__
 3 
 4 #include "DictionaryData.h"
 5 #include "DictionaryConf.h"
 6 
 7 #include <memory>
 8 #include <vector>
 9 #include <list>
10 
11 namespace ccx{
12 
13 using std::shared_ptr;
14 using std::vector;
15 using std::list;
16 
17 class Dictionary
18 {
19     typedef unordered_map<string, pDictElem>::iterator WordIt;
20     public:
21         Dictionary();
22         void push(const string & word);
23         void push(vector<string> & words);
24         int search(const string & word);
25         bool associate(const string & word, vector<string> & data);
26     private:
27         void AddWord(const string & word, int wordId);
28         void splitWord(const string & word, vector<string> & characters);//把詞拆成字
29         int search(vector<string> & data, pDictElem & pcur);
30         pDictElem _dictionary;
31         DictionaryConf _conf;    
32 
33 //遍歷
34     public:
35         string getCurChar();
36         string getCurWord();
37         int getCurWordId();
38         bool isEnd();
39         void resetIt();
40         void next();
41     private:
42         void resetPoint(pDictElem pcur);
43         void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
44         void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
45         string getCurWord(list<WordIt> & stackWord);
46         
47         pDictElem _pcur;
48         WordIt _itcur;
49         
50 //用list實現棧,遍歷時方便
51         list<WordIt> _stackWord;
52         list<pDictElem> _stackDict;
53 
54 //導入導出
55     public:
56         void leading_in();
57         void leading_out();
58 };
59 
60 }
61 
62 #endif

 

  對幾個原有的函數進行了重載,主要是為了能夠復用一些代碼,但是又想不到合適的新的函數名(英語不太好Orz)。

 

  首先,是要能夠查找並返回新的根結點,於是對search進行修改:

 1 int Dictionary::search(vector<string> & characters, pDictElem & root)
 2 {
 3     vector<string>::iterator it_char;
 4     it_char = characters.begin();    
 5     root = _dictionary;
 6     int i = 0;
 7     for(; it_char != characters.end(); ++it_char, ++i)
 8     {
 9         WordIt it_word;
10         it_word = root->_words.find(*it_char);
11         
12         if(it_word == root->_words.end())
13         {
14             break;
15         }else{
16             root = it_word->second;
17         }
18     }
19     return i;
20 }

  形參第一項是分解後的字集,第二項是一個智能指針,指向某個節點。這裡返回值改為了字集的第幾項,有兩個目的:

  1、插入函數中可以方便地知道下一個要插入的是哪個字符

  2、聯想函數中可以判斷字集中的字是否都存在於詞典中

  3、好吧,我沒想到其它好辦法,而且當時是想到上面兩點就這麼做了,後來發現,插入部分的代碼根本就不用改

 

  然後是重載search:

 1 int Dictionary::search(const string & word)
 2 {
 3     pDictElem root = _dictionary;
 4     vector<string> temp;
 5     splitWord(word, temp);
 6     
 7     int ret = search(temp, root);
 8     int size = temp.size();
 9     if(ret != size)
10     {
11         return -1;
12     }
13     return root->_wordId;
14 }

  在這裡對字進行分解,並定義一個臨時的根結點,這樣做的目的是為了保護private中的根結點,並且可以在多線程環境中互不干擾。

 

  能夠找到“新的根”後,就要對它進行遍歷了。如果只有單一線程或進程來使用它,這裡可以直接把resetPoint(原來的)修改一下,設置指定結點就可以了:

 1 void Dictionary::resetPoint(pDictElem pcur)
 2 {
 3     _pcur = pcur;
 4     if(_stackDict.size())
 5     {
 6         _stackDict.clear();
 7     }
 8     if(_stackWord.size())
 9     {
10         _stackWord.clear();
11     }
12     next();
13 }

 

  如果是這樣,那前面也完全不用修改。由於這個詞典最後是要應用到miniSearchEngin中,於是我對遍歷部分的函數進行了修改:

 1 void Dictionary::next()
 2 {
 3     next(_pcur, _stackWord, _stackDict);
 4 }
 5 
 6 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 7 {
 8     while(pcur)
 9     {
10         nextWord(pcur, stackWord, stackDict);
11         if(!pcur || pcur->_wordId)
12         {
13             break;
14         }
15     }
16 }
17 
18 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
19 {
20     if(pcur)
21     {
22         if(pcur->_words.size())
23         {
24             stackDict.push_back(pcur);
25             stackWord.push_back(pcur->_words.begin());
26             pcur = stackWord.back()->second;
27         }else{
28             ++(stackWord.back());
29         }
30         while(stackWord.back() == stackDict.back()->_words.end())
31         {
32             stackDict.pop_back();
33             stackWord.pop_back();
34             if(!stackDict.size())
35             {
36                 pcur = NULL;
37             }
38             ++(stackWord.back());
39         }
40         if(pcur)
41         {
42             pcur = stackWord.back()->second;
43         }    
44     }
45 }

  next部分,改為傳入參數,這樣就可以在associate裡定義臨時的棧和智能指針等,遍歷的時候與其它工作並沒有任何關系。

 

  同樣地,getWord也要做相同的更改:

 1 string Dictionary::getCurWord()
 2 {
 3     return getCurWord(_stackWord);
 4 }
 5 
 6 string Dictionary::getCurWord(list<WordIt> & stackWord)
 7 {
 8     string temp;
 9     list<WordIt>::iterator it_word;    
10     it_word = stackWord.begin();    
11 
12     for(; it_word != stackWord.end(); ++it_word)
13     {
14         temp += (*it_word)->first;
15     }
16     return temp;
17 }

 

  當然了,對外提供的接口都是不要傳參的,其它的只能在內部使用,於是放入了private區。

 

  終於可以開始寫聯想了0.0

 1 bool Dictionary::associate(const string & word, vector<string> & data)
 2 {
 3     pDictElem root = _dictionary;
 4     vector<string> temp;
 5     splitWord(word, temp);
 6     
 7     int ret = search(temp, root);
 8     int size = temp.size();
 9     if(ret != size)
10     {
11         return false;
12     }
13     
14     list<WordIt> stackWord;
15     list<pDictElem> stackDict;
16     next(root, stackWord, stackDict);
17     while(root)
18     {
19         string temp = getCurWord(stackWord);
20         data.push_back(temp);    
21         next(root, stackWord, stackDict);
22     }
23     
24     if(!data.size())
25     {
26         return false;
27     }
28     return true;
29 }

  返回bool類型,可以方便地判斷是否聯想成功,即以傳入的詞做為前綴,能否找到剩余部分(詞典裡有存)。於是乎,一個渣渣型號的詞典就做好啦~~~

 

Dictionary.cc

  1 #include "Dictionary.h"
  2 #include <iostream>
  3 #include <fstream>
  4 #include <string>
  5 #include <json/json.h>
  6 
  7 namespace ccx{
  8 
  9 using std::endl;
 10 using std::cout;
 11 using std::pair;
 12 using std::ofstream;
 13 using std::ifstream;
 14 
 15 Dictionary::Dictionary()
 16 : _dictionary(new DictElem)
 17 , _conf()
 18 {
 19     _dictionary->_wordId = 0;
 20     _pcur = _dictionary;
 21 }
 22 
 23 void Dictionary::splitWord(const string & word, vector<string> & characters)
 24 {
 25     int num = word.size();
 26     int i = 0;
 27     while(i < num)
 28     {
 29         int size = 1;
 30         if(word[i] & 0x80)
 31         {
 32             char temp = word[i];
 33             temp <<= 1;
 34             do{
 35                 temp <<= 1;
 36                 ++size;
 37             }while(temp & 0x80);
 38         }
 39         string subWord;
 40         subWord = word.substr(i, size);
 41         characters.push_back(subWord);
 42         i += size;
 43     }
 44 }
 45 
 46 void Dictionary::AddWord(const string & word, int wordId)
 47 {
 48     vector<string> characters;
 49     splitWord(word, characters);
 50     
 51     vector<string>::iterator it_char;
 52     it_char = characters.begin();    
 53     pDictElem root;
 54     root = _dictionary;
 55     for(; it_char != characters.end(); ++it_char)
 56     {
 57         WordIt it_word;
 58         it_word = root->_words.find(*it_char);
 59         
 60         if(it_word == root->_words.end())
 61         {
 62             pair<string, pDictElem> temp;
 63             temp.first = *it_char;
 64             pDictElem dictemp(new DictElem);
 65             dictemp->_word = *it_char;
 66             dictemp->_wordId = 0;
 67             temp.second = dictemp;
 68             root->_words.insert(temp);
 69             root = dictemp;
 70         }else{
 71             root = it_word->second;
 72         }
 73     }
 74     if(!root->_wordId)
 75     {
 76         root->_wordId = wordId;
 77     }
 78 }
 79 
 80 void Dictionary::push(const string & word)
 81 {
 82     ++(_dictionary->_wordId);
 83     AddWord(word, _dictionary->_wordId);
 84 }
 85 
 86 void Dictionary::push(vector<string> & words)
 87 {
 88     int size = words.size();
 89     for(int i = 0; i < size; ++i)
 90     {
 91         push(words[i]);
 92     }
 93 }
 94 
 95 int Dictionary::search(const string & word)
 96 {
 97     pDictElem root = _dictionary;
 98     vector<string> temp;
 99     splitWord(word, temp);
100     
101     int ret = search(temp, root);
102     int size = temp.size();
103     if(ret != size)
104     {
105         return -1;
106     }
107     return root->_wordId;
108 }
109 
110 int Dictionary::search(vector<string> & characters, pDictElem & root)
111 {
112     vector<string>::iterator it_char;
113     it_char = characters.begin();    
114     root = _dictionary;
115     int i = 0;
116     for(; it_char != characters.end(); ++it_char, ++i)
117     {
118         WordIt it_word;
119         it_word = root->_words.find(*it_char);
120         
121         if(it_word == root->_words.end())
122         {
123             break;
124         }else{
125             root = it_word->second;
126         }
127     }
128     return i;
129 }
130 
131 bool Dictionary::associate(const string & word, vector<string> & data)
132 {
133     pDictElem root = _dictionary;
134     vector<string> temp;
135     splitWord(word, temp);
136     
137     int ret = search(temp, root);
138     int size = temp.size();
139     if(ret != size)
140     {
141         return false;
142     }
143     
144     list<WordIt> stackWord;
145     list<pDictElem> stackDict;
146     next(root, stackWord, stackDict);
147     while(root)
148     {
149         string temp = getCurWord(stackWord);
150         data.push_back(temp);    
151         next(root, stackWord, stackDict);
152     }
153     
154     if(!data.size())
155     {
156         return false;
157     }
158     return true;
159 }
160 
161 //遍歷用
162 
163 void Dictionary::resetPoint(pDictElem pcur)
164 {
165     _pcur = pcur;
166     if(_stackDict.size())
167     {
168         _stackDict.clear();
169     }
170     if(_stackWord.size())
171     {
172         _stackWord.clear();
173     }
174     next();
175 }
176 
177 void Dictionary::resetIt()
178 {
179     resetPoint(_dictionary);
180 }
181 
182 void Dictionary::next()
183 {
184     next(_pcur, _stackWord, _stackDict);
185 }
186 
187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
188 {
189     while(pcur)
190     {
191         nextWord(pcur, stackWord, stackDict);
192         if(!pcur || pcur->_wordId)
193         {
194             break;
195         }
196     }
197 }
198 
199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
200 {
201     if(pcur)
202     {
203         if(pcur->_words.size())
204         {
205             stackDict.push_back(pcur);
206             stackWord.push_back(pcur->_words.begin());
207             pcur = stackWord.back()->second;
208         }else{
209             ++(stackWord.back());
210         }
211         while(stackWord.back() == stackDict.back()->_words.end())
212         {
213             stackDict.pop_back();
214             stackWord.pop_back();
215             if(!stackDict.size())
216             {
217                 pcur = NULL;
218             }
219             ++(stackWord.back());
220         }
221         if(pcur)
222         {
223             pcur = stackWord.back()->second;
224         }    
225     }
226 }
227 
228 string Dictionary::getCurChar()
229 {
230     return _pcur->_word;
231 }
232 
233 int Dictionary::getCurWordId()
234 {
235     return _pcur->_wordId;
236 }
237 
238 string Dictionary::getCurWord()
239 {
240     return getCurWord(_stackWord);
241 }
242 
243 string Dictionary::getCurWord(list<WordIt> & stackWord)
244 {
245     string temp;
246     list<WordIt>::iterator it_word;    
247     it_word = stackWord.begin();    
248 
249     for(; it_word != stackWord.end(); ++it_word)
250     {
251         temp += (*it_word)->first;
252     }
253     return temp;
254 }
255 
256 bool Dictionary::isEnd()
257 {
258     return _pcur == NULL;
259 }
260 
261 void Dictionary::leading_in()//導入,失敗沒必要退出程序
262 {
263     ifstream ifs;
264     const char * path = _conf.getDictionaryPath().c_str();
265     ifs.open(path);
266     if(!ifs.good())
267     {
268         cout << "open Dictionary.json error(leading_in)" << endl;
269     }else{
270         Json::Value root;
271         Json::Reader reader;
272         
273         if(!reader.parse(ifs, root, false))
274         {
275             cout << "json read Dictionary.json error" << endl;
276         }else{
277             int size = root.size();
278             for(int i = 0; i < size; ++i)
279             {
280                 string word = root[i]["Word"].asString();
281                 int wordId = root[i]["WordId"].asInt();
282                 AddWord(word, wordId);
283                 ++(_dictionary->_wordId);
284             }
285         }
286     }
287 }
288 
289 void Dictionary::leading_out()
290 {
291     Json::Value root;
292     Json::FastWriter writer;
293 
294     resetIt();
295     
296     while(!isEnd())
297     {
298         Json::Value elem;
299         elem["Word"] = getCurWord();
300         elem["WordId"] = getCurWordId();
301         root.append(elem);
302         next();
303     }
304 
305     string words;
306     words = writer.write(root);
307     
308     ofstream ofs;
309     const char * path = _conf.getDictionaryPath().c_str();
310     ofs.open(path);
311     if(!ofs.good())
312     {
313         cout << "open Dictionary.json error(leading_out)" << endl;
314         ofs.open("Dictionary.tmp");
315         if(!ofs.good())
316         {
317             exit(EXIT_FAILURE);
318         }
319     }
320     
321     ofs << words;
322     ofs.close();
323 }
324 
325 }
View Code

 

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved