I've been looking at Lucene.net recently and found there isn't much material on Chinese word segmentation for it. I had been following 肖波's KTDictSeg for quite a while; its segmentation quality is good, but it has no Lucene interface, and his blog has not been updated for a long time. He mentioned there that the next release will add Lucene support, which I'm looking forward to. The blog also mentions a modified version by 一揮, but 一揮's site won't open and I don't know why. I have only been studying this for a short time, so after reading up on it I wrote the code below to call KTDictSeg from Lucene.net. I hope a better approach turns up.
The code is attached below.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using Lucene.Net;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegAnalyzer : Analyzer
    {
        public KTDictSegAnalyzer()
        {
        }

        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            return result;
        }
    }
}

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net;
using Lucene.Net.Analysis;
using KTDictSeg;

namespace Lucene.Net.Analysis.KTDictSeg
{
    public class KTDictSegTokenizer : Tokenizer
    {
        public static CSimpleDictSeg m_SimpleDictSeg;
        private ArrayList ioBuffer;
        private int offSet = 0;    // current offset
        private int position = -1; // index of the term in the buffer
        private int length = 0;    // length of the term
        private int start = 0;     // start offset of the term

        public KTDictSegTokenizer(System.IO.TextReader input) : base(input)
        {
            // A third-party Chinese segmentation component is used here.
            //ioBuffer = Sj110.Com.Chinese.Tokenizer.Tokenize(input.ReadToEnd());
            if (m_SimpleDictSeg == null)
            {
                try
                {
                    m_SimpleDictSeg = new CSimpleDictSeg();
                    m_SimpleDictSeg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                    m_SimpleDictSeg.LoadDict();
                }
                catch (Exception e1)
                {
                    m_SimpleDictSeg = null;
                    throw e1;
                }
            }

            m_SimpleDictSeg.FilterStopWords = true;
            m_SimpleDictSeg.MatchName = true;
            ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
        }

        // Simply put, a DotLucene tokenizer just implements Tokenizer's Next method and wraps
        // each segmented word in a Token, since Token is DotLucene's basic unit of analysis.
        public override Token Next()
        {
            position++;
            if (position < ioBuffer.Count)
            {
                length = ioBuffer[position].ToString().Length;
                start = offSet;
                offSet += length;
                return new Token(ioBuffer[position].ToString(), start, start + length);
            }
            return null;
        }
    }
}
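To see what the tokenizer produces, here is a minimal sketch of my own (not part of KTDictSeg) that runs a string through KTDictSegAnalyzer and prints each token with its offsets. It assumes the KTDictSeg dictionaries are in a Data folder under the current directory, as the tokenizer above expects; the field name and the sample sentence are just placeholders.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.KTDictSeg;

class TokenDemo
{
    static void Main()
    {
        Analyzer analyzer = new KTDictSegAnalyzer();
        // The field name and the sample sentence here are arbitrary placeholders.
        TokenStream stream = analyzer.TokenStream("prodname", new StringReader("把這句話分一下詞"));
        Token token;
        while ((token = stream.Next()) != null)
        {
            // TermText() is the segmented word; StartOffset()/EndOffset() are its positions in the input.
            Console.WriteLine("{0} [{1},{2})", token.TermText(), token.StartOffset(), token.EndOffset());
        }
    }
}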
The code above borrows from other people's work; I only reorganized it. Using this analyzer turned out to be about six times faster than Lucene.net's built-in StandardAnalyzer.
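That six-times figure comes from my own data, so treat it as a rough number. If you want to measure it yourself, a simple timing sketch along these lines should do; the LoadSampleTexts helper is hypothetical and just stands in for whatever corpus you benchmark with.

using System;
using System.Diagnostics;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.KTDictSeg;

class AnalyzerTiming
{
    // Run every sample text through the analyzer and drain the token stream.
    static long Time(Analyzer analyzer, string[] samples)
    {
        Stopwatch sw = Stopwatch.StartNew();
        foreach (string text in samples)
        {
            TokenStream stream = analyzer.TokenStream("prodname", new StringReader(text));
            while (stream.Next() != null) { }
        }
        sw.Stop();
        return sw.ElapsedMilliseconds;
    }

    static void Main()
    {
        string[] samples = LoadSampleTexts(); // hypothetical helper: supply your own test corpus
        Console.WriteLine("KTDictSegAnalyzer: {0} ms", Time(new KTDictSegAnalyzer(), samples));
        Console.WriteLine("StandardAnalyzer:  {0} ms", Time(new StandardAnalyzer(), samples));
    }

    static string[] LoadSampleTexts()
    {
        // Placeholder only; replace with real product names or descriptions.
        return new string[] { "產品名稱一", "產品名稱二" };
    }
}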
Below is the function that builds the index.
private void mackIndex()
{
    Analyzer analyzer = new KTDictSegAnalyzer();
    // the default Lucene.net analyzer, kept here for comparison:
    //Analyzer analyzer = new StandardAnalyzer();
    FSDirectory fsDir = FSDirectory.GetDirectory(Index_Store_Path, true);
    IndexWriter fswriter = new IndexWriter(fsDir, analyzer, true);
    ProductDao productDao = new ProductDao();
    // fetch the data source
    IList<Product> PList = productDao.GetProduct();
    IEnumerator<Product> _p = PList.GetEnumerator();
    // build a Document for each record in the data source
    while (_p.MoveNext())
    {
        Document Doc = new Document();
        Field prodname = new Field("prodname", _p.Current.Proname, Field.Store.YES, Field.Index.TOKENIZED);
        if (_p.Current.Proshuoming == null)
        {
            _p.Current.Proshuoming = "null";
        }
        Field profunction = new Field("profunction", _p.Current.Proshuoming, Field.Store.YES, Field.Index.UN_TOKENIZED);
        Doc.Add(prodname);
        Doc.Add(profunction);
        fswriter.AddDocument(Doc);
    }
    fswriter.Close();
}
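Searching then works the same way as with any other analyzer; the one thing to remember is that the QueryParser must use the same KTDictSegAnalyzer that built the index, so query terms are segmented the same way as the indexed text. A minimal sketch, assuming the index built by the function above sits in Index_Store_Path:

using System;
using Lucene.Net.Analysis.KTDictSeg;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;

class SearchDemo
{
    static void Search(string indexPath, string keywords)
    {
        IndexSearcher searcher = new IndexSearcher(indexPath);
        // Parse the query with the same analyzer that built the index.
        QueryParser parser = new QueryParser("prodname", new KTDictSegAnalyzer());
        Query query = parser.Parse(keywords);

        Hits hits = searcher.Search(query);
        for (int i = 0; i < hits.Length(); i++)
        {
            Document doc = hits.Doc(i);
            Console.WriteLine(doc.Get("prodname"));
        }
        searcher.Close();
    }
}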
That is my approach. If anyone has a better way, please share it. Also, if anyone has the free edition of the 雨痕 v3 segmenter, please send me a copy. Thanks in advance.