程式師世界 >> 編程語言 >> .NET網頁編程 >> C# >> C#基礎知識 >> 關於C#在lucene.net下的中文切詞

關於C#在lucene.net下的中文切詞

編輯：C#基礎知識

　　經過一天的研究，終於完成了C#在lucene.net下可以使用的中文切詞方法。感到有些複雜，不過我還是拿下了。頗有點成就感的，發上來跟大家分享一下!

　　在實現了中文切詞的基礎方法上，我將其封裝在繼承lucene的Analyzer類下

　　ChineseAnalyzer的方法就不用多說了。

以下是引用片段：
　　using System;
　　using System.Collections.Generic;
　　using System.Text;
　　
　　using Lucene.Net.Analysis;
　　using Lucene.Net.Analysis.Standard;
　　
　　namespace Lucene.Fanswo
　　{
　　 /// <summary>
　　 /// Analyzer for mixed Chinese/English text. Input is split by
　　 /// <see cref="ChineseTokenizer"/> and the resulting tokens are passed
　　 /// through the standard, lower-case and stop-word filters, in that order.
　　 /// </summary>
　　 public class ChineseAnalyzer : Analyzer
　　 {
　　     /// <summary>
　　     /// Words dropped from the token stream: common English function
　　     /// words plus the two Chinese entries at the end of the list.
　　     /// </summary>
　　     public static readonly string[] CHINESE_ENGLISH_STOP_WORDS =
　　     {
　　         "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
　　         "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
　　         "s", "such", "t", "that", "the", "their", "then", "there",
　　         "these", "they", "this", "to", "was", "will", "with",
　　         "我", "我們"
　　     };
　　
　　     /// <summary>
　　     /// Builds the analysis chain for one field: a
　　     /// <see cref="ChineseTokenizer"/> wrapped in a StandardFilter, a
　　     /// LowerCaseFilter and a StopFilter that uses
　　     /// <see cref="CHINESE_ENGLISH_STOP_WORDS"/>.
　　     /// </summary>
　　     /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
　　     /// <param name="reader">Source of the text to tokenize.</param>
　　     /// <returns>The fully filtered token stream.</returns>
　　     public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
　　     {
　　         TokenStream segmented = new ChineseTokenizer(reader);
　　         TokenStream normalized = new LowerCaseFilter(new StandardFilter(segmented));
　　         return new StopFilter(normalized, CHINESE_ENGLISH_STOP_WORDS);
　　     }
　　 }
　　}　　

　　ChineseTokenizer類的實現：

　　這裡通過詞典來正向匹配字符，返回lucene下定義的token流

以下是引用片段：
　　using System;
　　using System.Collections.Generic;
　　using System.Text;
　　using Lucene.Net.Analysis;
　　using System.Collections;
　　using System.Text.RegularExpressions;
　　using System.IO;
　　
　　namespace Lucene.Fanswo
　　{
　　 /// <summary>
　　 /// Dictionary-driven tokenizer. Characters are matched forward against
　　 /// the nested hashtable exposed by <c>WordTree.chartable</c>; characters
　　 /// that are not in the dictionary are emitted as single-character tokens,
　　 /// or as a whole run when <c>WordTree.GetCharType</c> reports type 1 or 2
　　 /// (English letters and digits, according to the original comments).
　　 /// </summary>
　　 class ChineseTokenizer : Tokenizer
　　 {
　　     // Character-type codes returned by WordTree.GetCharType, as described
　　     // by the comments in the original switch statement.
　　     // NOTE(review): WordTree is not visible here - confirm these meanings.
　　     private const int EnglishCharType = 1;
　　     private const int DigitCharType = 2;
　　
　　     // Index in `text` at which the next call to Next() starts scanning.
　　     private int offset = 0;
　　
　　     // Number of characters in `text`.
　　     private int dataLen = 0;
　　
　　     /// <summary>
　　     /// Complete content of the reader, read once in the constructor.
　　     /// </summary>
　　     private string text;
　　
　　     // Dictionary helper, created and loaded on the first call to Next().
　　     private WordTree tree;
　　
　　     /// <summary>
　　     /// Time spent on segmentation. This class never writes to it; it is
　　     /// kept because it is part of the public surface.
　　     /// </summary>
　　     public double TextSeg_Span = 0;
　　
　　     /// <summary>
　　     /// Constructs a tokenizer for <paramref name="reader"/>. The reader is
　　     /// consumed to the end immediately and its content buffered in memory.
　　     /// </summary>
　　     /// <param name="reader">Source of the text to tokenize.</param>
　　     public ChineseTokenizer(System.IO.TextReader reader)
　　     {
　　         this.input = reader;
　　         text = input.ReadToEnd();
　　         dataLen = text.Length;
　　     }
　　
　　     /// <summary>
　　     /// Returns the next token, or null when the text is exhausted.
　　     /// </summary>
　　     /// <remarks>
　　     /// The token's start offset is the index of its first character and
　　     /// its end offset is one past its last character, following the
　　     /// Lucene <c>Token</c> convention. Whitespace before a token is
　　     /// skipped; whitespace inside a dictionary match is skipped as well
　　     /// (the match continues across it), so the token text can be shorter
　　     /// than the source span it covers.
　　     /// </remarks>
　　     /// <returns>The next token, or null at end of input.</returns>
　　     public override Token Next()
　　     {
　　         if (tree == null)
　　         {
　　             // Load the dictionary once per tokenizer instead of once per token.
　　             // NOTE(review): assumes LoadDict() does not need to be re-run
　　             // between tokens - confirm against WordTree.
　　             WordTree loaded = new WordTree();
　　             loaded.LoadDict();
　　             tree = loaded;
　　         }
　　
　　         // Root level of the dictionary; each matched character descends one level.
　　         Hashtable node = WordTree.chartable;
　　         StringBuilder word = new StringBuilder();
　　         int pos = offset;
　　         int tokenStart = -1; // index of the first character of the token
　　         int tokenEnd = -1;   // one past the last character of the token
　　
　　         while (pos < dataLen)
　　         {
　　             string current = text.Substring(pos, 1);
　　             if (string.IsNullOrEmpty(current.Trim()))
　　             {
　　                 // Whitespace never becomes part of a token.
　　                 pos++;
　　                 continue;
　　             }
　　
　　             if (tokenStart < 0)
　　             {
　　                 // Record the start only after leading whitespace has been
　　                 // skipped, so the offset points at the token itself.
　　                 tokenStart = pos;
　　             }
　　
　　             // A null child table is treated as "no longer match possible".
　　             if (node == null || !node.Contains(current))
　　             {
　　                 if (word.Length == 0)
　　                 {
　　                     // Not a dictionary character: emit it alone, or emit
　　                     // the whole run of letters / digits it starts.
　　                     int runEnd = pos + 1;
　　                     switch (tree.GetCharType(current))
　　                     {
　　                         case EnglishCharType:
　　                             runEnd = FindRunEnd(pos, EnglishCharType);
　　                             break;
　　                         case DigitCharType:
　　                             runEnd = FindRunEnd(pos, DigitCharType);
　　                             break;
　　                         default:
　　                             break;
　　                     }
　　
　　                     // Length is measured from the token start, not from
　　                     // the pre-whitespace scan position.
　　                     word.Append(text, pos, runEnd - pos);
　　                     tokenEnd = runEnd;
　　                     offset = runEnd;
　　                 }
　　                 else
　　                 {
　　                     // The dictionary match ended just before this character;
　　                     // the next call resumes here.
　　                     offset = pos;
　　                 }
　　
　　                 return new Token(word.ToString(), tokenStart, tokenEnd);
　　             }
　　
　　             // Character continues a dictionary match: consume it and descend.
　　             word.Append(current);
　　             node = (Hashtable)node[current];
　　             pos++;
　　             tokenEnd = pos;
　　         }
　　
　　         // End of text reached. Emit a pending match even if it was followed
　　         // only by whitespace.
　　         offset = dataLen;
　　         if (word.Length > 0)
　　         {
　　             return new Token(word.ToString(), tokenStart, tokenEnd);
　　         }
　　
　　         return null;
　　     }
　　
　　     // Returns the index one past the run of characters, starting at `from`,
　　     // whose WordTree character type equals `charType`.
　　     private int FindRunEnd(int from, int charType)
　　     {
　　         int end = from + 1;
　　         while (end < dataLen && tree.GetCharType(text.Substring(end, 1)) == charType)
　　         {
　　             end++;
　　         }
　　
　　         return end;
　　     }
　　 }
　　}