經過一天的研究,終於完成了C#在Lucene.Net下可以使用的中文切詞方法。雖然感覺有些複雜,不過我還是拿下了。頗有點成就感,發上來跟大家分享一下!
在實現了中文切詞的基礎方法上,我將其封裝在繼承lucene的Analyzer類下
ChineseAnalyzer的方法就不用多說了。
以下是引用片段:
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
namespace Lucene.Fanswo
{
/**////
///
///
public class ChineseAnalyzer:Analyzer
{
//private System.Collections.Hashtable stopSet;
public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new System.String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我們" };
/**//// Constructs a {@link StandardTokenizer} filtered by a {@link
/// StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}.
///
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
TokenStream result = new ChineseTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
return result;
}
}
}
ChineseTokenizer類的實現:
這裡通過詞典來正向匹配字符,返回lucene下定義的token流
以下是引用片段:
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;
namespace Lucene.Fanswo
{
class ChineseTokenizer : Tokenizer
{
private int offset = 0, bufferIndex = 0, dataLen = 0;//偏移量,當前字符的位置,字符長度
private int start;//開始位置
/**////
/// 存在字符內容
///
private string text;
/**////
/// 切詞所花費的時間
///
public double TextSeg_Span = 0;
/**//// Constructs a tokenizer for this Reader.
public ChineseTokenizer(System.IO.TextReader reader)
{
this.input = reader;
text = input.ReadToEnd();
dataLen = text.Length;
}
/**//// 進行切詞,返回數據流中下一個token或者數據流為空時返回null
///
///
public override Token Next()
{
Token token = null;
WordTree tree = new WordTree();
//讀取詞庫
tree.LoadDict();
//初始化詞庫,為樹形
Hashtable t_chartable = WordTree.chartable;
string ReWord = "";
string char_s;
start = offset;
bufferIndex = start;
while (true)
{
//開始位置超過字符長度退出循環
if (start >= dataLen)
{
break;
}
//獲取一個詞
char_s = text.Substring(start, 1);
if (string.IsNullOrEmpty(char_s.Trim()))
{
start++;
continue;
}
//字符不在字典中
if (!t_chartable.Contains(char_s))
{
if (ReWord == "")
{
int j = start + 1;
switch (tree.GetCharType(char_s))
{
case 0://中文單詞
ReWord += char_s;
break;
case 1://英文單詞
j = start + 1;
while (j < dataLen)
{
if (tree.GetCharType(text.Substring(j, 1)) != 1)
break;
j++;
}
ReWord += text.Substring(start, j - offset);
break;
case 2://數字
j = start + 1;
while (j < dataLen)
{
if (tree.GetCharType(text.Substring(j, 1)) != 2)
break;
j++;
}
ReWord += text.Substring(start, j - offset);
break;
default:
ReWord += char_s;//其他字符單詞
break;
}
offset = j;//設置取下一個詞的開始位置
}
else
{
offset = start;//設置取下一個詞的開始位置
}
//返回token對象
return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
}
//字符在字典中
ReWord += char_s;
//取得屬於當前字符的詞典樹
t_chartable = (Hashtable)t_chartable[char_s];
//設置下一循環取下一個詞的開始位置
start++;
if (start == dataLen)
{
offset = dataLen;
return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
}
}
return token;
}
}
}
測試的代碼: