I'd been wanting to start a blog but didn't know what to write about (the idea of having my own blog only came to me recently). So I decided to write up something I once used in a project. Just my own humble take :).
When using Lucene, searching for a term like "c++" comes back empty. The cause has to be in how the index was built: indexing requires tokenization, and the reason you can't find "c++" is that no token for "c++" ever made it into the index.
So I got to work. Following the character-by-character idea from English tokenization, I make each symbol its own token, so "c++" is split into the three tokens "c", "+", "+", and those are what get added to the index.
using System;
using System.Globalization;
using System.IO;
using Lucene.Net.Analysis;

public sealed class ChineseTokenizer : Tokenizer
{
    public ChineseTokenizer(TextReader _in)
    {
        input = _in;
    }

    private int offset = 0;                    // current offset into the input
    private int bufferIndex = 0;               // position within the read buffer
    private int dataLen = 0;                   // number of characters in the read buffer
    private static int MAX_Word_LEN = 255;
    private static int IO_BUFFER_SIZE = 1024;
    private char[] buffer = new char[MAX_Word_LEN];
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];
    private int length;                        // length of the token being built
    private int start;                         // start offset of the token being built
    private void Push(char c)
    {
        if (length == 0) start = offset - 1;   // remember where the token starts
        buffer[length++] = Char.ToLower(c);    // append the lower-cased character
    }

    private Token Flush()
    {
        if (length > 0)
        {
            return new Token(new String(buffer, 0, length), start, start + length);
        }
        else
            return null;
    }
    public override Token Next()
    {
        length = 0;
        start = offset;
        while (true)
        {
            char c;
            offset++;
            if (bufferIndex >= dataLen)
            {
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }
            if (dataLen == 0) return Flush();
            else
                c = ioBuffer[bufferIndex++];

            // Exclude commas and semicolons (half- and full-width) from tokenization.
            if (c == ','
                || c == '，'
                || c == ';'
                || c == '；')
            {
                if (length > 0) return Flush();
                continue;
            }
            switch (Char.GetUnicodeCategory(c))
            {
                case UnicodeCategory.DecimalDigitNumber:   // decimal digit, 0-9
                case UnicodeCategory.LowercaseLetter:      // lowercase letter
                case UnicodeCategory.UppercaseLetter:      // uppercase letter
                    Push(c);                               // letters and digits accumulate into one token
                    if (length == MAX_Word_LEN) return Flush();
                    break;

                case UnicodeCategory.MathSymbol:           // math symbol, e.g. "+" or "="
                case UnicodeCategory.OpenPunctuation:      // opening of paired punctuation, e.g. ( [ {
                case UnicodeCategory.ClosePunctuation:     // closing of paired punctuation, e.g. ) ] }
                case UnicodeCategory.CurrencySymbol:       // currency symbol
                case UnicodeCategory.DashPunctuation:      // dash or hyphen
                case UnicodeCategory.ModifierSymbol:       // modifier symbol affecting surrounding characters, e.g. ^
                case UnicodeCategory.OtherPunctuation:     // other punctuation, e.g. %
                case UnicodeCategory.OtherLetter:          // "other" letters, which covers CJK characters
                    if (length > 0)
                    {
                        // A letter/digit run is pending: back up one character and emit it first.
                        bufferIndex--;
                        offset--;
                        return Flush();
                    }
                    // Symbols and CJK characters are emitted as single-character tokens.
                    Push(c);
                    return Flush();

                default:
                    if (length > 0)
                    {
                        return Flush();
                    }
                    break;
            }
        }
    }
}

This is the core of the tokenizer. You can decide for yourself which characters take part in tokenization; just plug it into your Analyzer and use it.
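To see what it actually produces, here is a small test driver of my own (not part of the original code); it assumes the older Lucene.Net API used above, where TokenStream.Next() returns a Token and Token exposes TermText():

using System;
using System.IO;
using Lucene.Net.Analysis;

public class TokenizerDemo
{
    public static void Main()
    {
        TokenStream ts = new ChineseTokenizer(new StringReader("我用c++和C#寫程序"));
        for (Token t = ts.Next(); t != null; t = ts.Next())
        {
            Console.Write(t.TermText() + " ");
        }
        // Expected output: 我 用 c + + 和 c # 寫 程 序
        // Consecutive letters/digits stay together (and are lower-cased);
        // every symbol and every CJK character becomes its own token.
    }
}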
While I am at it, here is the ChineseAnalyzer that floats around on the web; I modified it so that it works with the tokenizer above:
using System;
using System.Collections;
using System.IO;
using Lucene.Net.Analysis;

public sealed class ChineseFilter : TokenFilter
{
    // English stop words that should not end up in the index.
    public static String[] STOP_WORDS =
    {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    private Hashtable stopTable;

    public ChineseFilter(TokenStream _in) : base(_in)
    {
        stopTable = new Hashtable(STOP_WORDS.Length);
        for (int i = 0; i < STOP_WORDS.Length; i++)
            stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
    }
    public override Token Next()
    {
        // Pass through every token that is not in the stop word table.
        for (Token token = input.Next(); token != null; token = input.Next())
        {
            String text = token.TermText();
            if (stopTable[text] == null)
            {
                return token;
            }
        }
        return null;
    }
}
public class ChineseAnalyzer : Analyzer
{
    public ChineseAnalyzer()
    {
    }

    public sealed override TokenStream TokenStream(String fieldName, TextReader reader)
    {
        // Chain the tokenizer and the stop word filter together.
        TokenStream result = new ChineseTokenizer(reader);
        result = new ChineseFilter(result);
        return result;
    }
}
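To close the loop, here is a sketch of my own (not from the original post) of indexing a document that mentions "c++" and finding it again. It assumes the older Lucene.Net API that matches the code above (IndexWriter/IndexSearcher taking a path string, Hits, Field.Index.TOKENIZED); the index path and field name are made up for the example. Because "c++" is indexed as the tokens "c", "+", "+" at consecutive positions, a PhraseQuery over those three terms matches it:

using System;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;

public class CppSearchDemo
{
    public static void Main()
    {
        // Build a small index with the ChineseAnalyzer defined above
        // ("cpp-index" is just an example path).
        IndexWriter writer = new IndexWriter("cpp-index", new ChineseAnalyzer(), true);
        Document doc = new Document();
        doc.Add(new Field("content", "我喜歡用c++寫程序", Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
        writer.Close();

        // "c++" went into the index as "c" "+" "+", so search for exactly
        // that sequence with a phrase query.
        PhraseQuery query = new PhraseQuery();
        query.Add(new Term("content", "c"));
        query.Add(new Term("content", "+"));
        query.Add(new Term("content", "+"));

        IndexSearcher searcher = new IndexSearcher("cpp-index");
        Hits hits = searcher.Search(query);
        Console.WriteLine("hits: " + hits.Length());   // expect 1
        searcher.Close();
    }
}

If you go through QueryParser instead of building the query by hand, keep in mind that "+" is a special character in the query syntax, so the user's input has to be escaped (e.g. "c\+\+") before parsing.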