I'd been wanting to start a blog but didn't know what to write about (the idea of having my own blog only came to me recently). So I decided to write up something I once used in a project. Just my own humble take :).
When using Lucene, searching for a term like "c++" comes back empty. The cause has to be in how the index was built: indexing requires tokenization, and the reason you can't find "c++" is that no token for "c++" ever made it into the index.
So I got to work. Following the character-by-character idea from English tokenization, I make each symbol its own token, so "c++" is split into the three tokens "c", "+", "+", and those are what get added to the index.
using System;
using System.Globalization;
using System.IO;
using Lucene.Net.Analysis;

public sealed class ChineseTokenizer : Tokenizer
{
    public ChineseTokenizer(TextReader _in)
    {
        input = _in;
    }

    private int offset = 0;                    // current offset into the input
    private int bufferIndex = 0;               // position within the read buffer
    private int dataLen = 0;                   // number of characters in the read buffer
    private static int MAX_Word_LEN = 255;
    private static int IO_BUFFER_SIZE = 1024;
    private char[] buffer = new char[MAX_Word_LEN];
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];
    private int length;                        // length of the token being built
    private int start;                         // start offset of the token being built
    private void Push(char c)
    {
        if (length == 0) start = offset - 1;   // remember where the token starts
        buffer[length++] = Char.ToLower(c);    // append the lower-cased character
    }

    private Token Flush()
    {
        if (length > 0)
        {
            return new Token(new String(buffer, 0, length), start, start + length);
        }
        else
            return null;
    }
    public override Token Next()
    {
        length = 0;
        start = offset;
        while (true)
        {
            char c;
            offset++;
            if (bufferIndex >= dataLen)
            {
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }
            if (dataLen == 0) return Flush();
            else
                c = ioBuffer[bufferIndex++];

            // Exclude commas and semicolons (half- and full-width) from tokenization.
            if (c == ','
                || c == '，'
                || c == ';'
                || c == '；')
            {
                if (length > 0) return Flush();
                continue;
            }
            switch (Char.GetUnicodeCategory(c))
            {
                case UnicodeCategory.DecimalDigitNumber:   // decimal digit, 0-9
                case UnicodeCategory.LowercaseLetter:      // lowercase letter
                case UnicodeCategory.UppercaseLetter:      // uppercase letter
                    Push(c);                               // letters and digits accumulate into one token
                    if (length == MAX_Word_LEN) return Flush();
                    break;

                case UnicodeCategory.MathSymbol:           // math symbol, e.g. "+" or "="
                case UnicodeCategory.OpenPunctuation:      // opening of paired punctuation, e.g. ( [ {
                case UnicodeCategory.ClosePunctuation:     // closing of paired punctuation, e.g. ) ] }
                case UnicodeCategory.CurrencySymbol:       // currency symbol
                case UnicodeCategory.DashPunctuation:      // dash or hyphen
                case UnicodeCategory.ModifierSymbol:       // modifier symbol affecting surrounding characters, e.g. ^
                case UnicodeCategory.OtherPunctuation:     // other punctuation, e.g. %
                case UnicodeCategory.OtherLetter:          // "other" letters, which covers CJK characters
                    if (length > 0)
                    {
                        // A letter/digit run is pending: back up one character and emit it first.
                        bufferIndex--;
                        offset--;
                        return Flush();
                    }
                    // Symbols and CJK characters are emitted as single-character tokens.
                    Push(c);
                    return Flush();

                default:
                    if (length > 0)
                    {
                        return Flush();
                    }
                    break;
            }
        }
    }
}

This is the core of the tokenizer. You can decide for yourself which characters take part in tokenization; just plug it into your Analyzer and use it.
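To see what it actually produces, here is a small test driver of my own (not part of the original code); it assumes the older Lucene.Net API used above, where TokenStream.Next() returns a Token and Token exposes TermText():

using System;
using System.IO;
using Lucene.Net.Analysis;

public class TokenizerDemo
{
    public static void Main()
    {
        TokenStream ts = new ChineseTokenizer(new StringReader("我用c++和C#寫程序"));
        for (Token t = ts.Next(); t != null; t = ts.Next())
        {
            Console.Write(t.TermText() + " ");
        }
        // Expected output: 我 用 c + + 和 c # 寫 程 序
        // Consecutive letters/digits stay together (and are lower-cased);
        // every symbol and every CJK character becomes its own token.
    }
}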
While I am at it, here is the ChineseAnalyzer that floats around on the web; I modified it so that it works with the tokenizer above:
using System;
using System.Collections;
using System.IO;
using Lucene.Net.Analysis;

public sealed class ChineseFilter : TokenFilter
{
    // English stop words that should not end up in the index.
    public static String[] STOP_WORDS =
    {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    private Hashtable stopTable;

    public ChineseFilter(TokenStream _in) : base(_in)
    {
        stopTable = new Hashtable(STOP_WORDS.Length);
        for (int i = 0; i < STOP_WORDS.Length; i++)
            stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
    }
    public override Token Next()
    {
        // Pass through every token that is not in the stop word table.
        for (Token token = input.Next(); token != null; token = input.Next())
        {
            String text = token.TermText();
            if (stopTable[text] == null)
            {
                return token;
            }
        }
        return null;
    }
}
public class ChineseAnalyzer : Analyzer
{
    public ChineseAnalyzer()
    {
    }

    public sealed override TokenStream TokenStream(String fieldName, TextReader reader)
    {
        // Chain the tokenizer and the stop word filter together.
        TokenStream result = new ChineseTokenizer(reader);
        result = new ChineseFilter(result);
        return result;
    }
}
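To close the loop, here is a sketch of my own (not from the original post) of indexing a document that mentions "c++" and finding it again. It assumes the older Lucene.Net API that matches the code above (IndexWriter/IndexSearcher taking a path string, Hits, Field.Index.TOKENIZED); the index path and field name are made up for the example. Because "c++" is indexed as the tokens "c", "+", "+" at consecutive positions, a PhraseQuery over those three terms matches it:

using System;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;

public class CppSearchDemo
{
    public static void Main()
    {
        // Build a small index with the ChineseAnalyzer defined above
        // ("cpp-index" is just an example path).
        IndexWriter writer = new IndexWriter("cpp-index", new ChineseAnalyzer(), true);
        Document doc = new Document();
        doc.Add(new Field("content", "我喜歡用c++寫程序", Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
        writer.Close();

        // "c++" went into the index as "c" "+" "+", so search for exactly
        // that sequence with a phrase query.
        PhraseQuery query = new PhraseQuery();
        query.Add(new Term("content", "c"));
        query.Add(new Term("content", "+"));
        query.Add(new Term("content", "+"));

        IndexSearcher searcher = new IndexSearcher("cpp-index");
        Hits hits = searcher.Search(query);
        Console.WriteLine("hits: " + hits.Length());   // expect 1
        searcher.Close();
    }
}

If you go through QueryParser instead of building the query by hand, keep in mind that "+" is a special character in the query syntax, so the user's input has to be escaped (e.g. "c\+\+") before parsing.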