程式師世界 >> 編程語言 >> .NET網頁編程 >> C# >> C#入門知識 >> C# 中文分詞[基於統計的樸素貝葉斯算法]

C# 中文分詞[基於統計的樸素貝葉斯算法]

編輯：C#入門知識

主要思想:

1. 要有一個語料庫

2. 統計每個詞出現的頻率, 一會來做樸素貝葉斯候選

3. 舉例: 中國人民共和國的

    其中語料庫中有中國, 人民, 中國人, 共和國等等的詞組.

現在輸入: 中國人都愛中華人民共和國;

分詞的時候取max( 各種分法得到的score );

例如: solution1：中國人_都愛中華人民_共和國

solution2：中國_人_都愛中華人民_共和國

solution3：中國_人_都愛_中華_人民_共和國

bestSegSolution = max( solutions(segSolution[i] ));

4.對於一句漢字的分詞可以看做

seg( StringIn ) = firPart + seg(StringIn – firPart); // 我用score來衡量當前分詞結果的好壞

5. 樸素貝葉斯的意思就是: 分詞後的, 兩個詞之間是相互獨立的, 也就是後者的出現與前者無關

6. 這個只是初級版, 很簡單, 需要再加點東西, 結果會更加的完美. 當然, 按照做事情的原則, 都是從簡單開始做的, 再努力

using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Windows.Forms;
using System.IO;
using System.Diagnostics;

namespace ChineseWordSeg
{
    class NaiveBayes
    {
        // Path of the training corpus. trainDate also uses this file's name (plus "_trained")
        // to build the name of the cached word-count file it looks for.
        private string wordLibPath = "../WordLib/pku_training.txt";// The training library used is the PKU corpus.

        // NOTE(review): presumably flags that training data has been loaded — no use is visible in this part of the file; confirm.
        bool trained = false;
        // Word -> count table. trainDate clears it and, when a cached file exists, fills it with
        // the first field of each line as the key and the second field as the value.
        private Dictionary<string, long> wordLib = new Dictionary<string, long>();
        // NOTE(review): presumably per-character counts, judging by the name — not used in the visible code; confirm.
        private Dictionary<string, long> singleWordLib = new Dictionary<string, long>();
        // NOTE(review): presumably the length of the longest known word — not used in the visible code; confirm.
        int maxLen = 0;
        // NOTE(review): presumably the best segmentation score found so far — not used in the visible code; confirm.
        long maxScore = 0;
        private string segPos = ""; // Records the split points of a single sentence, which is separated at punctuation and other non-Chinese characters.
        private string segSentence = ""; // Records the same for the whole paragraph.

        // 是不是中文字符

        /// <summary>
        /// Reports whether a character is a Chinese character, i.e. whether its code
        /// falls inside the range U+4E00..U+9FFF (both ends inclusive).
        /// </summary>
        /// <param name="chr">The character to classify.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="chr"/> is between 0x4E00 and 0x9FFF inclusive;
        /// otherwise <c>false</c>.
        /// </returns>
        bool isChineseWord(char chr )
        {
            return chr >= 0x4E00 && chr <= 0x9FFF;
        }

        public void trainDate( string path ) {
 // 統計每個詞出現的次數

//1. 統計每個詞組頻率, naiveBayes消歧. 將一個組合不同的方式取得較大概率的那個分組方式.
// 難道每個詞還是hash一下麼?
//2. 統計每個字的頻率, 就像向心力那樣... 看看到底哪兩個字比較容易聯系到一起這個是一句廢話,因為我沒這麼去做
            wordLib.Clear();

            DirectoryInfo dirInfo = new DirectoryInfo(path);
            DirectoryInfo tmpDir = dirInfo.Parent;
            string savePath = tmpDir.FullName;
            FileInfo fInfo = new FileInfo(wordLibPath);
            string fileNamePre = fInfo.Name;
            savePath += "\" + fileNamePre + "_trained";
            FileInfo infoOfDB = new FileInfo(savePath);

            if( File.Exists(savePath) && infoOfDB.Length > 0 ){

                StreamReader sr1 =
                                new StreamReader(@savePath);
                char[] sep = { };

                while (sr1.Peek()!=-1)
                {
                    string[] keyValue = sr1.ReadLine().Split(sep);

                    wordLib[keyValue[0]] = Convert.ToInt32(keyValue[1]);

                }

                    return;
            }