using System; using System.Collections.Generic; /* * 作者:熊仔其人 * 時間:2014年4月22日 */ namespace DataTool { /// <summary> /// 相似度 /// 熊仔其人 /// 2014年4月22日 /// </summary> public static class LevenshteinDistance { #region Levenshtein Distance算法(編輯距離算法) /// <summary> /// 三個數字中取最小的一個數字 /// </summary> /// <param name="first"></param> /// <param name="second"></param> /// <param name="third"></param> /// <returns></returns> private static int LowerOfThree(int first, int second, int third) { int min = first; if (second < min) min = second; if (third < min) min = third; return min; } /// <summary> /// 根據Levenshtein Distance算法(編輯距離算法)計算兩個字符串的相似度 /// </summary> /// <param name="text1"></param> /// <param name="text2"></param> /// <returns></returns> private static int Levenshtein_Distance(string text1, string text2) { int[,] Matrix; int n = text1.Length; int m = text2.Length; int temp = 0; char ch1; char ch2; int i = 0; int j = 0; if (n == 0) { return m; } if (m == 0) { return n; } Matrix = new int[n + 1, m + 1]; for (i = 0; i <= n; i++) { //初始化第一列 Matrix[i, 0] = i; } for (j = 0; j <= m; j++) { //初始化第一行 Matrix[0, j] = j; } for (i = 1; i <= n; i++) { ch1 = text1[i - 1]; for (j = 1; j <= m; j++) { ch2 = text2[j - 1]; if (ch1.Equals(ch2)) { temp = 0; } else { temp = 1; } Matrix[i, j] = LowerOfThree(Matrix[i - 1, j] + 1, Matrix[i, j - 1] + 1, Matrix[i - 1, j - 1] + temp); } } //for (i = 0; i <= n; i++) //{ // for (j = 0; j <= m; j++) // { // Console.Write(" {0} ", Matrix[i, j]); // } // Console.WriteLine(""); //} return Matrix[n, m]; } /// <summary> /// 根據Levenshtein Distance算法(編輯距離算法)計算兩個字符串的相似度(百分比) /// </summary> /// <param name="text1">第一個字符串</param> /// <param name="text2">第二個字符串</param> /// <returns>相似度(百分比)</returns> public static decimal LevenshteinDistancePercent(string text1, string text2) { if (string.IsNullOrEmpty(text1) && string.IsNullOrEmpty(text2)) return 1; else if (string.IsNullOrEmpty(text1) || string.IsNullOrEmpty(text2)) return 0; int maxLenth = text1.Length > text2.Length ? text1.Length : text2.Length; int val = Levenshtein_Distance(text1, text2); return 1 - (decimal)val / maxLenth; } #endregion #region 計算兩個字符串的相似度(百分比) /// <summary> /// 計算兩個字符串的相似度(百分比),比較每一個字符組成,返回結果相似度與字符順序有關,但是並不需要順序完全一致 /// </summary> /// <param name="text1">第一個字符串</param> /// <param name="text2">第二個字符串</param> /// <returns>相似度(百分比)</returns> public static decimal SimilarByStringPercent(string text1, string text2) { if (string.IsNullOrEmpty(text1) && string.IsNullOrEmpty(text2)) return 1; else if (string.IsNullOrEmpty(text1) || string.IsNullOrEmpty(text2)) return 0; decimal returnValue = 0; int maxLength; int i, l; List<string> tb1 = new List<string>(); List<string> tb2 = new List<string>(); i = 0; l = 1; maxLength = text1.Length; if (text1.Length < text2.Length) maxLength = text2.Length; while (l <= text1.Length) { while (i < text1.Length - 1) { if (i + l > text1.Length) break; tb1.Add(text1.Substring(i, l)); i++; } i = 0; l++; } i = 0; l = 1; while (l <= text2.Length) { while (i < text2.Length - 1) { if (i + l > text2.Length) break; tb2.Add(text2.Substring(i, l)); i++; } i = 0; l++; } foreach (string subStr in tb1) { decimal tempRe = 0; if (tb2.Contains(subStr)) { tempRe = (decimal)subStr.Length / maxLength; if (tempRe > returnValue) returnValue = tempRe; if (tempRe == 1) break; } } return returnValue; } #endregion } }