程式師世界 >> 編程語言 >> .NET網頁編程 >> C# >> C#入門知識 >> Individual Project，individualproject

Individual Project，individualproject

編輯：C#入門知識

Individual Project，individualproject

作業說明詳見：http://www.cnblogs.com/jiel/p/3978727.html

一.預先準備和時間規劃

1.安裝Microsoft Visual Studio Ultimate 2012，之前安裝過一次，預計2小時左右，但是安裝過程中可以繼續進行其它任務。

2.閱讀題目要求，理解要實現的功能，預計20min左右。

3.根據題目，設計程序框架，預計10min左右。

4.閱讀相關文檔，學習所需要的類的命名空間，屬性和方法，預計1小時左右。

5.初步完成程序編寫，預計3小時左右。

6.設計測試數據，對出現的Bug進行改進，預計2小時左右。

二.實際用時和具體過程

1.安裝Microsoft Visual Studio Ultimate 2012，網址如下：

　　http://www.microsoft.com/zh-cn/download/confirmation.aspx?id=30678

網速較快用時1小時。

2.閱讀題目要求，理解要實現的功能：實現一個文本詞頻統計器，可以實現統計單詞數目，2聯短語數目，3聯短語數目。單詞比較忽略大小寫，輸出時按照詞頻從大到小排序，相等時按字典序排序，用時15min。

3.根據題目，設計程序框架，完善代碼：（用時大約4小時）

　　（1）獲取指定目錄下的所有文件：

// Enumerate every file located directly inside the target directory.
string[] files = Directory.GetFiles(path);

　　（2）讀取每個文件的內容：

// Walk the directory listing and load every supported file.
// Extensions are compared case-insensitively so that a file such as
// "NOTES.TXT" is processed as well.
string[] acceptedExtensions = { ".txt", ".cpp", ".h", ".cs" };
foreach (String i in files)
{
    // Path.GetExtension returns the extension including its leading dot,
    // or an empty string when the file name has no extension.
    string extension = Path.GetExtension(i);
    bool supported = false;
    foreach (string accepted in acceptedExtensions)
    {
        if (extension.Equals(accepted, StringComparison.OrdinalIgnoreCase))
        {
            supported = true;
            break;
        }
    }
    if (!supported)
    {
        continue; // only the listed formats are processed
    }

    // Skip the report file so earlier output is never counted as input.
    // Comparing the file name alone works no matter how `path` is written
    // (for example with or without a trailing separator).
    if (Path.GetFileName(i).Equals("12061162.txt", StringComparison.OrdinalIgnoreCase))
    {
        continue;
    }

    // Break the file content into sentence/line sized pieces.
    String[] text = rgxwords.Split(File.ReadAllText(i));
}

　　（3）對於獲取的文本信息整理：先將文本拆分成若干行或者句子，設定三個Regex對象，分別匹配單個單詞、2個由單個空格隔開的單詞、3個由單個空格隔開的單詞。調用Regex.Match()方法和Match.NextMatch()方法匹配所有可匹配項：

　　　　數據存放數組定義如下：

// Occurrence records collected for the text currently being processed.
ArrayList data = new ArrayList();            // single words
ArrayList word_word = new ArrayList();       // two-word phrases (extended mode 2)
ArrayList word_word_word = new ArrayList();  // three-word phrases (extended mode 3)

　　　　模板定義如下：

// Splits raw text into sentence/line sized pieces. Delimiters are line breaks
// and the punctuation characters , . ( ) { } [ ] : " ! ;
// (the previous pattern listed '{' twice and never included '[').
Regex rgxwords = new Regex("[\n\r,.\\(\\)\\{\\}\\[\\]:\"!;]+");

// Simple mode: a word is at least three letters followed by any letters/digits.
Regex regword1 = new Regex("[a-zA-Z]{3}[0-9a-zA-Z]*");

// Extended mode 2: two words separated by exactly one space.
Regex regword2 = new Regex("[a-zA-Z]{3}[0-9a-zA-Z]* [a-zA-Z]{3}[0-9a-zA-Z]*");

// Extended mode 3: three words, each pair separated by exactly one space.
Regex regword3 = new Regex("[a-zA-Z]{3}[0-9a-zA-Z]* [a-zA-Z]{3}[0-9a-zA-Z]* [a-zA-Z]{3}[0-9a-zA-Z]*");

　　　　具體過程如下：

// Simple mode: record every word occurrence in the current piece of text.
Match match = regword1.Match(text[j], 0);
while (match.Success)
{
    data.Add(new Data(match.Value, 1));
    match = match.NextMatch();
}

// Extended mode 2: record every two-word phrase, including overlapping ones
// ("a b c" yields "a b" and "b c").
// The next search resumes at the end of the current phrase's first word, so the
// second word can start the next phrase, but a search never restarts inside
// the first word (which used to produce fragments such as "ello world").
Match match2 = regword2.Match(text[j], 0);
while (match2.Success)
{
    word_word.Add(new Data(match2.Value, 1));
    // Words cannot contain a space, so the first space marks the end of word one.
    int resumeAt = match2.Index + match2.Value.IndexOf(' ');
    match2 = regword2.Match(text[j], resumeAt);
}

// Extended mode 3: same scheme for three-word phrases.
match2 = regword3.Match(text[j], 0);
while (match2.Success)
{
    word_word_word.Add(new Data(match2.Value, 1));
    int resumeAt = match2.Index + match2.Value.IndexOf(' ');
    match2 = regword3.Match(text[j], resumeAt);
}

　　　（4）對於存放數據的ArrayList類整理排序：主要運用ArrayList.Sort(IComparer)方法，需要自己實現IComparer接口。

/// <summary>
/// Comparer that orders <c>Data</c> entries by word in DESCENDING
/// case-insensitive order (the arguments are swapped before comparing).
/// Entries whose words differ only by case end up next to each other.
/// </summary>
class myReverserClass1 : IComparer
{
    /// <summary>
    /// Compares two words, ignoring case first and using case only as a tie-break.
    /// </summary>
    /// <param name="x">First word.</param>
    /// <param name="y">Second word.</param>
    /// <returns>
    /// Negative when <paramref name="x"/> orders before <paramref name="y"/>,
    /// positive when after, zero when the strings are identical.
    /// 1. The upper-cased forms are compared character by character.
    /// 2. If one is a prefix of the other, the shorter one orders first.
    /// 3. If they are equal ignoring case, the first position where the original
    ///    characters differ decides; the result is <c>y[i] - x[i]</c>, so the
    ///    string with the larger character code there orders first.
    /// </returns>
    internal static int MyStringCompare(String x, String y)
    {
        int lx = x.Length, ly = y.Length;
        int shared = lx < ly ? lx : ly;

        // Invariant upper-casing keeps the ordering independent of the current culture.
        String xx = x.ToUpperInvariant();
        String yy = y.ToUpperInvariant();

        for (int i = 0; i < shared; i++)
        {
            if (xx[i] != yy[i])
            {
                return xx[i] - yy[i];
            }
        }

        if (lx != ly)
        {
            return lx < ly ? -1 : 1;
        }

        // Same letters ignoring case: fall back to the original characters.
        for (int i = 0; i < shared; i++)
        {
            if (x[i] != y[i])
            {
                return y[i] - x[i];
            }
        }
        return 0;
    }

    int IComparer.Compare(Object x, Object y)
    {
        // Arguments are intentionally swapped to obtain the reversed order.
        return MyStringCompare(((Data)y).word, ((Data)x).word);
    }
}

/// <summary>
/// Comparer that orders <c>Data</c> entries by count, highest first; entries with
/// equal counts are ordered by <see cref="myReverserClass1.MyStringCompare"/>.
/// </summary>
class myReverserClass2 : IComparer
{
    int IComparer.Compare(Object x, Object y)
    {
        Data left = (Data)x;
        Data right = (Data)y;

        if (left.num > right.num)
        {
            return -1;
        }
        if (left.num < right.num)
        {
            return 1;
        }
        return myReverserClass1.MyStringCompare(left.word, right.word);
    }
}

　　　設計相關方法去掉重複單詞，記錄次數：

/// <summary>
/// Sorts the list with <c>myReverserClass1</c>, then merges neighbouring entries
/// whose upper-cased words are equal: the later entry is removed and the count
/// of the earlier one is incremented by one.
/// </summary>
/// <param name="array">List of <c>Data</c> entries; it is sorted and shrunk in place.</param>
/// <returns>The same list instance that was passed in.</returns>
static ArrayList Redelete(ArrayList array)
{
    array.Sort(new myReverserClass1());

    int i = 0;
    while (i < array.Count - 1)
    {
        Data current = (Data)array[i];
        Data following = (Data)array[i + 1];
        current.word = current.word.ToUpper();
        following.word = following.word.ToUpper();

        if (!current.word.Equals(following.word))
        {
            // Different word: move on to the next position.
            i++;
            continue;
        }

        // Duplicate: drop the neighbour and stay on the same index so the
        // element that slides into i + 1 is examined next.
        array.RemoveAt(i + 1);
        Data survivor = (Data)array[i];
        survivor.num++;
        array[i] = survivor;
    }
    return array;
}

/// <summary>
/// Sorts the list in place with <c>myReverserClass2</c>.
/// </summary>
/// <param name="array">List of <c>Data</c> entries to sort.</param>
/// <returns>The same list instance that was passed in.</returns>
static ArrayList Resort(ArrayList array)
{
    array.Sort(new myReverserClass2());
    return array;
}

　　　（5）對數據調用自定義的Redelete方法和Resort方法，並輸出結果到指定文件下

// Writer used for the report file.
private static StreamWriter writer;

// Open the report file inside the output directory. Path.Combine inserts the
// directory separator only when it is needed.
writer = new StreamWriter(Path.Combine(dirpath, "12061162.txt"));

// Merge duplicates and rank each collection before reporting.
data = Resort(Redelete(data));
word_word = Resort(Redelete(word_word));
word_word_word = Resort(Redelete(word_word_word));
try
{
    writer.WriteLine("文件地址：" + i);

    // At most the first ten two-word phrases.
    writer.WriteLine("最常見2聯短語:");
    if (word_word.Count == 0)
    {
        writer.WriteLine("不存在");
    }
    else
    {
        int shown = Math.Min(word_word.Count, 10);
        for (int rank = 0; rank < shown; rank++)
        {
            writer.WriteLine(((Data)word_word[rank]).word);
        }
    }

    // At most the first ten three-word phrases.
    writer.WriteLine("最常見3聯短語:");
    if (word_word_word.Count == 0)
    {
        writer.WriteLine("不存在");
    }
    else
    {
        int shown = Math.Min(word_word_word.Count, 10);
        for (int rank = 0; rank < shown; rank++)
        {
            writer.WriteLine(((Data)word_word_word[rank]).word);
        }
    }

    // Every word together with its count.
    writer.WriteLine("所有單詞如下:");
    foreach (Data entry in data)
    {
        writer.WriteLine(entry.word + "------" + entry.num);
    }
}
catch (UnauthorizedAccessException e)
{
    Console.WriteLine(e.ToString());
}

4.設計數據，數據如下：

http://pan.baidu.com/s/1mgyYOTM

三. 性能分析與改進

雖然排序用了ArrayList.Sort()方法，但是刪除重複項時用了ArrayList.RemoveAt()方法，最壞情況下是N^2的複雜度，這導致算法複雜度大幅升高，這一點是後來才意識到的。可以改用Dictionary類來減少運行時間。改進代碼如下：（課程網站上提交的是改進前的原代碼，想改時發現已經錯過提交時間了）

/// <summary>
/// Sorts the list with <c>myReverserClass1</c>, then walks it once and counts, in a
/// dictionary, each run of neighbouring entries whose upper-cased words are equal.
/// The dictionary key of a run is the word of the run's first entry.
/// </summary>
/// <param name="array">List of <c>Data</c> entries; it is sorted in place.</param>
/// <returns>A new list holding one <c>Data</c> entry per dictionary key with its count.</returns>
static ArrayList Redelete(ArrayList array)
{
    array.Sort(new myReverserClass1());

    Dictionary<String, int> map = new Dictionary<String, int>();
    int i = 0;
    while (i < array.Count)
    {
        // First entry of a run: register it with a count of one.
        Data now = (Data)array[i];
        map.Add(now.word, 1);
        String temp = now.word.ToUpper();

        // Absorb every directly following entry that is the same word ignoring case.
        while (i + 1 < array.Count)
        {
            Data nxt = (Data)array[i + 1];
            nxt.word = nxt.word.ToUpper();
            if (!temp.Equals(nxt.word))
            {
                break;
            }
            map[now.word]++;
            i++;
        }

        i++; // step past the last entry of the run
    }

    ArrayList merged = new ArrayList();
    foreach (KeyValuePair<String, int> kvp in map)
    {
        merged.Add(new Data(kvp.Key, kvp.Value));
    }
    return merged;
}

這組數據在改進前無法運行

改進後運行狀態為：

雖然也很慢，而且cpu運行有時比較高，但是可以出結果了，之前的40min也沒出結果。

從上面看出，我的代碼主要運行時間花在IComparer比較上：排序時每比較一次就會調用一次MyStringCompare，故浪費的時間較多。

因代碼編寫匆忙，故時間主要花費在實現功能上，而忽略了性能。

四.數據驗證

從此數據可看出，程序可以實現掃描三聯詞彙、二聯詞彙，統計所有單詞詞頻，並且可以忽略大小寫。故程序正確性無誤。

五.感想

雖然不是第一次寫C#程序了，但是完成這次作業後才發現，自己C#知識不足，寫出的代碼風格接近java，對C#特有的一些知識沒有掌握。而且前期投入時間不足，導致後期忙於趕任務，完成的不是很好，性能不夠快，代碼全寫在主類中，全是靜態方法，風格不是很好。希望這次也給自己一個教訓，總結經驗，更好地完成下一次作業！