在博客園學習知識是很方便的,但若做成客戶端,自定義獲取數據,那就更好啦!
那麼需求有哪些呢,第一,我只查看推薦數大於2的文章;第二,我想要只查看C#或者Java的文章;第三,我想要查看推薦數大於2的新聞;第四,我還想搜索文章,並且只搜索推薦數大於2的文章。
先來預覽一下成品吧
其中列表裡左邊是推薦數,反正我是優先看推薦數多的,中間是標題,右邊是日期,至於其他信息,額,我其實不太關心,點擊一行後直接在瀏覽器打開。
額,大體先這樣吧,那麼實現這些功能需要什麼技能呢,首先我得準備一下通用類,大概需要web請求的幫助類、Gzip格式網頁的壓縮/解壓幫助類、html字符串解析的幫助類。
/// <summary>
/// Thin wrapper around <see cref="WebClient"/> that retries failed requests and
/// returns <c>null</c> instead of throwing when a request ultimately fails.
/// </summary>
public class WebHelper : IDisposable
{
    /// <summary>Total number of attempts made for one request before giving up.</summary>
    private const int MaxAttempts = 3;

    /// <summary>The underlying client; exposed so callers can tweak headers, proxy, etc.</summary>
    public readonly WebClient Web = new WebClient();

    /// <summary>Sets the text encoding used by the string based download/upload methods.</summary>
    public Encoding Encoding
    {
        set { Web.Encoding = value; }
    }

    /// <summary>Creates a helper whose client uses UTF-8.</summary>
    public WebHelper()
    {
        Web.Encoding = Encoding.UTF8;
    }

    /// <summary>Creates a helper whose client uses the given encoding.</summary>
    /// <param name="encoding">Encoding assigned to the underlying client.</param>
    public WebHelper(Encoding encoding)
    {
        Web.Encoding = encoding;
    }

    /// <summary>
    /// Downloads the requested resource as a string.
    /// </summary>
    /// <param name="url">URL</param>
    /// <returns>
    /// The response text, or <c>null</c> when the failure is considered permanent
    /// or all <see cref="MaxAttempts"/> attempts failed.
    /// </returns>
    public string DownloadString(string url)
    {
        // The attempt counter is local on purpose: a counter stored on the instance
        // would leak state from one call into the next.
        for (int attempt = 1; ; attempt++)
        {
            try
            {
                return Web.DownloadString(url);
            }
            catch (WebException e)
            {
                if (attempt >= MaxAttempts || IsPermanentFailure(e))
                {
                    return null;
                }
            }
        }
    }

    /// <summary>
    /// Uploads the given string to the given resource with a form-urlencoded POST.
    /// </summary>
    /// <param name="address">Target address.</param>
    /// <param name="data">Request body.</param>
    /// <returns>The response text, or <c>null</c> when all attempts failed.</returns>
    public string UploadString(string address, string data)
    {
        for (int attempt = 1; ; attempt++)
        {
            // Assign rather than Add, so setting the header before every attempt
            // can never accumulate duplicate values.
            Web.Headers["Content-Type"] = "application/x-www-form-urlencoded";
            try
            {
                return Web.UploadString(address, "POST", data);
            }
            catch (Exception)
            {
                // Best effort by design: any failure is retried and finally reported as null.
                if (attempt >= MaxAttempts)
                {
                    return null;
                }
            }
        }
    }

    /// <summary>
    /// Downloads the requested resource while asking the server for a gzip compressed response.
    /// </summary>
    /// <param name="url">URL</param>
    /// <param name="encoding">Encoding of the page.</param>
    /// <returns>
    /// The decoded page text, or <c>null</c> when the failure is considered permanent
    /// or all <see cref="MaxAttempts"/> attempts failed.
    /// </returns>
    public string DownloadGzipString(string url, Encoding encoding)
    {
        try
        {
            for (int attempt = 1; ; attempt++)
            {
                Web.Headers["Accept-Encoding"] = "gzip";
                try
                {
                    byte[] body = Web.DownloadData(url);

                    // A server may ignore Accept-Encoding and answer uncompressed;
                    // only decompress when the payload really is a gzip stream.
                    byte[] raw = LooksLikeGzip(body) ? ZipHelper.GzipDecompress(body) : body;
                    return encoding.GetString(raw);
                }
                catch (WebException e)
                {
                    if (attempt >= MaxAttempts || IsPermanentFailure(e))
                    {
                        return null;
                    }
                }
            }
        }
        finally
        {
            // Do not let the gzip request header affect later plain downloads.
            Web.Headers.Remove("Accept-Encoding");
        }
    }

    /// <summary>Releases the underlying <see cref="WebClient"/>.</summary>
    public void Dispose()
    {
        Web.Dispose();
    }

    /// <summary>Returns true for failures that are not retried.</summary>
    private static bool IsPermanentFailure(WebException e)
    {
        return e.Message.Contains("404")
               || e.Status == WebExceptionStatus.ConnectFailure
               || e.Status == WebExceptionStatus.ProtocolError;
    }

    /// <summary>Returns true when the buffer starts with the gzip magic bytes 0x1F 0x8B.</summary>
    private static bool LooksLikeGzip(byte[] data)
    {
        return data != null && data.Length >= 2 && data[0] == 0x1F && data[1] == 0x8B;
    }
}
這裡有三個方法,其中的DownloadString和UploadString和.net Framework的WebClient的方法用法一樣,多了一個DownloadGzipString方法,這個方法用於get一個用Gzip壓縮的頁面,之所以重復寫DownloadString和UploadString是因為我懶,有時候請求網頁出現異常並不是該網頁不能請求,多請求幾次就能獲取,這裡自動嘗試3次請求,3次請求過後依然失敗則返回null。當然還有一種情況是需要用代理的,考慮到需要用代理的地方不多,並且代理的IP端口一般需要花錢來買,這裡就不貼用代理來請求頁面的代碼了,之前買過兩天耍過代理,我那時候的實現思路就是加一個ProxyPool代理池類,代理池從代理網站獲取當前可用的代理,一般是一次獲取十幾個,然後放入代理池,請求需要代理的網站時就去代理池獲取代理,WebClient.Proxy = new WebProxy(host, port);加了這個再去請求頁面就可以了,當然代理不一定可靠,所以當失敗後不要灰心,再用其他代理試試,總有一個成功的,當需要多線程請求網頁時,就new多個WebHelper類,他們都會共用一個ProxyPool代理池的。
/// <summary>
/// GZip helpers that work on whole in-memory byte buffers.
/// </summary>
public class ZipHelper
{
    /// <summary>
    /// Compresses a buffer with GZip.
    /// </summary>
    /// <param name="cbytes">Data to compress.</param>
    /// <returns>The GZip compressed bytes.</returns>
    public static byte[] GzipCompress(byte[] cbytes)
    {
        using (var output = new MemoryStream())
        {
            using (var compressor = new GZipStream(output, CompressionMode.Compress))
            {
                // Writing to the GZip stream compresses into the backing memory stream.
                compressor.Write(cbytes, 0, cbytes.Length);
            }

            // Read the result only after the compressor has been disposed,
            // so that everything it buffered has reached the memory stream.
            return output.ToArray();
        }
    }

    /// <summary>
    /// Decompresses a GZip buffer.
    /// </summary>
    /// <param name="cbytes">Data to decompress.</param>
    /// <returns>The decompressed bytes.</returns>
    public static byte[] GzipDecompress(byte[] cbytes)
    {
        using (var output = new MemoryStream())
        using (var input = new MemoryStream(cbytes))
        using (var decompressor = new GZipStream(input, CompressionMode.Decompress))
        {
            // Reading from the GZip stream decompresses on the fly.
            decompressor.CopyTo(output);
            return output.ToArray();
        }
    }
}
/// <summary>
/// Marker based string extraction helpers used to pull values out of HTML text.
/// </summary>
public class StringHelper
{
    // Matches a \uXXXX escape with exactly four hexadecimal digits.
    private static readonly Regex UnicodeEscapeRegex =
        new Regex(@"\\u([0-9a-f]{4})", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Matches any opening or closing HTML tag.
    private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled);

    /// <summary>
    /// Walks through <paramref name="str"/> and collects every value found between
    /// <paramref name="startStr"/> and <paramref name="endStr"/>.
    /// </summary>
    /// <param name="str">Text to search; null or empty yields an empty list.</param>
    /// <param name="startStr">Start marker.</param>
    /// <param name="endStr">End marker.</param>
    /// <param name="remove">True to strip the start and end markers from each value.</param>
    /// <returns>All matches in order of appearance.</returns>
    public static List<string> GetList(string str, string startStr, string endStr, bool remove = true)
    {
        var lst = new List<string>();
        int startIndex = 0;
        while (true)
        {
            int previousIndex = startIndex;
            string v = GetVal(str, startStr, endStr, remove, ref startIndex);
            if (startIndex == -1)
            {
                break;
            }
            lst.Add(v);
            if (startIndex <= previousIndex)
            {
                // The match consumed nothing (e.g. both markers empty); stop instead of looping forever.
                break;
            }
        }
        return lst;
    }

    /// <summary>
    /// Returns the first value found between <paramref name="startStr"/> and <paramref name="endStr"/>,
    /// searching from <paramref name="startIndex"/>; returns an empty string when there is no match.
    /// </summary>
    public static string GetVal(string str, string startStr, string endStr, bool remove = true, int startIndex = 0)
    {
        return GetVal(str, startStr, endStr, remove, ref startIndex);
    }

    /// <summary>
    /// Core marker search. On success <paramref name="startIndex"/> is moved just past the end marker;
    /// on failure it is set to -1 and an empty string is returned.
    /// </summary>
    private static string GetVal(string str, string startStr, string endStr, bool remove, ref int startIndex)
    {
        if (string.IsNullOrEmpty(str))
        {
            startIndex = -1;
            return string.Empty;
        }
        // Ordinal for both markers: the offsets below are computed from the markers' lengths,
        // which is only valid when the match length equals the marker length.
        int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
        if (istart == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        int iend = str.IndexOf(endStr, istart + startStr.Length, StringComparison.Ordinal);
        if (iend == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        startIndex = iend + endStr.Length;
        if (remove)
        {
            istart += startStr.Length;
            return str.Substring(istart, iend - istart);
        }
        return str.Substring(istart, startIndex - istart);
    }

    /// <summary>
    /// Walks through <paramref name="str"/> and collects every fixed-length value that follows
    /// <paramref name="startStr"/>.
    /// </summary>
    /// <param name="str">Text to search; null or empty yields an empty list.</param>
    /// <param name="startStr">Start marker.</param>
    /// <param name="needLength">Number of characters to take (not counting the start marker).</param>
    /// <param name="remove">True to strip the start marker from each value.</param>
    /// <returns>All matches in order of appearance.</returns>
    public static List<string> GetList(string str, string startStr, int needLength, bool remove = true)
    {
        var lst = new List<string>();
        int startIndex = 0;
        while (true)
        {
            int previousIndex = startIndex;
            string v = GetVal(str, startStr, needLength, remove, ref startIndex);
            if (startIndex == -1)
            {
                break;
            }
            lst.Add(v);
            if (startIndex <= previousIndex)
            {
                // No forward progress; stop instead of looping forever.
                break;
            }
        }
        return lst;
    }

    /// <summary>
    /// Returns the <paramref name="needLength"/> characters that follow the first occurrence of
    /// <paramref name="startStr"/>, searching from <paramref name="startIndex"/>;
    /// returns an empty string when there is no match.
    /// </summary>
    public static string GetVal(string str, string startStr, int needLength, bool remove = true, int startIndex = 0)
    {
        return GetVal(str, startStr, needLength, remove, ref startIndex);
    }

    /// <summary>
    /// Core fixed-length search. On success <paramref name="startIndex"/> is moved just past the
    /// extracted value; when the marker is missing or fewer than <paramref name="needLength"/>
    /// characters follow it, <paramref name="startIndex"/> is set to -1 and an empty string is returned.
    /// </summary>
    public static string GetVal(string str, string startStr, int needLength, bool remove, ref int startIndex)
    {
        if (string.IsNullOrEmpty(str))
        {
            startIndex = -1;
            return string.Empty;
        }
        int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
        if (istart == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        startIndex = istart + startStr.Length + needLength;
        if (startIndex > str.Length)
        {
            startIndex = -1;
            return string.Empty;
        }
        return remove
            ? str.Substring(istart + startStr.Length, needLength)
            : str.Substring(istart, startStr.Length + needLength);
    }

    /// <summary>
    /// Returns every href link (double-quoted form) found in the string.
    /// </summary>
    /// <param name="str">Text to search.</param>
    /// <returns>The link targets in order of appearance.</returns>
    public static List<string> GetUrls(string str)
    {
        return GetList(str, "href=\"", "\"");
    }

    /// <summary>
    /// Returns the first href link (double-quoted form) found in the string, or an empty string.
    /// </summary>
    /// <param name="str">Text to search.</param>
    public static string GetUrl(string str)
    {
        return GetVal(str, "href=\"", "\"");
    }

    /// <summary>
    /// Decodes the \uXXXX escapes found in <paramref name="str"/> and returns the decoded
    /// characters concatenated. Text outside the escapes is NOT copied to the result.
    /// </summary>
    /// <param name="str">Text containing \uXXXX escapes; null or empty yields an empty string.</param>
    /// <returns>The decoded characters only.</returns>
    public static string ToGB2312(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }
        var decoded = new StringBuilder();
        foreach (Match m in UnicodeEscapeRegex.Matches(str))
        {
            // Append the UTF-16 code unit directly so that two consecutive escapes forming a
            // surrogate pair stay a valid pair instead of being decoded one half at a time.
            decoded.Append((char) int.Parse(m.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture));
        }
        return decoded.ToString();
    }

    /// <summary>
    /// Removes every HTML tag from the text.
    /// </summary>
    /// <param name="html">HTML text; null or empty yields an empty string.</param>
    /// <returns>The text with all tags stripped.</returns>
    public static string RemoveHTMLTags(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }
        return HtmlTagRegex.Replace(html, "");
    }
}
這裡主要包含了GetList、RemoveHTMLTags和GetVal方法,爬蟲解析數據就靠他們了,具體的使用方法下面會有講解。
到這裡通用類大體就介紹完了,現在開始實地施工。
/// <summary>Minimum recommendation count an aggregated post or news item needs to be listed.</summary>
private const int MinDiggCount = 3;

/// <summary>
/// Loads one page of blog posts and lists those with more than 2 recommendations.
/// </summary>
/// <param name="pageIndex">Page number to request.</param>
/// <returns>False when the request failed or the page contained no items; otherwise true.</returns>
private bool AddPost(int pageIndex)
{
    var url = "https://www.cnblogs.com/mvc/AggSite/PostList.aspx";
    var html = _web.UploadString(url, GetUrl() + pageIndex);
    return AddAggSiteItems(html);
}

/// <summary>
/// Loads one page of search results for the text in the search box and lists them.
/// </summary>
/// <param name="pageIndex">Page number to request.</param>
/// <returns>False when the request failed or the page contained no results; otherwise true.</returns>
private bool AddSearchPost(int pageIndex)
{
    // Escape the keywords: characters such as '#', '&' or '+' would otherwise be read as
    // URL syntax (a search for "C#" would lose everything from '#').
    var keywords = Uri.EscapeDataString(txtSearch.Text.Trim());
    var url = $"http://zzk.cnblogs.com/s/blogpost?Keywords={keywords}&pageindex={pageIndex}";
    var html = _web.DownloadGzipString(url, Encoding.UTF8);
    if (string.IsNullOrEmpty(html))
    {
        // The request failed; there is nothing to parse.
        return false;
    }

    var posts = StringHelper.GetList(html, "\"searchItem", "\"searchItemInfo-comments");
    if (posts.Count == 0)
    {
        return false;
    }

    // NOTE(review): unlike AddPost/AddNews no recommendation threshold is applied here —
    // confirm whether search results are meant to be filtered as well.
    foreach (var item in posts)
    {
        var diggnum = StringHelper.GetVal(item, ">推薦(", ")");
        var n = StringHelper.GetVal(item, "searchItemTitle\">", "</h3>");
        var title = StringHelper.RemoveHTMLTags(StringHelper.GetVal(n, "\">", "</a>"));
        var date = StringHelper.GetVal(item, "searchItemInfo-publishDate\">", "</span>");
        _urls.Add(StringHelper.GetUrl(n));
        lstPost.Items.Add($"{diggnum} {title} {date}");
    }
    return true;
}

/// <summary>
/// Loads one page of news and lists the items with more than 2 recommendations.
/// </summary>
/// <param name="pageIndex">Page number to request.</param>
/// <returns>False when the request failed or the page contained no items; otherwise true.</returns>
private bool AddNews(int pageIndex)
{
    var url = "https://www.cnblogs.com/mvc/AggSite/NewsList.aspx";
    var html = _web.UploadString(url, "CategoryId=-1&CategoryType=News&ItemListActionName=NewsList&ItemListActionName=NewsList&PageIndex=" + pageIndex);
    return AddAggSiteItems(html);
}

/// <summary>
/// Parses an aggregated list page (posts or news) and adds every item that has at least
/// <see cref="MinDiggCount"/> recommendations to the list box and the URL list.
/// </summary>
/// <param name="html">Response text; may be null when the request failed.</param>
/// <returns>False when there was nothing to parse; otherwise true.</returns>
private bool AddAggSiteItems(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return false;
    }

    var posts = StringHelper.GetList(html, "\"post_item", "\"article_comment");
    if (posts.Count == 0)
    {
        return false;
    }

    foreach (var item in posts)
    {
        var n = StringHelper.GetVal(item, "\"diggnum", "/span>");
        int diggnum;
        // Skip items whose count is missing or unparsable instead of throwing.
        if (!int.TryParse(StringHelper.GetVal(n, ">", "<"), out diggnum) || diggnum < MinDiggCount)
        {
            continue;
        }

        var t = StringHelper.GetVal(item, "\"titlelnk", "/a>");
        var title = StringHelper.GetVal(t, ">", "<");
        var time = StringHelper.GetVal(item, "發布於 ", 16);

        var link = StringHelper.GetUrl(t);
        if (!link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            // Links without a scheme (e.g. "//host/path") get one so they can be opened.
            link = "https:" + link;
        }

        // Keep _urls and the list box in step: one entry each per accepted item.
        _urls.Add(link);
        lstPost.Items.Add($"{diggnum} {title} {time}");
    }
    return true;
}
授人以魚不如授人以漁,這些是怎麼回事呢
在博客園首頁按下F12,點擊下一頁,看看那些請求,瞄一瞄,就知道PostList.aspx是數據關鍵,裡面的參數中CategoryId是分類ID,CategoryType是分類種類,暫時發現SiteHome、SiteCategory和TopSiteCategory三個值,不選分類(首頁)時是SiteHome,當點擊母分類時,這個值就是TopSiteCategory,當點擊子分類時,這個值就是SiteCategory,PageIndex是當前頁,這個眾所周知啦,ParentCategoryId是父分類的ID,只有點擊子分類時需要把父分類的ID賦值到這個字段。說了這麼多,這個還只是獲取文章的接口,另外兩個查詢文章的和獲取新聞的也大同小異啦,大家自己研究。另外貼出的代碼裡有個GetUrl方法,這個就是為了賦值這些參數的,也貼出來吧。
/// <summary>
/// Builds the form-encoded request body for the post list from the two category combo boxes.
/// The result ends with "PageIndex=" so that the caller can append the page number.
/// </summary>
/// <returns>The request body without the page number.</returns>
private string GetUrl()
{
    // Parent category id for each index of cbbCate.
    string[] parentIds =
    {
        "108698", "2", "108701", "108703", "108704",
        "108705", "108709", "108712", "108724", "4"
    };

    // Child category ids for each parent, indexed by cbbType.SelectedIndex.
    string[][] childIds =
    {
        new[]
        {
            "18156", "108699", "108700", "108760", "108716", "108717", "108718", "108719",
            "108720", "108728", "108729", "108730", "108738", "108739", "108758"
        },
        new[]
        {
            "106876", "106880", "106882", "106877", "108696", "106894", "108735",
            "108746", "108748", "108751", "108752", "108753", "108742", "108754"
        },
        new[] { "106892", "108702", "106884", "108750" },
        new[] { "106883", "106893", "108731", "108737" },
        new[] { "78111", "50349", "106878", "108732", "108734", "108747", "108749", "3" },
        new[] { "108706", "108707", "108736", "108708", "106886" },
        new[] { "108710", "106891", "106889" },
        new[] { "108713", "108714", "108715", "108743", "108756", "106881" },
        new[] { "108721", "108725", "108726", "108755", "108757" },
        new[] { "807", "106879", "33909", "106885", "106895", "108759" }
    };

    // Values used when no known parent category is selected.
    string categoryId = "808";
    string categoryType = "SiteHome";
    string parentCategoryId = "0";

    int cateIndex = cbbCate.SelectedIndex;
    if (cateIndex >= 0 && cateIndex < parentIds.Length)
    {
        string[] children = childIds[cateIndex];
        int typeIndex = cbbType.SelectedIndex;
        if (typeIndex >= 0 && typeIndex < children.Length)
        {
            // A known child category is selected.
            categoryId = children[typeIndex];
            categoryType = "SiteCategory";
            parentCategoryId = parentIds[cateIndex];
        }
        else
        {
            // No known child selected: request the parent category itself.
            categoryId = parentIds[cateIndex];
            categoryType = "TopSiteCategory";
        }
    }

    return $"CategoryId={categoryId}&CategoryType={categoryType}&ParentCategoryId={parentCategoryId}&ItemListActionName=PostList&PageIndex=";
}
功能大體介紹完了,末了還有個小驚喜,就是提示框,怎麼在Winform中彈出提示框,過段時間自動消失呢,像這樣
其實這個不難,弄個定時器就好啦
但需要注意的是,怎麼才能彈出提示在最頂層呢,不然看不到呢,其實把TopMost屬性設為True就好了,另外ShowIcon、ShowInTaskbar、MaximizeBox和MinimizeBox也要設為false,StartPosition設為CenterScreen,這樣才專業。
由於剛弄成,難免會有疏忽和Bug,大家看到後請幫忙指正,附上代碼:博客園精華客戶端。