在編寫網絡爬蟲時,HttpWebRequest幾乎可以完成絕大多數網站的抓取。為了更好地使用這一技術,我將常用的幾個功能進行了封裝,以方便調用。這個類已經在多個項目中得到使用,主要解決了Cookies相關的一些問題;如果有其它方面的問題可以提出來,我會進一步完善。
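目前HttpHelper包含了以下幾個方面:
- GetHttpContent:獲取網頁內容,並根據響應頭或Html頭部自動判斷編碼;
- CreateGetHttpResponse / CreatePostHttpResponse:創建GET、POST方式的HTTP請求;
- SetCookie:解析響應頭中的Set-Cookie(支持HttpOnly);
- GetAllCookies / ConvertToCookieContainer:CookieContainer與字典之間的相互轉換;
- BuildPostData:根據頁面中的input元素生成Post數據。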
代碼如下:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.Globalization;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Text.RegularExpressions;
using HtmlAgilityPack;

namespace TNIdea.Common.Helper
{
    /// <summary>
    /// Static helpers built on <see cref="HttpWebRequest"/> for simple crawling:
    /// fetching a page as text with charset detection, issuing GET/POST requests,
    /// parsing <c>Set-Cookie</c> headers, converting a <see cref="CookieContainer"/>
    /// to and from a dictionary, and building form post data from HTML.
    /// </summary>
    public class HttpHelper
    {
        /// <summary>
        /// Pattern that extracts a charset name from an HTML <c>meta</c> tag or an
        /// XML declaration; the name is captured in the <c>Charset</c> group.
        /// </summary>
        public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""'>]+)""?)|(xml.*?encoding=""?(?<Charset>[^\s"">]+)""?)";

        // Maximum number of leading body bytes inspected when sniffing the charset.
        private const int HeadSniffLength = 10000;

        // Charset used when neither the response header nor the document names a usable one.
        private const string FallbackCharset = "GB2312";

        // Default User-Agent values; GET and POST historically used different strings.
        private const string DefaultGetUserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36";
        private const string DefaultPostUserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";

        // Matches a header fragment that ends in the middle of an Expires date,
        // i.e. right after the day name ("...; expires=Thu").
        private static readonly Regex DanglingExpiresReg =
            new Regex(@"expires\s*=\s*""?[A-Za-z]+\s*$", RegexOptions.IgnoreCase);

        /// <summary>
        /// Downloads <paramref name="url"/> and returns the body decoded as text.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="postData">Form data; when not blank the request is sent as POST, otherwise as GET.</param>
        /// <param name="cookies">
        /// Cookie container sent with the request. When supplied, cookies parsed from the
        /// response are also added to it; when null, response cookies are not collected.
        /// </param>
        /// <param name="userAgent">User-Agent header; blank selects a built-in browser string.</param>
        /// <param name="referer">Referer header.</param>
        /// <param name="cookiesDomain">
        /// Domain assigned to response cookies that do not name one; blank means the host
        /// of the final response URI.
        /// </param>
        /// <param name="encode">Encoding used to decode the body; null means detect it.</param>
        /// <returns>
        /// The decoded body. If the request itself fails an empty string is returned; if
        /// reading or decoding the response fails, the exception text is returned
        /// (behavior kept from the original implementation).
        /// </returns>
        public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
        {
            try
            {
                HttpWebResponse httpResponse = string.IsNullOrWhiteSpace(postData)
                    ? CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer)
                    : CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);

                try
                {
                    // Headers are read before the response is closed so the code does not
                    // depend on properties remaining accessible afterwards.
                    if (cookies != null)
                    {
                        MergeResponseCookies(httpResponse, cookies, cookiesDomain);
                    }

                    return ReadContent(httpResponse, encode);
                }
                catch (Exception ex)
                {
                    return ex.ToString();
                }
                finally
                {
                    httpResponse.Close();
                }
            }
            catch
            {
                // Best effort by design: any failure to obtain a response yields an empty page.
                return string.Empty;
            }
        }

        /// <summary>
        /// Adds the cookies announced by <paramref name="response"/> to <paramref name="target"/>.
        /// Cookies the container rejects are skipped so that one bad cookie neither
        /// discards the others nor the page content.
        /// </summary>
        private static void MergeResponseCookies(HttpWebResponse response, CookieContainer target, string cookiesDomain)
        {
            if (string.IsNullOrWhiteSpace(cookiesDomain))
            {
                cookiesDomain = response.ResponseUri.Host;
            }

            CookieCollection parsed = SetCookie(response, cookiesDomain) ?? response.Cookies;
            foreach (Cookie cookie in parsed)
            {
                try
                {
                    target.Add(cookie);
                }
                catch (CookieException)
                {
                    // Rejected by the container (invalid or oversized cookie): skip it.
                }
                catch (ArgumentException)
                {
                    // Cookie without a usable domain: skip it.
                }
            }
        }

        /// <summary>
        /// Reads the whole (decompressed) body and decodes it in one step, so a
        /// multi-byte character can never be split between two decode calls.
        /// </summary>
        private static string ReadContent(HttpWebResponse httpResponse, Encoding encode)
        {
            byte[] body;
            using (Stream decoded = OpenDecodedStream(httpResponse))
            using (MemoryStream buffer = new MemoryStream())
            {
                decoded.CopyTo(buffer);
                body = buffer.ToArray();
            }

            if (encode == null)
            {
                encode = DetectEncoding(httpResponse.CharacterSet, body);
            }

            return encode.GetString(body, 0, body.Length);
        }

        /// <summary>
        /// Wraps the response stream in a decompressor matching the Content-Encoding header.
        /// </summary>
        private static Stream OpenDecodedStream(HttpWebResponse httpResponse)
        {
            string contentEncoding = httpResponse.ContentEncoding ?? string.Empty;
            switch (contentEncoding.ToUpperInvariant())
            {
                case "GZIP":
                    return new GZipStream(httpResponse.GetResponseStream(), CompressionMode.Decompress);
                case "DEFLATE":
                    return new DeflateStream(httpResponse.GetResponseStream(), CompressionMode.Decompress);
                default:
                    return httpResponse.GetResponseStream();
            }
        }

        /// <summary>
        /// Chooses the encoding for a body: the charset declared by the response unless it
        /// is missing or one of the values treated as unreliable ("ISO-8859-1", "zh-cn"),
        /// then the charset named in the document head, then GB2312, and finally UTF-8 if
        /// GB2312 is not available. Never returns null.
        /// </summary>
        private static Encoding DetectEncoding(string declaredCharset, byte[] body)
        {
            bool trustDeclared = !string.IsNullOrWhiteSpace(declaredCharset)
                && !string.Equals(declaredCharset, "ISO-8859-1", StringComparison.Ordinal)
                && !string.Equals(declaredCharset, "zh-cn", StringComparison.Ordinal);

            if (trustDeclared)
            {
                Encoding declared = TryGetEncoding(declaredCharset);
                if (declared != null)
                {
                    return declared;
                }
            }

            Match match = Regex.Match(BuildSniffText(body), CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            if (match.Success)
            {
                Encoding sniffed = TryGetEncoding(match.Groups["Charset"].Value);
                if (sniffed != null)
                {
                    return sniffed;
                }
            }

            return TryGetEncoding(FallbackCharset) ?? Encoding.UTF8;
        }

        /// <summary>
        /// Maps the leading bytes of the body one-to-one to characters and cuts the text
        /// after the first <c>&lt;/head&gt;</c>, giving the region searched for a charset.
        /// </summary>
        private static string BuildSniffText(byte[] body)
        {
            int length = Math.Min(body.Length, HeadSniffLength);
            StringBuilder text = new StringBuilder(length);
            for (int i = 0; i < length; i++)
            {
                text.Append((char)body[i]);
            }

            string sniff = text.ToString();
            int headEnd = sniff.IndexOf("</head>", StringComparison.OrdinalIgnoreCase);
            return headEnd >= 0 ? sniff.Substring(0, headEnd + "</head>".Length) : sniff;
        }

        /// <summary>
        /// Resolves an encoding by name; returns null instead of throwing when the name
        /// is blank, unknown or unsupported.
        /// </summary>
        private static Encoding TryGetEncoding(string name)
        {
            if (string.IsNullOrWhiteSpace(name))
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(name.Trim());
            }
            catch (ArgumentException)
            {
                return null;
            }
            catch (NotSupportedException)
            {
                return null;
            }
        }

        /// <summary>
        /// Sends a GET request and returns the response. The caller must close it.
        /// </summary>
        /// <param name="url">Address to request (http or https).</param>
        /// <param name="timeout">Request timeout in milliseconds.</param>
        /// <param name="userAgent">User-Agent header; blank selects a built-in browser string.</param>
        /// <param name="cookies">Cookie container for the request; null creates a new one.</param>
        /// <param name="referer">Referer header.</param>
        /// <returns>The HTTP response.</returns>
        public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
        {
            HttpWebRequest request = CreateRequest(url, "GET", timeout, userAgent, DefaultGetUserAgent, cookies, referer);
            return request.GetResponse() as HttpWebResponse;
        }

        /// <summary>
        /// Sends a form-encoded POST request and returns the response. The caller must close it.
        /// </summary>
        /// <param name="url">Address to request (http or https).</param>
        /// <param name="postData">Request body, written as UTF-8; a blank value sends no body.</param>
        /// <param name="timeout">Request timeout in milliseconds.</param>
        /// <param name="userAgent">User-Agent header; blank selects a built-in browser string.</param>
        /// <param name="cookies">Cookie container for the request; null creates a new one.</param>
        /// <param name="referer">Referer header.</param>
        /// <returns>The HTTP response.</returns>
        public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
        {
            HttpWebRequest request = CreateRequest(url, "POST", timeout, userAgent, DefaultPostUserAgent, cookies, referer);
            request.ContentType = "application/x-www-form-urlencoded";

            if (!string.IsNullOrWhiteSpace(postData))
            {
                byte[] data = Encoding.UTF8.GetBytes(postData);
                request.ContentLength = data.Length;
                using (Stream stream = request.GetRequestStream())
                {
                    stream.Write(data, 0, data.Length);
                }
            }

            return request.GetResponse() as HttpWebResponse;
        }

        /// <summary>
        /// Builds a request with the settings shared by GET and POST.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="NotSupportedException">The URL does not produce an HTTP request.</exception>
        private static HttpWebRequest CreateRequest(string url, string method, int timeout, string userAgent, string defaultUserAgent, CookieContainer cookies, string referer)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                // Note: this replaces the process-wide certificate validation callback.
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
            }

            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                throw new NotSupportedException("Only http and https URLs are supported: " + url);
            }

            request.Referer = referer;
            request.Method = method;
            request.UserAgent = string.IsNullOrWhiteSpace(userAgent) ? defaultUserAgent : userAgent;
            request.Timeout = timeout;
            request.KeepAlive = true;
            request.AllowAutoRedirect = true;
            request.CookieContainer = cookies ?? new CookieContainer();
            return request;
        }

        /// <summary>
        /// Certificate validation callback: accepts the server certificate only when no
        /// policy errors were reported.
        /// </summary>
        /// <param name="sender">Sender supplied by the framework.</param>
        /// <param name="certificate">Server certificate.</param>
        /// <param name="chain">Certificate chain.</param>
        /// <param name="errors">Policy errors found by the framework.</param>
        /// <returns>True when the certificate is accepted.</returns>
        private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            return errors == SslPolicyErrors.None;
        }

        /// <summary>
        /// Parses the <c>Set-Cookie</c> headers of a response into cookies, including
        /// HttpOnly ones.
        /// </summary>
        /// <param name="response">Response whose headers are read.</param>
        /// <param name="defaultDomain">Domain given to cookies that do not specify one.</param>
        /// <returns>
        /// The parsed cookies, or null when the response has no usable <c>Set-Cookie</c>
        /// header or parsing fails, so the caller can fall back to
        /// <see cref="HttpWebResponse.Cookies"/>.
        /// </returns>
        private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)
        {
            try
            {
                string[] fragments = response.Headers.GetValues("Set-Cookie");
                if (fragments == null || fragments.Length == 0)
                {
                    return null;
                }

                CookieCollection cookies = new CookieCollection();
                foreach (string header in MergeSplitExpires(fragments))
                {
                    Cookie cookie = ParseSetCookie(header, defaultDomain);
                    if (cookie != null)
                    {
                        cookies.Add(cookie);
                    }
                }

                return cookies.Count > 0 ? cookies : null;
            }
            catch (Exception)
            {
                // Deliberate best effort: the caller falls back to response.Cookies.
                return null;
            }
        }

        /// <summary>
        /// Re-joins header fragments that were split at the comma inside an Expires date
        /// ("expires=Thu, 01-Jan-2030 ..."). A fragment is appended to the previous one
        /// only when the previous one ends right after the day name, so complete cookies
        /// are never merged into each other.
        /// </summary>
        private static List<string> MergeSplitExpires(string[] fragments)
        {
            List<string> merged = new List<string>(fragments.Length);
            foreach (string fragment in fragments)
            {
                int last = merged.Count - 1;
                if (last >= 0 && DanglingExpiresReg.IsMatch(merged[last]))
                {
                    merged[last] = merged[last] + ", " + fragment;
                }
                else
                {
                    merged.Add(fragment);
                }
            }

            return merged;
        }

        /// <summary>
        /// Parses one <c>Set-Cookie</c> value. The first <c>name=value</c> pair is the
        /// cookie itself; the remaining segments are attributes, matched case-insensitively.
        /// Unknown attributes are ignored. Returns null when the value is malformed.
        /// </summary>
        private static Cookie ParseSetCookie(string header, string defaultDomain)
        {
            if (string.IsNullOrWhiteSpace(header))
            {
                return null;
            }

            string[] segments = header.Split(';');
            int nameEnd = segments[0].IndexOf('=');
            if (nameEnd <= 0)
            {
                return null;
            }

            string name = segments[0].Substring(0, nameEnd).Trim();
            if (name.Length == 0)
            {
                return null;
            }

            try
            {
                Cookie cookie = new Cookie();
                cookie.Name = name;
                cookie.Value = segments[0].Substring(nameEnd + 1).Trim();

                for (int i = 1; i < segments.Length; i++)
                {
                    string attribute = segments[i].Trim();
                    if (attribute.Length == 0)
                    {
                        continue;
                    }

                    int separator = attribute.IndexOf('=');
                    string key = (separator >= 0 ? attribute.Substring(0, separator) : attribute).Trim().ToLowerInvariant();
                    string value = separator >= 0 ? attribute.Substring(separator + 1).Trim() : string.Empty;

                    switch (key)
                    {
                        case "path":
                            cookie.Path = value;
                            break;
                        case "domain":
                            cookie.Domain = value;
                            break;
                        case "expires":
                            // An unparseable date leaves the cookie as a session cookie.
                            DateTime expires;
                            if (DateTime.TryParse(value, CultureInfo.InvariantCulture, DateTimeStyles.None, out expires))
                            {
                                cookie.Expires = expires;
                            }
                            break;
                        case "httponly":
                            cookie.HttpOnly = true;
                            break;
                        case "secure":
                            cookie.Secure = true;
                            break;
                    }
                }

                if (string.IsNullOrEmpty(cookie.Domain))
                {
                    cookie.Domain = defaultDomain;
                }

                return cookie;
            }
            catch (CookieException)
            {
                // Name or attribute rejected by System.Net.Cookie: drop this cookie only.
                return null;
            }
        }

        /// <summary>
        /// Lists every cookie held by a <see cref="CookieContainer"/>, grouped by domain.
        /// </summary>
        /// <remarks>
        /// Reads the container's private fields <c>m_domainTable</c> and <c>m_list</c>
        /// through reflection, so it depends on the framework's internal layout.
        /// </remarks>
        /// <param name="cookieContainer">Container to enumerate.</param>
        /// <returns>
        /// A dictionary whose key is the domain and whose value is that domain's cookies
        /// formatted as <c>name=value</c> pairs joined by ';'.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="cookieContainer"/> is null.</exception>
        public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)
        {
            if (cookieContainer == null)
            {
                throw new ArgumentNullException("cookieContainer");
            }

            const System.Reflection.BindingFlags PrivateField =
                System.Reflection.BindingFlags.NonPublic
                | System.Reflection.BindingFlags.GetField
                | System.Reflection.BindingFlags.Instance;

            Dictionary<string, string> cookies = new Dictionary<string, string>();

            Hashtable domainTable = (Hashtable)cookieContainer.GetType().InvokeMember(
                "m_domainTable", PrivateField, null, cookieContainer, new object[] { });

            foreach (DictionaryEntry entry in domainTable)
            {
                object pathList = entry.Value;
                SortedList collectionsByPath = (SortedList)pathList.GetType().InvokeMember(
                    "m_list", PrivateField, null, pathList, new object[] { });

                List<string> pairs = new List<string>();
                foreach (CookieCollection collection in collectionsByPath.Values)
                {
                    foreach (Cookie cookie in collection)
                    {
                        pairs.Add(cookie.Name + "=" + cookie.Value);
                    }
                }

                cookies.Add((string)entry.Key, string.Join(";", pairs));
            }

            return cookies;
        }

        /// <summary>
        /// Builds a <see cref="CookieContainer"/> from a dictionary in the format produced
        /// by <see cref="GetAllCookies"/>.
        /// </summary>
        /// <param name="cookies">
        /// Key: cookie domain. Value: <c>name=value</c> pairs separated by ';'. Whitespace
        /// around a pair is ignored and segments without '=' are skipped.
        /// </param>
        /// <returns>A container holding every cookie, each with path "/".</returns>
        /// <exception cref="ArgumentNullException"><paramref name="cookies"/> is null.</exception>
        public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)
        {
            if (cookies == null)
            {
                throw new ArgumentNullException("cookies");
            }

            CookieContainer cookieContainer = new CookieContainer();

            foreach (KeyValuePair<string, string> domainCookies in cookies)
            {
                if (string.IsNullOrEmpty(domainCookies.Value))
                {
                    continue;
                }

                foreach (string rawPair in domainCookies.Value.Split(';'))
                {
                    string pair = rawPair.Trim();
                    int firstEqual = pair.IndexOf('=');
                    if (firstEqual <= 0)
                    {
                        // Empty segment or one without a name: nothing to add.
                        continue;
                    }

                    Cookie cookie = new Cookie();
                    cookie.Name = pair.Substring(0, firstEqual).Trim();
                    // Everything after the first '=' is the value, which may itself contain '='.
                    cookie.Value = pair.Substring(firstEqual + 1);
                    cookie.Path = "/";
                    cookie.Domain = domainCookies.Key;
                    cookieContainer.Add(cookie);
                }
            }

            return cookieContainer;
        }

        /// <summary>
        /// Builds a <c>name=value&amp;name=value</c> string from the input elements of a page
        /// that contains a form.
        /// </summary>
        /// <param name="htmlContent">HTML of the page.</param>
        /// <param name="urlEncode">
        /// When true, names and values are URL-encoded for use as an
        /// <c>application/x-www-form-urlencoded</c> body. Defaults to false, which emits
        /// the raw attribute text as before.
        /// </param>
        /// <returns>
        /// The post data, or an empty string when the page has no form or no usable inputs.
        /// Inputs without a <c>name</c> or without a <c>value</c> attribute are skipped.
        /// </returns>
        public static string BuildPostData(string htmlContent, bool urlEncode = false)
        {
            if (string.IsNullOrEmpty(htmlContent))
            {
                return string.Empty;
            }

            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(htmlContent);

            HtmlNode form = htmlDoc.DocumentNode.SelectSingleNode("//form");
            if (form == null)
            {
                return string.Empty;
            }

            // "//input" is an absolute path, so this selects every input in the document.
            // NOTE(review): kept on purpose - HtmlAgilityPack's default handling of <form>
            // may not nest inputs under the form node, so ".//input" could match nothing;
            // confirm against the HtmlAgilityPack version in use before narrowing it.
            HtmlNodeCollection inputs = form.SelectNodes("//input");
            if (inputs == null)
            {
                return string.Empty;
            }

            List<string> pairs = new List<string>();
            foreach (HtmlNode input in inputs)
            {
                HtmlAttribute nameAttribute = input.Attributes["name"];
                HtmlAttribute valueAttribute = input.Attributes["value"];
                if (nameAttribute == null || valueAttribute == null)
                {
                    continue;
                }

                string name = nameAttribute.Value;
                string value = valueAttribute.Value;
                if (urlEncode)
                {
                    name = WebUtility.UrlEncode(name);
                    value = WebUtility.UrlEncode(value);
                }

                pairs.Add(name + "=" + value);
            }

            return string.Join("&", pairs);
        }
    }
}
部分網站需要登錄的問題我已經著手通過另一個項目來解決(imitate-login),目前還有許多網頁使用了JavaScript或各種基於JS的框架來對網頁進行數據加載,如何來模擬執行JavaScript暫時還沒找到比較優美的解決方案,如果大家有什麼好的方案可以發給我,謝謝!
未經授權,拒絕任何全文及摘要轉載!