程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> .NET網頁編程 >> C# >> C#入門知識 >> dotNet使用HttpWebRequest模擬浏覽器,模擬手機浏覽器

dotNet使用HttpWebRequest模擬瀏覽器,模擬手機瀏覽器

編輯:C#入門知識

dotNet使用HttpWebRequest模擬瀏覽器,模擬手機瀏覽器


在編寫網絡爬蟲時,HttpWebRequest幾乎可以完成絕大多數網站的抓取。為了更好地使用這一技術,我將常用的幾個功能進行了封裝,以方便調用。這個類已經在多個項目中得到使用,主要解決了Cookies相關的一些問題;如果有其它方面的問題,歡迎提出來,我會進一步完善。

目前HttpHelper包含了以下幾個方面:

  • GetHttpContent:通過Get或Post來獲取網頁的Html
  • SetCookie:根據response中頭部的set-cookie對cookie進行設置,能識別httponly
  • GetAllCookies:將CookieContainer轉換為鍵值對,方便存儲和跨程序間調用
  • ConvertToCookieContainer:將鍵值對轉換回CookieContainer供程序調用
  • BuildPostData:通過一個需要post的html構建出postdata

代碼如下:

  1 using System;
  2 using System.Collections.Generic;
  3 using System.Collections.Specialized;
  4 using System.IO;
  5 using System.IO.Compression;
  6 using System.Linq;
  7 using System.Net;
  8 using System.Net.Security;
  9 using System.Security.Cryptography.X509Certificates;
 10 using System.Text;
 11 using System.Text.RegularExpressions;
 12 using System.Collections;
 13 using HtmlAgilityPack;
 14 
 15 namespace TNIdea.Common.Helper
 16 {
 17     public class HttpHelper
 18     {
 19         public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""'>]+)""?)|(xml.*?encoding=""?(?<Charset>[^\s"">]+)""?)";
 20 
 21         /// <summary>
 22         /// 獲取網頁的內容
 23         /// </summary>
 24         /// <param name="url">Url</param>
 25         /// <param name="postData">Post的信息</param>
 26         /// <param name="cookies">Cookies</param>
 27         /// <param name="userAgent">浏覽器標識</param>
 28         /// <param name="referer">來源頁</param>
 29         /// <param name="cookiesDomain">Cookies的Domian參數,配合cookies使用;為空則取url的Host</param>
 30         /// <param name="encode">編碼方式,用於解析html</param>
 31         /// <returns></returns>
 32         public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
 33         {
 34             try
 35             {
 36                 HttpWebResponse httpResponse = null;
 37                 if (!string.IsNullOrWhiteSpace(postData))
 38                     httpResponse = CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);
 39                 else
 40                     httpResponse = CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer);
 41 
 42                 #region 根據Html頭判斷
 43                 string Content = null;
 44                 //緩沖區長度
 45                 const int N_CacheLength = 10000;
 46                 //頭部預讀取緩沖區,字節形式
 47                 var bytes = new List<byte>();
 48                 int count = 0;
 49                 //頭部預讀取緩沖區,字符串
 50                 String cache = string.Empty;
 51 
 52                 //創建流對象並解碼
 53                 Stream ResponseStream;
 54                 switch (httpResponse.ContentEncoding.ToUpperInvariant())
 55                 {
 56                     case "GZIP":
 57                         ResponseStream = new GZipStream(
 58                             httpResponse.GetResponseStream(), CompressionMode.Decompress);
 59                         break;
 60                     case "DEFLATE":
 61                         ResponseStream = new DeflateStream(
 62                             httpResponse.GetResponseStream(), CompressionMode.Decompress);
 63                         break;
 64                     default:
 65                         ResponseStream = httpResponse.GetResponseStream();
 66                         break;
 67                 }
 68 
 69                 try
 70                 {
 71                     while (
 72                         !(cache.EndsWith("</head>", StringComparison.OrdinalIgnoreCase)
 73                           || count >= N_CacheLength))
 74                     {
 75                         var b = (byte)ResponseStream.ReadByte();
 76                         if (b < 0) //end of stream
 77                         {
 78                             break;
 79                         }
 80                         bytes.Add(b);
 81 
 82                         count++;
 83                         cache += (char)b;
 84                     }
 85 
 86 
 87                     if (encode == null)
 88                     {
 89                         try
 90                         {
 91                             if (httpResponse.CharacterSet == "ISO-8859-1" || httpResponse.CharacterSet == "zh-cn")
 92                             {
 93                                 Match match = Regex.Match(cache, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
 94                                 if (match.Success)
 95                                 {
 96                                     try
 97                                     {
 98                                         string charset = match.Groups["Charset"].Value;
 99                                         encode = Encoding.GetEncoding(charset);
100                                     }
101                                     catch { }
102                                 }
103                                 else
104                                     encode = Encoding.GetEncoding("GB2312");
105                             }
106                             else
107                                 encode = Encoding.GetEncoding(httpResponse.CharacterSet);
108                         }
109                         catch { }
110                     }
111 
112                     //緩沖字節重新編碼,然後再把流讀完
113                     var Reader = new StreamReader(ResponseStream, encode);
114                     Content = encode.GetString(bytes.ToArray(), 0, count) + Reader.ReadToEnd();
115                     Reader.Close();
116                 }
117                 catch (Exception ex)
118                 {
119                     return ex.ToString();
120                 }
121                 finally
122                 {
123                     httpResponse.Close();
124                 }
125                 #endregion 根據Html頭判斷
126 
127                 //獲取返回的Cookies,支持httponly
128                 if (string.IsNullOrWhiteSpace(cookiesDomain))
129                     cookiesDomain = httpResponse.ResponseUri.Host;
130 
131                 cookies = new CookieContainer();
132                 CookieCollection httpHeaderCookies = SetCookie(httpResponse, cookiesDomain);
133                 cookies.Add(httpHeaderCookies ?? httpResponse.Cookies);
134 
135                 return Content;
136             }
137             catch
138             {
139                 return string.Empty;
140             }
141         }
142 
143 
144         /// <summary>
145         /// 創建GET方式的HTTP請求 
146         /// </summary>
147         /// <param name="url"></param>
148         /// <param name="timeout"></param>
149         /// <param name="userAgent"></param>
150         /// <param name="cookies"></param>
151         /// <param name="referer"></param>
152         /// <returns></returns>
153         public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
154         {
155             HttpWebRequest request = null;
156             if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
157             {
158                 //對服務端證書進行有效性校驗(非第三方權威機構頒發的證書,如自己生成的,不進行驗證,這裡返回true)
159                 ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
160                 request = WebRequest.Create(url) as HttpWebRequest;
161                 //request.ProtocolVersion = HttpVersion.Version10;    //http版本,默認是1.1,這裡設置為1.0
162             }
163             else
164             {
165                 request = WebRequest.Create(url) as HttpWebRequest;
166             }
167 
168             request.Referer = referer;
169             request.Method = "GET";
170 
171             //設置代理UserAgent和超時
172             if (string.IsNullOrWhiteSpace(userAgent))
173                 userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36";
174 
175             request.UserAgent = userAgent;
176             request.Timeout = timeout;
177             request.KeepAlive = true;
178             request.AllowAutoRedirect = true;
179 
180             if (cookies == null)
181                 cookies = new CookieContainer();
182             request.CookieContainer = cookies;
183 
184             return request.GetResponse() as HttpWebResponse;
185         }
186 
187         /// <summary>
188         /// 創建POST方式的HTTP請求
189         /// </summary>
190         /// <param name="url"></param>
191         /// <param name="postData"></param>
192         /// <param name="timeout"></param>
193         /// <param name="userAgent"></param>
194         /// <param name="cookies"></param>
195         /// <param name="referer"></param>
196         /// <returns></returns>
197         public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
198         {
199             HttpWebRequest request = null;
200             //如果是發送HTTPS請求  
201             if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
202             {
203                 ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
204                 request = WebRequest.Create(url) as HttpWebRequest;
205                 //request.ProtocolVersion = HttpVersion.Version10;
206             }
207             else
208             {
209                 request = WebRequest.Create(url) as HttpWebRequest;
210             }
211             request.Referer = referer;
212             request.Method = "POST";
213             request.ContentType = "application/x-www-form-urlencoded";
214 
215             //設置代理UserAgent和超時
216             if (string.IsNullOrWhiteSpace(userAgent))
217                 request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";
218             else
219                 request.UserAgent = userAgent;
220             request.Timeout = timeout;
221             request.KeepAlive = true;
222             request.AllowAutoRedirect = true;
223 
224             if (cookies == null)
225                 cookies = new CookieContainer();
226             request.CookieContainer = cookies;
227 
228             //發送POST數據  
229             if (!string.IsNullOrWhiteSpace(postData))
230             {
231                 byte[] data = Encoding.UTF8.GetBytes(postData);
232                 request.ContentLength = data.Length;
233                 using (Stream stream = request.GetRequestStream())
234                 {
235                     stream.Write(data, 0, data.Length);
236                 }
237             }
238             //string[] values = request.Headers.GetValues("Content-Type");
239             return request.GetResponse() as HttpWebResponse;
240         }
241 
242         /// <summary>
243         /// 驗證證書
244         /// </summary>
245         /// <param name="sender"></param>
246         /// <param name="certificate"></param>
247         /// <param name="chain"></param>
248         /// <param name="errors"></param>
249         /// <returns>是否驗證通過</returns>
250         private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
251         {
252             if (errors == SslPolicyErrors.None)
253                 return true;
254             return false;
255         }
256 
257         /// <summary>
258         /// 根據response中頭部的set-cookie對request中的cookie進行設置
259         /// </summary>
260         /// <param name="setCookie">The set cookie.</param>
261         /// <param name="defaultDomain">The default domain.</param>
262         /// <returns></returns>
263         private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)
264         {
265             try
266             {
267                 string[] setCookie = response.Headers.GetValues("Set-Cookie");
268 
269                 // there is bug in it,the datetime in "set-cookie" will be sepreated in two pieces.
270                 List<string> a = new List<string>(setCookie);
271                 for (int i = setCookie.Length - 1; i > 0; i--)
272                 {
273                     if (a[i].Substring(a[i].Length - 3) == "GMT")
274                     {
275                         a[i - 1] = a[i - 1] + ", " + a[i];
276                         a.RemoveAt(i);
277                         i--;
278                     }
279                 }
280                 setCookie = a.ToArray<string>();
281                 CookieCollection cookies = new CookieCollection();
282                 foreach (string str in setCookie)
283                 {
284                     NameValueCollection hs = new NameValueCollection();
285                     foreach (string i in str.Split(';'))
286                     {
287                         int index = i.IndexOf("=");
288                         if (index > 0)
289                             hs.Add(i.Substring(0, index).Trim(), i.Substring(index + 1).Trim());
290                         else
291                             switch (i)
292                             {
293                                 case "HttpOnly":
294                                     hs.Add("HttpOnly", "True");
295                                     break;
296                                 case "Secure":
297                                     hs.Add("Secure", "True");
298                                     break;
299                             }
300                     }
301                     Cookie ck = new Cookie();
302                     foreach (string Key in hs.AllKeys)
303                     {
304                         switch (Key.ToLower().Trim())
305                         {
306                             case "path":
307                                 ck.Path = hs[Key];
308                                 break;
309                             case "expires":
310                                 ck.Expires = DateTime.Parse(hs[Key]);
311                                 break;
312                             case "domain":
313                                 ck.Domain = hs[Key];
314                                 break;
315                             case "httpOnly":
316                                 ck.HttpOnly = true;
317                                 break;
318                             case "secure":
319                                 ck.Secure = true;
320                                 break;
321                             default:
322                                 ck.Name = Key;
323                                 ck.Value = hs[Key];
324                                 break;
325                         }
326                     }
327                     if (ck.Domain == "") ck.Domain = defaultDomain;
328                     if (ck.Name != "") cookies.Add(ck);
329                 }
330                 return cookies;
331             }
332             catch
333             {
334                 return null;
335             }
336         }
337 
338         /// <summary>
339         /// 遍歷CookieContainer
340         /// </summary>
341         /// <param name="cookieContainer"></param>
342         /// <returns>List of cookie</returns>
343         public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)
344         {
345             Dictionary<string, string> cookies = new Dictionary<string, string>();
346 
347             Hashtable table = (Hashtable)cookieContainer.GetType().InvokeMember("m_domainTable",
348                 System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField |
349                 System.Reflection.BindingFlags.Instance, null, cookieContainer, new object[] { });
350 
351             foreach (string pathList in table.Keys)
352             {
353                 StringBuilder _cookie = new StringBuilder();
354                 SortedList cookieColList = (SortedList)table[pathList].GetType().InvokeMember("m_list",
355                     System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField
356                     | System.Reflection.BindingFlags.Instance, null, table[pathList], new object[] { });
357                 foreach (CookieCollection colCookies in cookieColList.Values)
358                     foreach (Cookie c in colCookies)
359                         _cookie.Append(c.Name + "=" + c.Value + ";");
360 
361                 cookies.Add(pathList, _cookie.ToString().TrimEnd(';'));
362             }
363             return cookies;
364         }
365 
366         /// <summary>
367         /// convert cookies string to CookieContainer
368         /// </summary>
369         /// <param name="cookies"></param>
370         /// <returns></returns>
371         public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)
372         {
373             CookieContainer cookieContainer = new CookieContainer();
374 
375             foreach (var cookie in cookies)
376             {
377                 string[] strEachCookParts = cookie.Value.Split(';');
378                 int intEachCookPartsCount = strEachCookParts.Length;
379 
380                 foreach (string strCNameAndCValue in strEachCookParts)
381                 {
382                     if (!string.IsNullOrEmpty(strCNameAndCValue))
383                     {
384                         Cookie cookTemp = new Cookie();
385                         int firstEqual = strCNameAndCValue.IndexOf("=");
386                         string firstName = strCNameAndCValue.Substring(0, firstEqual);
387                         string allValue = strCNameAndCValue.Substring(firstEqual + 1, strCNameAndCValue.Length - (firstEqual + 1));
388                         cookTemp.Name = firstName;
389                         cookTemp.Value = allValue;
390                         cookTemp.Path = "/";
391                         cookTemp.Domain = cookie.Key;
392                         cookieContainer.Add(cookTemp);
393                     }
394                 }
395             }
396             return cookieContainer;
397         }
398 
399         public static string BuildPostData(string htmlContent)
400         {
401             HtmlDocument htmlDoc = new HtmlDocument();
402             htmlDoc.LoadHtml(htmlContent);
403             //Get the form node collection.
404             HtmlNode htmlNode = htmlDoc.DocumentNode.SelectSingleNode("//form");
405             HtmlNodeCollection htmlInputs = htmlNode.SelectNodes("//input");
406 
407             StringBuilder postData = new StringBuilder();
408 
409             foreach (HtmlNode input in htmlInputs)
410             {
411                 if(input.Attributes["value"] != null)
412                     postData.Append(input.Attributes["name"].Value + "=" + input.Attributes["value"].Value + "&");
413             }
414             return postData.ToString().TrimEnd('&');
415         }
416     }
417 }

部分網站需要登錄的問題我已經著手通過另一個項目來解決(imitate-login),目前還有許多網頁使用了JavaScript或各種基於JS的框架來對網頁進行數據加載,如何來模擬執行JavaScript暫時還沒找到比較優美的解決方案,如果大家有什麼好的方案可以發給我,謝謝!

 未經授權,拒絕任何全文及摘要轉載!

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved