// Sample WinForms form with eight buttons, each demonstrating one way of fetching a
// web page (the code under buttons 5 and 6 may need some debugging). It covers:
//   - basic page fetching, without paginated data;
//   - paginated data where the "next page" link is a plain URL;
//   - paginated data where the "next page" link is a __doPostBack call;
//   - paginated data where the "next page" element is a .gif image; its href can be
//     found with the browser's F12 tools.
//
// References:
//   http://www.cnblogs.com/ceachy/articles/CSharp_Retrive_Page_Document.html
//   http://www.cnblogs.com/ghfsusan/archive/2010/05/26/1744820.html
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;

namespace WindowsFormsApplication1
{
    /// <summary>
    /// Demo form: every button click handler downloads a page with a different
    /// technique (HttpWebRequest, WebClient, WebBrowser, or a form POST) and shows
    /// or stores the result.
    /// </summary>
    public partial class Form1 : Form
    {
        // Forum thread list used by buttons 2, 4 and 5.
        private const string BbsThreadListUrl = "http://bbs.cup.edu.cn/cupbbs/ThreadList.aspx?fid=51";

        // Road-condition index page used by buttons 3 and 6.
        private const string HighwayIndexUrl = "http://www.chinahighway.gov.cn/html/staticHtml/front/index_lkcx.html";

        // Control id posted as __EVENTTARGET by buttons 5 and 6. The original sample
        // notes that the trailing ctl index can be varied (ctl1, ctl2, ctl3, ...) in a loop.
        private const string PostBackEventTarget = "grdThreadList:_ctl27:_ctl3";

        // Holds the __VIEWSTATE value returned by GetPostValue so that the next click
        // on button 6 can post it back.
        private static string current__viewstate = "";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the HTML source of <paramref name="Url"/> and decodes it as GB2312.
        /// </summary>
        /// <param name="Url">Address of the page to download.</param>
        /// <returns>
        /// The page source, or an empty string when the download fails (a message box
        /// is shown in that case).
        /// </returns>
        private string GetWebContent(string Url)
        {
            string strResult = "";
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Timeout = 30000; // connection timeout in milliseconds
                request.Headers.Set("Pragma", "no-cache");
                Encoding encoding = Encoding.GetEncoding("GB2312");

                // Dispose response, stream and reader even when reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream streamReceive = response.GetResponseStream())
                using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
                {
                    strResult = streamReader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best effort by design: report the failure and return an empty string.
                Console.WriteLine(ex.Message);
                MessageBox.Show("出錯");
            }
            return strResult;
        }

        /// <summary>
        /// Returns the first &lt;table&gt;...&lt;/table&gt; fragment that follows the chart
        /// title inside the page body, or null when any of the markers is missing.
        /// </summary>
        private static string ExtractChartTable(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            int bodyStart = html.IndexOf("<body", StringComparison.Ordinal);
            if (bodyStart < 0)
            {
                return null;
            }

            int titleStart = html.IndexOf("歌曲TOP500", bodyStart, StringComparison.Ordinal);
            if (titleStart < 0)
            {
                return null;
            }

            int tableStart = html.IndexOf("<table", titleStart, StringComparison.Ordinal);
            if (tableStart < 0)
            {
                return null;
            }

            int tableEnd = html.IndexOf("</table>", tableStart, StringComparison.Ordinal);
            if (tableEnd < 0)
            {
                return null;
            }

            // Include the closing tag itself in the fragment.
            return html.Substring(tableStart, tableEnd - tableStart + "</table>".Length);
        }

        /// <summary>
        /// Button 1: downloads the chart page, shows its source in richTextBox1 and
        /// walks the rows of the chart table.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // URL of the page to fetch.
            string Url = "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1#top2";

            string strWebContent = GetWebContent(Url);
            richTextBox1.Text = strWebContent;

            // Cut out the part of the source that holds the data.
            string strWeb = ExtractChartTable(strWebContent);
            if (strWeb == null)
            {
                // Download failed or the page layout changed; nothing to parse.
                return;
            }

            // Load the fragment into an HtmlDocument so it can be queried by tag name.
            using (WebBrowser webb = new WebBrowser())
            {
                webb.Navigate("about:blank");
                if (webb.Document == null)
                {
                    return;
                }

                HtmlDocument htmldoc = webb.Document.OpenNew(true);
                htmldoc.Write(strWeb);

                foreach (HtmlElement tr in htmldoc.GetElementsByTagName("TR"))
                {
                    HtmlElementCollection cells = tr.GetElementsByTagName("TD");
                    if (cells.Count < 2)
                    {
                        // Rows without at least two TD cells carry no id/name pair.
                        continue;
                    }

                    string strID = (cells[0].InnerText ?? "").Replace(".", "");
                    string strName = cells[1].InnerText;
                    string strSinger = cells[1].InnerText;

                    // NOTE(review): name and singer are both read from the second cell.
                    // The disabled code in the original sample split that cell with a
                    // SplitName helper and stored each row through AddLine, and also
                    // read cell pairs 2/3 and 4/5; neither helper exists in this file,
                    // so the parsed values are currently not stored anywhere.
                }
            }

            // Saving to a database / binding to dataGridView1 was disabled in the
            // original sample (InsertData(dt); dataGridView1.DataSource = dt.DefaultView).
        }

        /// <summary>
        /// Downloads <paramref name="url"/> with WebClient, decodes the bytes with
        /// <paramref name="encoding"/>, echoes the text to the console and overwrites
        /// <paramref name="outputPath"/> with it. A WebException is reported to the
        /// console instead of being propagated.
        /// </summary>
        private static void DownloadAndSave(string url, Encoding encoding, string outputPath)
        {
            try
            {
                using (WebClient client = new WebClient())
                {
                    // Credentials used to authenticate the request.
                    client.Credentials = CredentialCache.DefaultCredentials;

                    byte[] pageData = client.DownloadData(url);
                    string pageHtml = encoding.GetString(pageData);
                    Console.WriteLine(pageHtml);

                    using (StreamWriter sw = new StreamWriter(outputPath))
                    {
                        sw.Write(pageHtml);
                    }
                }
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message);
            }

            // The original sample called Console.ReadLine() here to pause a console
            // window; inside a click handler that blocks the UI thread whenever a
            // console is attached, so it is intentionally not done.
        }

        /// <summary>
        /// Button 2: downloads the forum thread list as UTF-8 and saves it to disk.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            DownloadAndSave(BbsThreadListUrl, Encoding.UTF8, "C:\\Users\\yuan\\Desktop\\ouput.html");
        }

        /// <summary>
        /// DocumentCompleted handler for button 3: appends the text of every table in
        /// the loaded document to a file.
        /// </summary>
        private void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WebBrowser web = (WebBrowser)sender;
            if (web.Document == null)
            {
                return;
            }

            HtmlElementCollection ElementCollection = web.Document.GetElementsByTagName("Table");
            foreach (HtmlElement item in ElementCollection)
            {
                File.AppendAllText("C:\\Users\\yuan\\Desktop\\ouputbutton3.txt", item.InnerText);
            }
        }

        /// <summary>
        /// Button 3: loads the road-condition page in a WebBrowser control and lets
        /// web_DocumentCompleted dump its tables.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            WebBrowser web = new WebBrowser();

            // Subscribe before navigating so the completion event cannot be missed.
            web.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(web_DocumentCompleted);
            web.Navigate(HighwayIndexUrl);
        }

        /// <summary>
        /// Button 4: downloads the forum thread list with HttpWebRequest using a
        /// browser-like User-Agent, decodes it as gb2312 and appends it to a file.
        /// </summary>
        private void button4_Click(object sender, EventArgs e)
        {
            try
            {
                HttpWebRequest _WebRequest = (HttpWebRequest)WebRequest.Create(BbsThreadListUrl);
                _WebRequest.UserAgent = "MOZILLA/4.0 (COMPATIBLE; MSIE 7.0; WINDOWS NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)";
                _WebRequest.Method = "Get";

                string _StrResponse;
                using (WebResponse _WebResponse = _WebRequest.GetResponse())
                using (StreamReader _ResponseStream = new StreamReader(_WebResponse.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
                {
                    _StrResponse = _ResponseStream.ReadToEnd();
                }

                File.AppendAllText("C:\\Users\\yuan\\Desktop\\ouputbutton4.txt", _StrResponse);
            }
            catch (WebException webEx)
            {
                // Same reporting style as the other download handlers.
                Console.WriteLine(webEx.Message);
            }
        }

        /// <summary>
        /// Posts an ASP.NET-style postback form (__VIEWSTATE + __EVENTTARGET) to
        /// <paramref name="url"/> and returns the response decoded as UTF-8.
        /// __EVENTVALIDATION and __EVENTARGUMENT are not sent; they were disabled in
        /// the original sample.
        /// </summary>
        /// <param name="url">Page that receives the POST.</param>
        /// <param name="viewState">Value posted as __VIEWSTATE.</param>
        private static string PostBack(string url, string viewState)
        {
            using (WebClient client = new WebClient())
            {
                System.Collections.Specialized.NameValueCollection postVars = new System.Collections.Specialized.NameValueCollection();
                postVars.Add("__VIEWSTATE", viewState);
                postVars.Add("__EVENTTARGET", PostBackEventTarget);

                // The original added a header literally named "ContentType", which is
                // not the Content-Type header; set the real one.
                client.Headers[HttpRequestHeader.ContentType] = "application/x-www-form-urlencoded";

                byte[] responseBytes = client.UploadValues(url, "POST", postVars);
                return Encoding.UTF8.GetString(responseBytes);
            }
        }

        /// <summary>
        /// Button 5: posts a hard-coded (truncated) __VIEWSTATE to the forum thread
        /// list and appends the returned HTML to a file.
        /// </summary>
        private void button5_Click(object sender, EventArgs e)
        {
            try
            {
                // The view state literal was shortened by the original author
                // (the trailing text says it was omitted for length).
                string ResponseStr = PostBack(
                    BbsThreadListUrl,
                    "dDwxNDMyMjQ3NTY4O3Q8O2w8aTwxPjs+O2w8dDw7bDxpPDM+O2k8Nz47PjtsPHQ8O2w8aTwwPjtpPDE+O2k8Mj47aTwzPjtpP........省略了,太長了.......");

                File.AppendAllText("C:\\Users\\yuan\\Desktop\\ouputbutton5.txt", ResponseStr);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

        /// <summary>
        /// Extracts the __VIEWSTATE value from a page so it can be posted back for the
        /// next page. Placeholder: parsing was omitted in the original sample, so this
        /// always returns an empty string.
        /// </summary>
        /// <param name="ResponseStr">HTML of the page that was just fetched.</param>
        private string GetPostValue(string ResponseStr)
        {
            return "";
        }

        /// <summary>
        /// Button 6: posts the view state remembered from the previous click, stores
        /// the view state of the new response for the next click, and appends the
        /// returned HTML to a file.
        /// </summary>
        private void button6_Click(object sender, EventArgs e)
        {
            try
            {
                string ResponseStr = PostBack(HighwayIndexUrl, current__viewstate);

                // Remember the values needed to request the following page.
                current__viewstate = GetPostValue(ResponseStr);

                File.AppendAllText("C:\\Users\\yuan\\Desktop\\ouputbutton6.txt", ResponseStr);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

        /// <summary>
        /// Button 7: downloads page 2 of an article whose "next page" link is a plain
        /// URL, decodes it as GB2312 and saves it to disk.
        /// </summary>
        private void button7_Click(object sender, EventArgs e)
        {
            // The original used Encoding.Default with a comment saying it is meant for
            // GB2312 pages; name the encoding explicitly so the result does not depend
            // on the machine's ANSI code page.
            DownloadAndSave(
                "http://gb.cri.cn/42071/2013/03/05/3245s4038640_2.htm",
                Encoding.GetEncoding("GB2312"),
                "C:\\Users\\yuan\\Desktop\\ouput7.html");
        }

        /// <summary>
        /// Button 8: fetches one page of paginated data. When the "next page" element
        /// is a .gif image and the status bar shows only part of the (long) address,
        /// the full href can be taken from "View source" or the F12 tools; the URL
        /// below was obtained that way.
        /// </summary>
        private void button8_Click(object sender, EventArgs e)
        {
            DownloadAndSave(
                "http://www.chinahighway.gov.cn/roadInfo/queryRoadInfo.do?queryType=map&startDate=&cantonName=&cantonCode=&infoType=3&endDate=&startPlanDate=&_page_size=50&roadName=&mapList=-1&endRealDate=&roadCode=&provinceList=-1&endPlanDate=&startRealDate=&page=3",
                Encoding.GetEncoding("GB2312"),
                "C:\\Users\\yuan\\Desktop\\ouput8.txt");
        }
    }
}