本文實例為大家分享了基於C#實現網頁爬蟲的詳細代碼,供大家參考,具體內容如下
HTTP請求工具類:
功能:
1、獲取網頁Html
2、下載網絡圖片
using
System;
using
System.Collections.Generic;
using
System.IO;
using
System.Linq;
using
System.Net;
using
System.Text;
using
System.Threading.Tasks;
using
System.Windows.Forms;
namespace
Utils
{
/// <summary>
/// HTTP request helper: fetches the HTML of a page and downloads remote files to disk.
/// </summary>
public class HttpRequestUtil
{
    // User-Agent header sent with every request issued by this class.
    private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

    /// <summary>
    /// Requests <paramref name="url"/> and returns the response body decoded as UTF-8.
    /// </summary>
    /// <param name="url">Absolute HTTP or HTTPS address of the page.</param>
    /// <returns>The response body as text.</returns>
    /// <exception cref="ArgumentException"><paramref name="url"/> does not use an HTTP scheme.</exception>
    public static string GetPageHtml(string url)
    {
        HttpWebRequest request = CreateRequest(url);

        // Nothing is sent until GetResponse() is called; the request is a GET by default.
        // Disposing the response returns the underlying connection to the pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads the file at <paramref name="url"/> into the "download" folder next to the
    /// executable, named after the last path segment of the URL. Does nothing when a file
    /// with that name already exists.
    /// </summary>
    /// <param name="url">Absolute HTTP or HTTPS address of the file.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="url"/> does not use an HTTP scheme or does not end with a file name.
    /// </exception>
    public static void HttpDownloadFile(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        // Take the text after the last '/', then drop any backslash or drive component so
        // that a crafted URL cannot place the file outside the download folder.
        string fileName = Path.GetFileName(url.Substring(url.LastIndexOf('/') + 1));
        if (fileName.Length == 0)
        {
            throw new ArgumentException("The URL does not end with a file name.", "url");
        }

        string directory = Path.Combine(Application.StartupPath, "download");
        Directory.CreateDirectory(directory); // no-op when the folder already exists

        string filePathName = Path.Combine(directory, fileName);
        if (File.Exists(filePathName))
        {
            return;
        }

        HttpWebRequest request = CreateRequest(url);
        request.Proxy = null;

        // Write to a temporary name and rename on success, so an interrupted download never
        // leaves a truncated file that the File.Exists check above would later skip.
        string tempPathName = filePathName + ".part";

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        {
            // FileShare.None makes a concurrent download of the same file fail fast here,
            // before anything of ours exists that would need cleaning up.
            Stream fileStream = new FileStream(tempPathName, FileMode.Create, FileAccess.Write, FileShare.None);
            bool completed = false;
            try
            {
                using (fileStream)
                {
                    responseStream.CopyTo(fileStream);
                }

                File.Move(tempPathName, filePathName);
                completed = true;
            }
            finally
            {
                if (!completed)
                {
                    TryDelete(tempPathName);
                }
            }
        }
    }

    // Builds an HTTP request for the URL with the shared User-Agent header applied.
    private static HttpWebRequest CreateRequest(string url)
    {
        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            // WebRequest.Create returns a non-HTTP request type for schemes such as file:// or ftp://.
            throw new ArgumentException("Only HTTP and HTTPS URLs are supported.", "url");
        }

        request.UserAgent = UserAgent;
        return request;
    }

    // Best-effort removal of a partial download; the failure that triggered it keeps propagating.
    private static void TryDelete(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }
}
}
多線程爬取網頁代碼:
using
System;
using
System.Collections.Generic;
using
System.ComponentModel;
using
System.Data;
using
System.Drawing;
using
System.IO;
using
System.Linq;
using
System.Text;
using
System.Text.RegularExpressions;
using
System.Threading;
using
System.Threading.Tasks;
using
System.Windows.Forms;
using
Utils;
namespace
爬蟲
{
/// <summary>
/// Crawler window. The start button launches worker threads that walk the list pages of
/// tt.mop.com, open each linked profile page and download the images found there, while
/// labels show the page, profile and image counts and the download rate. The remaining
/// buttons stop, pause and resume the run.
/// </summary>
public partial class Form1 : Form
{
    private const int WorkerCount = 10;
    private const int PagesPerWorker = 10;
    private const int TotalPages = WorkerCount * PagesPerWorker;
    private const string SiteRoot = "http://tt.mop.com";

    // Group 2 captures the href of a profile link on a list page.
    private static readonly Regex ProfileLinkRegex = new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");

    // Group 1 captures the src of an image on a profile page.
    private static readonly Regex ProfileImageRegex = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

    // Worker threads of the most recent run, and the last one that was started.
    List<Thread> threadList = new List<Thread>();
    Thread thread = null;

    // Signaled while workers may proceed; reset while the crawl is paused.
    private readonly ManualResetEventSlim runGate = new ManualResetEventSlim(true);

    // State of the run in progress; only touched on the UI thread.
    private CrawlRun currentRun;

    /// <summary>
    /// Counters and cancellation for one crawl. Each run gets its own instance so that
    /// workers left over from a stopped run can never affect a newer one.
    /// </summary>
    private sealed class CrawlRun
    {
        public readonly CancellationTokenSource Cancellation = new CancellationTokenSource();
        public readonly DateTime StartedAtUtc = DateTime.UtcNow;

        // Updated only through Interlocked because every worker thread increments them.
        public int PagesDone;
        public int ProfilesDone;
        public int ImagesDone;
    }

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Start button: begins a new crawl with <see cref="WorkerCount"/> worker threads.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        StopCurrentRun();

        CrawlRun run = new CrawlRun();
        currentRun = run;
        threadList.Clear();

        button3.Enabled = true;
        button2.Enabled = true;
        button1.Enabled = false;
        // A new run always starts unpaused, so Resume must not be offered.
        button4.Enabled = false;
        lblPage.Text = "已完成頁數:0";

        for (int i = 1; i <= WorkerCount; i++)
        {
            // Copy the loop variable: the closure must see this iteration's value.
            int workerNumber = i;
            thread = new Thread(delegate() { CrawlWorker(run, workerNumber); });
            // Background threads cannot keep the process alive after the form closes,
            // even when one is still blocked inside an HTTP request.
            thread.IsBackground = true;
            thread.Start();
            threadList.Add(thread);
        }
    }

    /// <summary>
    /// Stop button: cancels the current crawl and resets the buttons.
    /// </summary>
    private void button2_Click(object sender, EventArgs e)
    {
        StopCurrentRun();

        button1.Enabled = true;
        button2.Enabled = false;
        button3.Enabled = false;
        button4.Enabled = false;
    }

    /// <summary>
    /// Cancels the crawl when the window is closing.
    /// </summary>
    private void Form1_FormClosing(object sender, FormClosingEventArgs e)
    {
        StopCurrentRun();
    }

    /// <summary>
    /// Pause button: workers block at their next checkpoint until resumed or stopped.
    /// </summary>
    private void button3_Click(object sender, EventArgs e)
    {
        runGate.Reset();

        button3.Enabled = false;
        button4.Enabled = true;
    }

    /// <summary>
    /// Resume button: releases workers blocked by a pause.
    /// </summary>
    private void button4_Click(object sender, EventArgs e)
    {
        runGate.Set();

        button3.Enabled = true;
        button4.Enabled = false;
    }

    // Requests cancellation of the run in progress (if any) and clears any pause.
    private void StopCurrentRun()
    {
        if (currentRun != null)
        {
            currentRun.Cancellation.Cancel();
            currentRun = null;
        }

        runGate.Set();
    }

    // Thread body: crawls the list pages assigned to one worker (1-based workerNumber).
    private void CrawlWorker(CrawlRun run, int workerNumber)
    {
        try
        {
            for (int j = 1; j <= PagesPerWorker; j++)
            {
                Checkpoint(run);

                // Local to this worker, so no other thread can change it mid-request.
                int pageIndex = (workerNumber - 1) * PagesPerWorker + j;
                try
                {
                    CrawlListPage(run, pageIndex);
                }
                catch (OperationCanceledException)
                {
                    throw;
                }
                catch (Exception ex)
                {
                    // Best effort: a page that fails is skipped but still counted as processed.
                    System.Diagnostics.Trace.WriteLine("Page " + pageIndex + " failed: " + ex.Message);
                }

                ReportPageDone(run);
            }
        }
        catch (OperationCanceledException)
        {
            // Stop was requested; leave quietly.
        }
    }

    // Fetches one list page and visits every site-relative profile link on it.
    private void CrawlListPage(CrawlRun run, int pageIndex)
    {
        string pageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com/c44/0/1_" + pageIndex.ToString() + ".Html");

        foreach (Match profileMatch in ProfileLinkRegex.Matches(pageHtml))
        {
            string profilePath = profileMatch.Groups[2].Value;
            if (!profilePath.StartsWith("/", StringComparison.Ordinal))
            {
                continue;
            }

            Checkpoint(run);
            string profileHtml = HttpRequestUtil.GetPageHtml(SiteRoot + profilePath);

            Interlocked.Increment(ref run.ProfilesDone);
            RunOnUi(run, delegate()
            {
                lblPerson.Text = "已完成條數:" + ReadCounter(ref run.ProfilesDone).ToString();
            });

            DownloadProfileImages(run, profileHtml);
        }
    }

    // Downloads every image on a profile page whose address starts with "http://i1".
    private void DownloadProfileImages(CrawlRun run, string profileHtml)
    {
        foreach (Match imageMatch in ProfileImageRegex.Matches(profileHtml))
        {
            string imgUrl = imageMatch.Groups[1].Value;
            if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
            {
                continue;
            }

            Checkpoint(run);
            try
            {
                HttpRequestUtil.HttpDownloadFile(imgUrl);

                Interlocked.Increment(ref run.ImagesDone);
                RunOnUi(run, delegate()
                {
                    int imagesDone = ReadCounter(ref run.ImagesDone);
                    lblNum.Text = "已下載圖片數" + imagesDone.ToString();

                    double time = DateTime.UtcNow.Subtract(run.StartedAtUtc).TotalSeconds;
                    if (time > 0)
                    {
                        lblSpeed.Text = "速度:" + (imagesDone / time).ToString("0.0") + "張/秒";
                    }
                });
            }
            catch (Exception ex)
            {
                // Best effort: one failed image must not end the crawl of this page.
                System.Diagnostics.Trace.WriteLine("Download of " + imgUrl + " failed: " + ex.Message);
            }

            Thread.Sleep(1);
        }
    }

    // Counts a processed list page and announces completion when the last one is done.
    private void ReportPageDone(CrawlRun run)
    {
        // The value returned by the atomic increment is unique per call, so exactly one
        // worker observes TotalPages.
        int pagesDone = Interlocked.Increment(ref run.PagesDone);

        RunOnUi(run, delegate()
        {
            lblPage.Text = "已完成頁數:" + ReadCounter(ref run.PagesDone).ToString();
        });

        if (pagesDone == TotalPages)
        {
            RunOnUi(run, delegate() { button1.Enabled = true; });
            MessageBox.Show("完成!");
        }
    }

    // Blocks while the crawl is paused; throws OperationCanceledException once it is stopped.
    private void Checkpoint(CrawlRun run)
    {
        runGate.Wait(run.Cancellation.Token);
        run.Cancellation.Token.ThrowIfCancellationRequested();
    }

    // Runs a label/button update on the UI thread, unless the run was stopped or the form is gone.
    private void RunOnUi(CrawlRun run, Action update)
    {
        if (run.Cancellation.IsCancellationRequested || IsDisposed || !IsHandleCreated)
        {
            return;
        }

        try
        {
            Invoke(update);
        }
        catch (ObjectDisposedException)
        {
            // The form closed between the check above and the call.
        }
        catch (InvalidOperationException)
        {
            // The window handle was destroyed between the check above and the call.
        }
    }

    // Reads a counter with the same ordering guarantees as the Interlocked writes to it.
    private static int ReadCounter(ref int counter)
    {
        return Interlocked.CompareExchange(ref counter, 0, 0);
    }
}
}
截圖:
以上就是本文的全部內容,希望對大家的學習有所幫助。