程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> .NET網頁編程 >> C# >> C#入門知識 >> c#寬度優先的網絡爬蟲

c#寬度優先的網絡爬蟲

編輯:C#入門知識

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Tool;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;

namespace Search
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and builds its controls via <c>InitializeComponent</c>.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }
        /// <summary>
        /// First-in, first-out queue holding the URLs that are waiting to be visited.
        /// </summary>
        public class Queue
        {
            // Backed by a linked list: items are appended at the tail and removed from the head.
            private LinkedList<string> queue = new LinkedList<string>();

            /// <summary>Appends <paramref name="t"/> to the tail of the queue.</summary>
            /// <param name="t">The item to enqueue.</param>
            public void enQueue(string t)
            {
                queue.AddLast(t);
            }

            /// <summary>Removes and returns the item at the head of the queue.</summary>
            /// <returns>The oldest queued item, or <c>null</c> when the queue is empty.</returns>
            public string deQueue()
            {
                // Guard against an empty list: First would be null and dereferencing it would throw.
                if (queue.Count == 0)
                {
                    return null;
                }

                // Take from the head (not the tail) so that items leave in insertion order.
                string head = queue.First.Value;
                queue.RemoveFirst();
                return head;
            }

            /// <summary>Reports whether the queue holds no items.</summary>
            /// <returns><c>true</c> when the queue is empty; otherwise <c>false</c>.</returns>
            public bool isQueueEmpty()
            {
                return queue.Count == 0;
            }

            /// <summary>Reports whether <paramref name="t"/> is currently queued.</summary>
            /// <param name="t">The item to look for.</param>
            /// <returns><c>true</c> when the item is present; otherwise <c>false</c>.</returns>
            public bool contians(string t)
            {
                return queue.Contains(t);
            }

            /// <summary>Gets the number of items currently queued.</summary>
            /// <returns>The item count.</returns>
            public int getcount()
            {
                return queue.Count;
            }
        }
        /// <summary>
        /// Static bookkeeping for the crawler: the set of URLs already visited and the
        /// queue of URLs still to visit. No synchronization is performed in this class.
        /// </summary>
        public class LinkQueue
        {
            // URLs that have already been visited.
            private static ISet<string> visitedUrl = new HashSet<string>();
            // URLs that are waiting to be visited.
            private static Queue unVisitedUrl = new Queue();

            /// <summary>Gets the queue of URLs that are waiting to be visited.</summary>
            /// <returns>The shared pending-URL queue.</returns>
            public static Queue getUnVisitedUrl()
            {
                return unVisitedUrl;
            }

            /// <summary>Records <paramref name="url"/> as visited.</summary>
            /// <param name="url">The URL that has been visited.</param>
            public static void addVisitedUrl(String url)
            {
                visitedUrl.Add(url);
            }

            /// <summary>Removes <paramref name="url"/> from the visited set.</summary>
            /// <param name="url">The URL to forget.</param>
            public static void removeVisitedUrl(String url)
            {
                visitedUrl.Remove(url);
            }

            /// <summary>Dequeues the next pending URL.</summary>
            /// <returns>Whatever the pending queue's <c>deQueue</c> returns, boxed as <see cref="Object"/>.</returns>
            public static Object unVisitedUrlDeQueue()
            {
                return unVisitedUrl.deQueue();
            }

            /// <summary>
            /// Queues <paramref name="url"/> for a visit unless it is blank, already visited,
            /// or already pending, so that each URL is visited at most once.
            /// </summary>
            /// <param name="url">The candidate URL; null or whitespace-only values are ignored.</param>
            public static void addUnvisitedUrl(String url)
            {
                if (string.IsNullOrWhiteSpace(url))
                {
                    return;
                }

                if (visitedUrl.Contains(url) || unVisitedUrl.contians(url))
                {
                    return;
                }

                unVisitedUrl.enQueue(url);
            }

            /// <summary>Gets the number of URLs visited so far.</summary>
            /// <returns>The size of the visited set.</returns>
            public static int getVisitedUrlNum()
            {
                return visitedUrl.Count;
            }

            /// <summary>Reports whether any URL is still pending.</summary>
            /// <returns><c>true</c> when the pending queue is empty; otherwise <c>false</c>.</returns>
            public static bool unVisitedUrlsEmpty()
            {
                return unVisitedUrl.isQueueEmpty();
            }
        }


        // Fixed-size buffer of 100 strings. Not referenced by any code visible in this part of the class.
        string[] urlarr=new string[100];
        /// <summary>
        /// Fetches the page named in <c>textBox1</c> (or a default URL when the box is empty),
        /// lists the links extracted from it in <c>richTextBox1</c>, then downloads every image
        /// referenced by an <c>img</c> tag into <c>d:\pic\</c> and shows the image count in <c>label1</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            zzHttp http = new zzHttp();
            CookieContainer cookie = new CookieContainer();
            string url = textBox1.Text != "" ? textBox1.Text : "http://image.baidu.com/";
            string content = http.SendDataByGET(url, "", ref cookie);

            string baseUri = Utility.GetBaseUri(url);
            string[] links = Parser.ExtractLinks(baseUri, content);

            // Build the listing once instead of re-assigning the control's Text for every link.
            StringBuilder listing = new StringBuilder();
            foreach (string link in links)
            {
                listing.Append(link).Append("\n");
            }
            richTextBox1.AppendText(listing.ToString());

            // Matches an <img ...> tag and captures its src attribute value in the "imgUrl" group.
            Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
            MatchCollection matches = regImg.Matches(content);

            Queue que = new Queue();
            foreach (Match match in matches)
            {
                que.enQueue(match.Groups["imgUrl"].Value);
            }

            // Drain the queue completely. Comparing a rising index against the shrinking
            // queue count would stop after roughly half of the images.
            int k = 0;
            while (!que.isQueueEmpty())
            {
                string picurl = que.deQueue();
                richTextBox1.AppendText(picurl + "\n");

                // The file name is the last path segment of the image URL.
                string[] s = picurl.Split('/');
                string picname = s[s.Length - 1];
                if (picname.Length == 0)
                {
                    // Empty src or a URL ending in '/': there is no file name to save under.
                    continue;
                }

                zzHttp.downfile(picurl, picname, @"d:\pic\");
                k++;
            }
            label1.Text = k + "張";
        }

        // Crawling continues while the number of visited URLs is less than or equal to this value.
        private const int MaxVisitedUrls = 1000;

        // File extensions of static resources that are not queued for crawling.
        private static readonly string[] SkippedExtensions = { ".css", ".js", ".gif", ".jpg", ".png", ".jpeg" };

        // Returns true when the link's path (query string and fragment ignored) ends in a skipped extension.
        private static bool IsStaticResource(string link)
        {
            int cut = link.IndexOfAny(new char[] { '?', '#' });
            string path = cut >= 0 ? link.Substring(0, cut) : link;
            foreach (string extension in SkippedExtensions)
            {
                if (path.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Crawl loop: seeds the pending queue with a start URL, then repeatedly dequeues a URL,
        /// downloads it, marks it visited, and queues the links extracted from the page.
        /// Progress is reported through <c>Add2Message</c> and each queued link through <c>AddMessage</c>.
        /// </summary>
        void search()
        {
            LinkQueue.addUnvisitedUrl("http://blog.csdn.net/zhujunxxxxx/");
            while (!LinkQueue.unVisitedUrlsEmpty()
                && LinkQueue.getVisitedUrlNum() <= MaxVisitedUrls)
            {
                // Take the next pending URL.
                String visitUrl = (String)LinkQueue.unVisitedUrlDeQueue();
                if (visitUrl == null)
                {
                    continue;
                }

                zzHttp downLoader = new zzHttp();
                CookieContainer cookie = new CookieContainer();
                // Download the page.
                string content = downLoader.SendDataByGET(visitUrl, "", ref cookie);
                // Record the URL as visited.
                LinkQueue.addVisitedUrl(visitUrl);
                // Extract the links contained in the downloaded page.
                string baseUri = Utility.GetBaseUri(visitUrl);
                string[] links = Parser.ExtractLinks(baseUri, content);

                Add2Message("已訪問數目:" + LinkQueue.getVisitedUrlNum() + ",count=" + LinkQueue.getUnVisitedUrl().getcount());

                // Queue the new links, skipping style sheets, scripts and images. The check is on
                // the file extension; a plain substring test would also drop pages such as ".jsp".
                foreach (string link in links)
                {
                    if (link == null || IsStaticResource(link))
                    {
                        continue;
                    }
                    LinkQueue.addUnvisitedUrl(link);
                    AddMessage(link);
                }
            }
        }

        /// <summary>
        /// Starts the crawl (<c>search</c>) on a separate thread so the UI thread is not blocked.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            Thread crawler = new Thread(search);
            // A background thread does not keep the process alive once the form is closed.
            crawler.IsBackground = true;
            crawler.Start();
        }

        // Delegate used to re-dispatch AddMessage through richTextBox1.Invoke.
        private delegate void InfoDelegate(string message);

        /// <summary>
        /// Appends <paramref name="message"/> plus a line break to <c>richTextBox1</c> and scrolls to the caret.
        /// When the control reports <c>InvokeRequired</c>, the call is re-dispatched through <c>Invoke</c>.
        /// </summary>
        /// <param name="message">The text to append.</param>
        public void AddMessage(string message)
        {
            if (!richTextBox1.InvokeRequired)
            {
                // Direct access is allowed: update the control in place.
                richTextBox1.AppendText(message + Environment.NewLine);
                richTextBox1.ScrollToCaret();
                return;
            }

            // Direct access is not allowed: call this method again through the control.
            InfoDelegate marshal = new InfoDelegate(AddMessage);
            object[] args = new object[] { message };
            richTextBox1.Invoke(marshal, args);
        }
        // Delegate used to re-dispatch Add2Message through label2.Invoke.
        private delegate void Info2Delegate(string message);

        /// <summary>
        /// Replaces the text of <c>label2</c> with <paramref name="message"/>.
        /// When the control reports <c>InvokeRequired</c>, the call is re-dispatched through <c>Invoke</c>.
        /// </summary>
        /// <param name="message">The text to display.</param>
        public void Add2Message(string message)
        {
            if (!label2.InvokeRequired)
            {
                // Direct access is allowed: update the label in place.
                label2.Text = message;
                return;
            }

            // Direct access is not allowed: call this method again through the control.
            Info2Delegate marshal = new Info2Delegate(Add2Message);
            object[] args = new object[] { message };
            label2.Invoke(marshal, args);
        }
    }
}

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved