C#實現網頁爬蟲

HTTP請求工具類(功能:一、獲取網頁html;二、下載網絡圖片):

using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP request helpers: (1) fetch the HTML of a page, (2) download a remote image.
    /// </summary>
    public class HttpRequestUtil
    {
        // User-Agent header sent with every request issued by this class.
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Requests <paramref name="url"/> and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS address of the page.</param>
        /// <returns>The response body as text.</returns>
        /// <exception cref="InvalidCastException">The URL does not produce an HTTP request.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageHtml(string url)
        {
            // Direct casts instead of "as": a non-HTTP URL fails here with a clear
            // error rather than with a NullReferenceException on the next line.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = UserAgent;

            // GetResponse() is the call that actually sends the (GET) request.
            // The response must be disposed, otherwise its connection is never
            // handed back for reuse.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the image at <paramref name="url"/> into the "download" folder under
        /// the application start-up path, but only if the decoded image is at least
        /// <paramref name="minWidth"/> x <paramref name="minHeight"/> pixels.
        /// Does nothing when a file with the same name already exists.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS address of the image.</param>
        /// <param name="minWidth">Minimum accepted width in pixels.</param>
        /// <param name="minHeight">Minimum accepted height in pixels.</param>
        /// <exception cref="ArgumentException">
        /// The URL has no file name after its last '/', or the data is not a valid image.
        /// </exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static void HttpDownloadFile(string url, int minWidth, int minHeight)
        {
            // Drop any query string / fragment first: '?' is not a legal file-name
            // character and neither part belongs to the file name.
            int cut = url.IndexOfAny(new[] { '?', '#' });
            string pathPart = cut >= 0 ? url.Substring(0, cut) : url;
            string fileName = pathPart.Substring(pathPart.LastIndexOf('/') + 1);
            if (string.IsNullOrEmpty(fileName))
            {
                throw new ArgumentException("The URL does not end in a file name: " + url, "url");
            }

            string directory = Path.Combine(Application.StartupPath, "download");
            Directory.CreateDirectory(directory); // no-op when the folder already exists
            string filePathName = Path.Combine(directory, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = UserAgent;
            request.Proxy = null;

            using (MemoryStream buffer = new MemoryStream())
            {
                // Pull the whole body into memory so it can be decoded for the
                // size check and then written out without a second request.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                {
                    responseStream.CopyTo(buffer);
                }

                // Rewind so decoding starts at the first byte of the download.
                buffer.Position = 0;

                bool largeEnough;
                using (Image image = Image.FromStream(buffer, true))
                {
                    largeEnough = image.Height >= minHeight && image.Width >= minWidth;
                }
                if (!largeEnough)
                {
                    return;
                }

                using (FileStream file = new FileStream(filePathName, FileMode.Create))
                {
                    // WriteTo copies the entire buffer regardless of its position.
                    buffer.WriteTo(file);
                }
            }
        }
    }
}
View Code

VisitedHelper類:

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// Process-wide record of the URLs that have already been visited.
    /// All members may be called concurrently from multiple threads.
    /// </summary>
    public class VisitedHelper
    {
        // A hash set gives constant-time membership tests; string keys are
        // compared ordinally, i.e. exactly like the "==" operator.
        private static readonly HashSet<string> m_VisitedSet = new HashSet<string>();

        // Guards m_VisitedSet, which is not safe for concurrent access on its own.
        private static readonly object m_SyncRoot = new object();

        /// <summary>
        /// Reports whether <paramref name="url"/> has been recorded via <see cref="Add"/>.
        /// </summary>
        /// <param name="url">The address to look up (exact, case-sensitive match).</param>
        /// <returns><c>true</c> if the address was added before; otherwise <c>false</c>.</returns>
        public static bool IsVisited(string url)
        {
            lock (m_SyncRoot)
            {
                return m_VisitedSet.Contains(url);
            }
        }

        /// <summary>
        /// Records <paramref name="url"/> as visited. Adding the same address again has no effect.
        /// </summary>
        /// <param name="url">The address to record.</param>
        public static void Add(string url)
        {
            lock (m_SyncRoot)
            {
                m_VisitedSet.Add(url);
            }
        }
    }
}
View Code

多線程爬取網頁代碼:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬蟲
{
    /// <summary>
    /// Main window of the image crawler. Starting from a URL typed by the user it
    /// follows links that stay on the same host, using thread-pool workers, and
    /// downloads every image that satisfies the minimum width/height.
    /// </summary>
    public partial class Form1 : Form
    {
        // Upper bound requested for thread-pool worker and I/O threads.
        private const int MaxPoolThreads = 100;

        // Anchors with an href and plain-text content; group 1 is the href value.
        private static readonly Regex m_RegA = new Regex(@"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);

        // Image tags whose src ends in jpg/jpeg/png/gif; group 1 is the src value.
        private static readonly Regex m_RegImg = new Regex(@"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);

        // Scheme plus authority at the start of a URL (fallback for GetHost).
        private static readonly Regex m_RegHost = new Regex(@"(?:http|https)://[a-z0-9\-\.:]+", RegexOptions.IgnoreCase);

        // Makes the "already visited?" test and the "mark visited" step one atomic unit.
        private static readonly object m_VisitedGate = new object();

        private static int m_MinWidth = 300;
        private static int m_MinHeight = 300;

        // Number of queued crawl jobs that ran to completion; updated from pool threads.
        private static int m_CompletedCount = 0;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads the settings from the form and starts the crawl on a new thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            ThreadPool.SetMaxThreads(MaxPoolThreads, MaxPoolThreads);

            // int.TryParse writes 0 to its out argument on failure, so parse into
            // a local and overwrite the current limit only with valid input.
            int parsed;
            if (int.TryParse(txtMinWidth.Text, out parsed))
            {
                m_MinWidth = parsed;
            }
            if (int.TryParse(txtMinHeight.Text, out parsed))
            {
                m_MinHeight = parsed;
            }

            button1.Enabled = false;
            lblMsg.Text = "正在爬取圖片…";
            timer1.Start();

            // Read the text box here, on the UI thread: controls must not be
            // accessed from the worker thread.
            string startUrl = txtUrl.Text;
            new Thread(new ThreadStart(delegate()
            {
                try
                {
                    Crawling(startUrl, null);
                }
                catch (Exception ex)
                {
                    // An unhandled exception on this thread would terminate the process.
                    System.Diagnostics.Trace.WriteLine("Crawl of " + startUrl + " failed: " + ex);
                }
            })).Start();
        }

        /// <summary>
        /// Crawls one page: downloads its qualifying images, then queues every
        /// same-host link on the thread pool for the same treatment.
        /// </summary>
        /// <param name="url">Absolute address of the page to crawl.</param>
        /// <param name="host">
        /// Host the crawl is restricted to, as returned by <see cref="GetHost"/>;
        /// <c>null</c> on the first call, in which case it is derived from <paramref name="url"/>.
        /// </param>
        private void Crawling(string url, string host)
        {
            // Check-and-mark under one lock so two workers cannot both claim the same page.
            lock (m_VisitedGate)
            {
                if (VisitedHelper.IsVisited(url))
                {
                    return;
                }
                VisitedHelper.Add(url);
            }

            if (host == null)
            {
                host = GetHost(url);
            }

            string pageHtml = HttpRequestUtil.GetPageHtml(url);

            foreach (Match mImg in m_RegImg.Matches(pageHtml))
            {
                try
                {
                    // Both dimensions are read from the whole <img> tag.
                    int imageWidth = GetImageWidthOrHeight(mImg.Value, true);
                    int imageHeight = GetImageWidthOrHeight(mImg.Value, false);
                    if (imageWidth < m_MinWidth || imageHeight < m_MinHeight)
                    {
                        continue;
                    }

                    string imageUrl = ResolveUrl(url, mImg.Groups[1].Value);
                    if (imageUrl != null)
                    {
                        HttpRequestUtil.HttpDownloadFile(imageUrl, m_MinWidth, m_MinHeight);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one bad image must not abort the rest of the page.
                    System.Diagnostics.Trace.WriteLine("Image " + mImg.Groups[1].Value + " skipped: " + ex.Message);
                }
            }

            // Recurse into the links of this page.
            foreach (Match mA in m_RegA.Matches(pageHtml))
            {
                try
                {
                    string nextUrl = ResolveUrl(url, mA.Groups[1].Value);

                    // Stay on the starting host: the test is on the link target,
                    // not on the page that contains the link.
                    if (nextUrl == null ||
                        !string.Equals(GetHost(nextUrl), host, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                    {
                        try
                        {
                            Crawling(nextUrl, host);
                            Interlocked.Increment(ref m_CompletedCount);
                        }
                        catch (Exception ex)
                        {
                            System.Diagnostics.Trace.WriteLine("Crawl of " + nextUrl + " failed: " + ex.Message);
                        }
                    }));
                }
                catch (Exception ex)
                {
                    System.Diagnostics.Trace.WriteLine("Link " + mA.Groups[1].Value + " skipped: " + ex.Message);
                }
            }
        } // end of Crawling

        /// <summary>
        /// Turns a link found on <paramref name="pageUrl"/> into an absolute HTTP/HTTPS
        /// address without its fragment.
        /// </summary>
        /// <returns>
        /// The absolute address, or <c>null</c> when the link cannot be resolved or
        /// uses another scheme (javascript:, mailto:, ...).
        /// </returns>
        private static string ResolveUrl(string pageUrl, string link)
        {
            Uri baseUri;
            Uri resolved;
            if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri) ||
                !Uri.TryCreate(baseUri, link.Trim(), out resolved) ||
                !IsHttp(resolved))
            {
                return null;
            }

            // The fragment only addresses a position inside the page; dropping it
            // keeps "page#a" and "page#b" from being crawled as two pages.
            return resolved.GetLeftPart(UriPartial.Query);
        }

        // True when the URI uses the http or https scheme.
        private static bool IsHttp(Uri uri)
        {
            return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
        }

        /// <summary>
        /// Returns scheme and authority of <paramref name="url"/> followed by "/",
        /// e.g. "http://example.com/".
        /// </summary>
        private string GetHost(string url)
        {
            // Prefer the Uri parser: it lower-cases the host and omits a default
            // port, so equal hosts always produce equal strings.
            Uri parsed;
            if (Uri.TryCreate(url, UriKind.Absolute, out parsed) && IsHttp(parsed))
            {
                return parsed.GetLeftPart(UriPartial.Authority) + "/";
            }

            Match mHost = m_RegHost.Match(url);
            return mHost.Value + "/";
        }

        /// <summary>
        /// Timer callback: shows "finished" once every pool worker is idle again and
        /// at least one queued crawl job has completed.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            int availableWorkers;
            int availableIo;
            int maxWorkers;
            int maxIo;
            ThreadPool.GetAvailableThreads(out availableWorkers, out availableIo);

            // Ask the pool for its real maximum: SetMaxThreads may have been
            // refused, in which case the maximum is not MaxPoolThreads.
            ThreadPool.GetMaxThreads(out maxWorkers, out maxIo);

            if (availableWorkers == maxWorkers && Volatile.Read(ref m_CompletedCount) > 0)
            {
                lblMsg.Text = "已結束";
            }
            else
            {
                lblMsg.Text = "正在爬取圖片…";
            }
        }

        /// <summary>
        /// Extracts the declared width or height from an &lt;img&gt; tag, looking first
        /// at the attribute (width="300") and then at an inline style (width: 300px;).
        /// </summary>
        /// <param name="imageTagString">The complete &lt;img ...&gt; tag.</param>
        /// <param name="isWidth"><c>true</c> for the width, <c>false</c> for the height.</param>
        /// <returns>
        /// The declared size truncated to whole pixels, or <see cref="int.MaxValue"/>
        /// when the tag declares none, so that the caller's minimum-size test passes
        /// and the decision is left to the real image size.
        /// </returns>
        private int GetImageWidthOrHeight(string imageTagString, bool isWidth)
        {
            string tag = isWidth ? "width" : "height";
            string[] patterns =
            {
                string.Format(@"{0}=""([\d\.]+)""", tag),
                string.Format(@"{0}[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;", tag)
            };

            foreach (string pattern in patterns)
            {
                Match match = Regex.Match(imageTagString, pattern, RegexOptions.IgnoreCase);
                if (!match.Success)
                {
                    continue;
                }

                // HTML numbers always use '.', whatever the current culture is.
                double value;
                if (double.TryParse(match.Groups[1].Value,
                                    System.Globalization.NumberStyles.Float,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out value))
                {
                    return (int)Math.Min(value, int.MaxValue);
                }
            }
            return int.MaxValue;
        }

    } // end of Form1

    /// <summary>
    /// Parameterless, void-returning delegate type. According to the original
    /// note it is meant for accessing controls across threads; nothing in the
    /// code shown here references it.
    /// </summary>
    public delegate void InvokeDelegate();
}
View Code

截圖:網絡

相關文章
相關標籤/搜索