[Open-source .NET cross-platform crawler framework: DotnetSpider] [Part 3] Configuration-based crawlers

[DotnetSpider series table of contents]

The previous article covered the basic usage, which offers a lot of flexibility but requires comparatively more code. Most crawlers in my line of work are topic-focused: they only need to scrape specified pages and structure the data. To improve development efficiency, I implemented a way to build crawlers from entity configuration.

Create a Console project

Add the NuGet package

DotnetSpider2.Extension

Define the configuration-based data object

  • The data object must inherit from SpiderEntity
  • EntityTableAttribute defines the database name, table name (with an optional table-name postfix), indexes, primary key, and the columns to update
  • EntitySelector defines the rule for extracting data objects from the page
  • TargetUrlsSelector defines which target links (matched by regex) are added to the crawl queue

Define a bare data object class

public class Product : SpiderEntity
{
}
Open the JD product-list page http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main in Chrome
  1. Press F12 to open the developer tools
  2. Select a product and inspect its HTML structure


As you can see, every product sits inside a DIV whose class is gl-i-wrap j-sku-item, so add an EntitySelector attribute to the Product class. (This XPath is not the only possible one; if you are unfamiliar with XPath, W3Schools is a good place to learn it. The framework also supports CSS selectors and even regular expressions for picking out the right HTML fragment.)

    

        [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
        public class Product : SpiderEntity
        {
        }

 

  1. Add the database and index information

            [EntityTable("test", "sku", EntityTable.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
            [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
            public class Product : SpiderEntity
            {
            }
  2. Suppose you need to collect the SKU. Inspect the HTML structure and work out the relative XPath. Why a relative XPath? Because EntitySelector has already cut the HTML into fragments, so all element queries inside the entity are relative to the element matched by EntitySelector. Finally, add the database column information.

            [EntityTable("test", "sku", EntityTable.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
            [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
            public class Product : SpiderEntity
            {
                [PropertyDefine(Expression = "./@data-sku")]
                public string Sku { get; set; }
            }
  3. Internally, the crawler stores link information in Request objects. Extra property values can be attached when a Request is constructed, and data objects are then allowed to read values from these Request extras.

            [EntityTable("test", "sku", EntityTable.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
            [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
            public class Product : SpiderEntity
            {
                [PropertyDefine(Expression = "./@data-sku")]
                public string Sku { get; set; }
    
                [PropertyDefine(Expression = "name", Type = SelectorType.Enviroment)]
                public string Category { get; set; }
            }
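The "relative XPath" idea above can be illustrated with plain System.Xml. This is only an approximation for demonstration purposes (the framework parses real HTML, not XML, and the fragment and SKU value below are made up), but it shows how a container element is selected first and how `./@data-sku` is then evaluated relative to it:

```csharp
using System;
using System.Xml;

class RelativeXPathDemo
{
    static void Main()
    {
        // A simplified, well-formed stand-in for one product <li> fragment.
        var doc = new XmlDocument();
        doc.LoadXml("<li class=\"gl-item\"><div class=\"gl-i-wrap j-sku-item\" data-sku=\"1234567\"/></li>");

        // What EntitySelector does: select the product container element.
        XmlNode product = doc.SelectSingleNode("//li[@class='gl-item']/div[contains(@class,'j-sku-item')]");

        // What PropertyDefine does: query relative to that container element.
        XmlNode sku = product.SelectSingleNode("./@data-sku");
        Console.WriteLine(sku.Value); // 1234567
    }
}
```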
Configure the crawler (inherit from EntitySpider)
    public class JdSkuSampleSpider : EntitySpider
    {
        public JdSkuSampleSpider() : base("JdSkuSample", new Site
        {
            //HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("快代理API"))
        })
        {
        }

        protected override void MyInit(params string[] arguments)
        {
            Identity = Identity ?? "JD SKU SAMPLE";

            ThreadNum = 1;
            // download html by http client
            Downloader = new HttpClientDownloader();

            // Store data in MySQL. The default pipeline is the MySQL entity pipeline, so this line can be omitted. Don't forget SslMode.
            AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"));
            AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object> { { "name", "手機" }, { "cat3", "655" } });
            AddEntityType<Product>();
        }
    }
  1. The second argument of AddStartUrl, a Dictionary<string, object>, supplies the data used by Enviroment-type selectors

  2. TargetUrlsSelector configures both the validation of candidate links and the discovery of target URLs. The example below restricts target-URL extraction to the region matched by the XPath, and accepts only URLs matching the regular expression &page=[0-9]+&

            [EntityTable("test", "jd_sku", EntityTable.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
            [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
            [TargetUrlsSelector(XPaths = new[] { "//span[@class=\"p-num\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
            public class Product : SpiderEntity
            {
                [PropertyDefine(Expression = "./@data-sku")]
                public string Sku { get; set; }
    
                [PropertyDefine(Expression = "name", Type = SelectorType.Enviroment)]
                public string Category { get; set; }
            }
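To see what the Patterns regex above accepts, here is a small standalone sketch using plain System.Text.RegularExpressions (not framework code; the sample URLs are illustrative):

```csharp
using System;
using System.Text.RegularExpressions;

class TargetUrlPatternDemo
{
    static void Main()
    {
        // The same pattern used in TargetUrlsSelector above.
        var pattern = new Regex(@"&page=[0-9]+&");

        // A paging link like those found inside the p-num span: matches.
        Console.WriteLine(pattern.IsMatch(
            "http://list.jd.com/list.html?cat=9987,653,655&page=3&JL=6_0_0")); // True

        // A link without a page parameter: rejected.
        Console.WriteLine(pattern.IsMatch(
            "http://list.jd.com/list.html?cat=9987,653,655&JL=6_0_0"));        // False
    }
}
```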


  1. Add a MySQL data pipeline; only the connection string needs to be configured

    context.AddPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
Complete code
    public class JdSkuSampleSpider : EntitySpider
    {
        public JdSkuSampleSpider() : base("JdSkuSample", new Site
        {
            //HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("快代理API"))
        })
        {
        }

        protected override void MyInit(params string[] arguments)
        {
            Identity = Identity ?? "JD SKU SAMPLE";

            ThreadNum = 1;
            // download html by http client
            Downloader = new HttpClientDownloader();

            // Store data in MySQL. The default pipeline is the MySQL entity pipeline, so this line can be omitted. Don't forget SslMode.
            AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"));
            AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object> { { "name", "手機" }, { "cat3", "655" } });
            AddEntityType<Product>();
        }
    }

    [EntityTable("test", "jd_sku", EntityTable.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
    [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
    [TargetUrlsSelector(XPaths = new[] { "//span[@class=\"p-num\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Product : SpiderEntity
    {
        [PropertyDefine(Expression = "./@data-sku", Length = 100)]
        public string Sku { get; set; }

        [PropertyDefine(Expression = "name", Type = SelectorType.Enviroment, Length = 100)]
        public string Category { get; set; }

        [PropertyDefine(Expression = "cat3", Type = SelectorType.Enviroment)]
        public int CategoryId { get; set; }

        [PropertyDefine(Expression = "./div[1]/a/@href")]
        public string Url { get; set; }

        [PropertyDefine(Expression = "./div[5]/strong/a")]
        public long CommentsCount { get; set; }

        [PropertyDefine(Expression = ".//div[@class='p-shop']/@data-shop_name", Length = 100)]
        public string ShopName { get; set; }

        [PropertyDefine(Expression = ".//div[@class='p-name']/a/em", Length = 100)]
        public string Name { get; set; }

        [PropertyDefine(Expression = "./@venderid", Length = 100)]
        public string VenderId { get; set; }

        [PropertyDefine(Expression = "./@jdzy_shop_id", Length = 100)]
        public string JdzyShopId { get; set; }

        [PropertyDefine(Expression = "Monday", Type = SelectorType.Enviroment)]
        public DateTime RunId { get; set; }
    }

 

Run the crawler
public class Program
{
    public static void Main(string[] args)
    {
        JdSkuSampleSpider spider = new JdSkuSampleSpider();
        spider.Run();
    }
}

 

 

A complete crawler in fewer than 57 lines of code. Isn't that remarkably simple?

Source code

https://github.com/zlzforever/DotnetSpider Stars on GitHub are much appreciated 😃

Getting involved or asking questions

This post was written fairly early on, and framework changes sometimes outpace updates to the code shown here; please refer to the sample crawlers in the DotnetSpider.Sample project.

QQ group: 477731655

Email: zlzforever@163.com
