同窗們能夠去各大招聘網站查看一下爬蟲工程師的要求,大可能是招JAVA、PYTHON,甚至於還有NODEJS,C++;再或者去開源中國查詢C#的爬蟲項目,僅有幾個很是簡單或是幾年沒有更新的項目。
而單純性能上.NET對比JAVA,PYTHON並無處於弱勢,反而有開發上的優點(得益於世界上最強大的IDE)。爬蟲性能瓶頸大可能是在併發下載(網速)、IP池,那麼爲何.NET沒有一個強大的爬蟲框架呢?說真的我不知道,可能爬蟲框架核心上比較簡單,也可能.NET的開發人員沒有別的語言的開發人員勤奮,或是.NET的開源氛圍沒有別的語言高。直到.NET要出開源版的消息傳來,我以爲是時候開發一個跨平臺,跨語言的爬蟲框架了。但一開始是比較忐忑的,以爲本身水平不夠去徹底從新設計一個新的框架出來,所以參考了JAVA的一個輕量級爬蟲框架webmagic,並加入了我本身的理解和改進。若是設計或寫得很差請你們指正海涵
因爲我是參考的webmagic,因此總體架構上沒有什麼大的變化,設計圖以下(圖片是直接從webmagic上拿的)
基本使用只須要引用DotnetSpider2.Core(Nuget中獲取)
DotnetSpider實現一個完整爬蟲是須要4個模塊的:Scheduler、Downloader、PageProcessor、Pipeline。因爲Downloader和Scheduler都是有基本實現的,所以只須要實現PageProcessor和Pipeline就能夠實現一個基本爬蟲了,這種方式也是最自由的方式。
徹底自定義的例子以下:
public static void Main(string[] args)
{
    // Customize processor and pipeline: fully custom page parsing and data pipeline.
    BaseUsage.CustmizeProcessorAndPipeline();
    Console.WriteLine("Press any key to continue...");
    Console.Read();
}

public static void CustmizeProcessorAndPipeline()
{
    // Configure encoding, headers, cookies, proxy etc. for the target site.
    var site = new Site { EncodingName = "UTF-8", RemoveOutboundLinks = true };
    for (int i = 1; i < 5; ++i)
    {
        // Seed the crawler with the initial Youku list-page URLs.
        site.AddStartUrl("http://" + $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{i}.html");
    }

    Spider spider = Spider.Create(site,
        // In-memory queue scheduler with duplicate removal.
        new QueueDuplicateRemovedScheduler(),
        // Custom processor for Youku list pages.
        new YoukuPageProcessor())
        // Custom pipeline that consumes the parsed results.
        .AddPipeline(new YoukuPipeline())
        // Download HTML via HttpClient.
        .SetDownloader(new HttpClientDownloader())
        // Single worker thread.
        .SetThreadNum(1);
    spider.EmptySleepTime = 3000;

    // Start the crawler (blocks until the crawl finishes).
    spider.Run();
}

/// <summary>
/// Pipeline that prints every extracted <see cref="YoukuVideo"/> to the console.
/// Replace the body with DB/file persistence for real use.
/// </summary>
public class YoukuPipeline : BasePipeline
{
    // Running total of videos seen across all pages (single worker thread configured above).
    private static long count = 0;

    public override void Process(ResultItems resultItems)
    {
        foreach (YoukuVideo entry in resultItems.Results["VideoResult"])
        {
            count++;
            Console.WriteLine($"[YoukuVideo {count}] {entry.Name}");
        }
        // Other actions, e.g. saving to a database or file, would go here.
    }
}

/// <summary>
/// Parses a Youku list page: extracts video names and queues pagination links.
/// </summary>
public class YoukuPageProcessor : BasePageProcessor
{
    protected override void Handle(Page page)
    {
        // Query the page via Selectable and build the result objects.
        var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-pack pack-film']")).Nodes();
        List<YoukuVideo> results = new List<YoukuVideo>();
        // Guard: the selector may match nothing on an unexpected page layout.
        if (totalVideoElements != null)
        {
            foreach (var videoElement in totalVideoElements)
            {
                var video = new YoukuVideo();
                video.Name = videoElement.Select(Selectors.XPath(".//img[@class='quic']/@alt")).GetValue();
                results.Add(video);
            }
        }

        // Store results under a custom key so the pipeline can read them back.
        page.AddResultItem("VideoResult", results);

        // Extract the next URLs to crawl and hand them to the scheduler.
        var pageLinks = page.Selectable.SelectList(Selectors.XPath("//ul[@class='yk-pages']")).Links().Nodes();
        // Guard: the pagination block may be absent on the last page.
        if (pageLinks != null)
        {
            foreach (var url in pageLinks)
            {
                page.AddTargetRequest(new Request(url.GetValue(), null));
            }
        }
    }
}

/// <summary>Data object for one extracted video entry.</summary>
public class YoukuVideo
{
    public string Name { get; set; }
}
配置式爬蟲須要額外引用DotnetSpider2.Extension(Nuget中獲取)
大部分狀況下只須要配置式來實現一個採集任務。相對於基本使用方式,配置式爬蟲只須要短短的幾行代碼就能夠實現一個爬蟲。但凡事有利就有弊,配置式爬蟲的自由度相對低了一些。
使用配置式爬蟲的步驟以下:
完整代碼以下, 感覺一下就好,後面章節會詳細介紹如何實現:
/// <summary>
/// Configuration-style spider for JD SKU list pages: declares the crawl
/// settings in <see cref="GetEntitySpider"/> and the data mapping on the
/// attribute-annotated <see cref="Product"/> entity.
/// </summary>
public class JdSkuSampleSpider : EntitySpiderBuilder
{
    protected override EntitySpider GetEntitySpider()
    {
        EntitySpider context = new EntitySpider(new Site
        {
            // Optional HTTP proxy pool (sample left commented out).
            //HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("快代理API"))
        });
        context.SetThreadNum(1);
        // Unique run identity, suffixed with the start timestamp.
        context.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_hhmmss"));
        // Download HTML via HttpClient.
        context.SetDownloader(new HttpClientDownloader());
        // Persist extracted entities to MySQL.
        // NOTE(review): credentials are hard-coded in source — move to configuration / a secret store.
        context.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
        // Seed URL plus environment values ("name", "cat3") consumed below by
        // the SelectorType.Enviroment property selectors.
        context.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object> { { "name", "手機" }, { "cat3", "655" } });
        context.AddEntityType(typeof(Product));
        return context;
    }

    // Entity mapped to table "test"."sku" (name suffixed per day); one instance
    // is extracted per element matched by the EntitySelector expression.
    [Schema("test", "sku", TableSuffix.Today)]
    [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
    [Indexes(Index = new[] { "category" }, Unique = new[] { "category,sku", "sku" })]
    // Follow-up pages are discovered inside the pagination span, filtered by the URL pattern.
    [TargetUrlsSelector(XPaths = new[] { "//span[@class=\"p-num\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
    public class Product : ISpiderEntity
    {
        [StoredAs("sku", DataType.String, 25)]
        [PropertySelector(Expression = "./@data-sku")]
        public string Sku { get; set; }

        // Filled from the "name" environment value registered with the start URL.
        [StoredAs("category", DataType.String, 20)]
        [PropertySelector(Expression = "name", Type = SelectorType.Enviroment)]
        public string CategoryName { get; set; }

        // NOTE(review): the column is declared String but the property is int,
        // and the env value "655" is a string — confirm the framework converts.
        [StoredAs("cat3", DataType.String, 20)]
        [PropertySelector(Expression = "cat3", Type = SelectorType.Enviroment)]
        public int CategoryId { get; set; }

        [StoredAs("url", DataType.Text)]
        [PropertySelector(Expression = "./div[1]/a/@href")]
        public string Url { get; set; }

        // NOTE(review): long property stored in a String(32) column — confirm intended.
        [StoredAs("commentscount", DataType.String, 32)]
        [PropertySelector(Expression = "./div[5]/strong/a")]
        public long CommentsCount { get; set; }

        [StoredAs("shopname", DataType.String, 100)]
        [PropertySelector(Expression = ".//div[@class='p-shop']/@data-shop_name")]
        public string ShopName { get; set; }

        [StoredAs("name", DataType.String, 50)]
        [PropertySelector(Expression = ".//div[@class='p-name']/a/em")]
        public string Name { get; set; }

        [StoredAs("venderid", DataType.String, 25)]
        [PropertySelector(Expression = "./@venderid")]
        public string VenderId { get; set; }

        [StoredAs("jdzy_shop_id", DataType.String, 25)]
        [PropertySelector(Expression = "./@jdzy_shop_id")]
        public string JdzyShopId { get; set; }

        // "Monday" environment value — presumably the date of the current week's
        // Monday, used as a batch/run id. NOTE(review): inferred from the key name; verify.
        [StoredAs("run_id", DataType.Date)]
        [PropertySelector(Expression = "Monday", Type = SelectorType.Enviroment)]
        public DateTime RunId { get; set; }

        // "Now" environment value: crawl timestamp.
        [PropertySelector(Expression = "Now", Type = SelectorType.Enviroment)]
        [StoredAs("cdate", DataType.Time)]
        public DateTime CDate { get; set; }
    }
}
public class Program
{
    // Entry point: build the JD SKU sample spider and run it to completion.
    public static void Main(string[] args)
    {
        var builder = new JdSkuSampleSpider();
        builder.Run();
    }
}
https://github.com/zlzforever/DotnetSpider 望各位大佬加星 :)
QQ羣: 477731655