文章地址：
https://blog.csdn.net/sD7O95O/article/details/78097556
安裝爬蟲框架：透過 NuGet 安裝 DotnetSpider
建立 HTTP 協議數據包
// Request headers attached to the site configuration; they mimic a desktop
// Chrome browser (see the User-Agent entry).
var requestHeaders = new Dictionary<string, string>
{
    ["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    ["Accept-Encoding"] = "gzip, deflate, br",
    ["Accept-Language"] = "zh-CN,zh;q=0.9",
    ["X-Requested-With"] = "XMLHttpRequest",
    ["Referer"] = "https://blog.csdn.net/sD7O95O/article/details/78096027",
    ["Connection"] = "keep-alive",
    ["Content-Type"] = "text/html; charset=UTF-8",
    ["Host"] = "blog.csdn.net",
    ["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
};

// Site-level crawl settings.
// NOTE(review): SleepTime is presumably a delay in milliseconds and
// CycleRetryTimes the retry count for failed requests — confirm against DotnetSpider.
var site = new Site
{
    CycleRetryTimes = 1,
    SleepTime = 200,
    Headers = requestHeaders
};

// Register the start URL; `surl` is defined outside this snippet.
site.AddStartUrl(surl);
建立一個爬蟲框架
// Create the spider for `site` with the custom page processor, then attach the
// custom pipeline. Each argument sits on its own line so that a trailing line
// comment can no longer comment out the rest of the call.
Spider spider = Spider.Create(
    site,
    // new QueueDuplicateRemovedScheduler(),  // explicit scheduler, currently disabled
    new CsdnArticleProcessor()                // custom page processor
).AddPipeline(new CsdnArticlePipeline());     // custom pipeline
設置爬蟲
// Downloader used to fetch pages.
spider.Downloader = new HttpClientDownloader();

// Crawl with a single thread.
spider.ThreadNum = 1;

// NOTE(review): presumably the wait, in milliseconds, when no requests are pending — confirm.
spider.EmptySleepTime = 3000;
運行爬蟲
// Start the crawl using the spider configured above.
spider.Run();
CsdnArticleProcessor 處理器 xpath 分析就不貼了
/// <summary>
/// Custom page processor for the crawled article pages. The XPath extraction is
/// intentionally omitted; the comments inside <see cref="Handle"/> outline the steps.
/// </summary>
public class CsdnArticleProcessor : BasePageProcessor
{
    /// <summary>
    /// Placeholder handler: it currently performs no work. The commented lines show
    /// where the selector query and the result hand-off belong.
    /// </summary>
    /// <param name="page">The page passed in by the framework for processing.</param>
    protected override void Handle(Page page)
    {
        // Example selector call:
        // page.Selectable.SelectList(Selectors.XPath("//table[@id='ip_list']/tr[2]/td[2]/text()")).Nodes();

        // Use Selectable to query the page and build the data objects you need.
        // Use XPath to pick out the required data.
        // ...

        // Put the results into the pipeline:
        // page.AddResultItem("CountryResult", results);
    }
}
/// <summary>
/// Custom pipeline that receives the results produced by the page processor and
/// is the place to persist them.
/// </summary>
class CsdnArticlePipeline : BasePipeline
{
    /// <summary>
    /// Reads the "CountryResult" entry from every result item in the batch.
    /// </summary>
    /// <param name="resultItems">The batch of result items to process.</param>
    /// <param name="spider">The spider that produced the results (unused here).</param>
    public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
    {
        foreach (var resultItem in resultItems)
        {
            // Fetch the data stored under the "CountryResult" key.
            var conlist = resultItem.GetResultItem("CountryResult");
            if (conlist == null)
            {
                // Nothing was stored under this key for the item; skip it.
                continue;
            }

            // DB handling of `conlist` goes here.
        }
    }
}