官方網址:https://html-agility-pack.net/?z=codeplex
// Three alternative ways to obtain a parsed HtmlDocument.

// From File: load the HTML from a file on disk.
var docFromFile = new HtmlDocument();
docFromFile.Load(filePath);

// From String: parse HTML that is already held in a string.
var docFromString = new HtmlDocument();
docFromString.LoadHtml(html);

// From Web: download the page at a URL and parse it.
var url = "http://html-agility-pack.net/";
var web = new HtmlWeb();
var docFromWeb = web.Load(url);
// Create the web client, then download and parse the page at `url`.
HtmlWeb web = new HtmlWeb();
HtmlDocument doc = web.Load(url);
在 web 中我們還可以設置 cookie、headers 等信息,來處理一些特定的網站需求,比如需要登錄等。
網頁在你查看網頁源代碼之後只是一段字符串,而爬蟲所做的就是在這堆字符串中,查詢到我們想要的信息,挑選出來。
以往的篩選方法:正則 (太麻煩了,寫起來有些頭疼)
HtmlAgilityPack 支持通過 XPath 來解析我們需要的信息。
網頁右鍵檢查
通過 XPath 就可以準確獲取你想要元素的所有信息。
獲取選中元素
// Download and parse the page, then select the first node matching the XPath.
// The null-conditional operators leave htmlnode null when doc or DocumentNode is null.
var web = new HtmlWeb();
var doc = web.Load(url);
var htmlnode = doc?.DocumentNode?.SelectSingleNode("/html/body/header");
獲取元素信息
// Read the node's content. htmlnode may be null when the XPath matched nothing,
// so every access uses the null-conditional operator.
var innerText = htmlnode?.InnerText;
var innerHtml = htmlnode?.InnerHtml;

// Read an attribute by name; the second argument is the default passed to GetAttributeValue.
var src = htmlnode?.GetAttributeValue("src", "未找到");
/// <summary>
/// Helpers for downloading HTML pages and images, plus XPath query extension
/// methods for <c>HtmlDocument</c> and <c>HtmlNode</c>.
/// </summary>
public static class LoadHtmlHelper
{
    // One shared client for the whole process: creating an HttpClient per call
    // and never disposing it leaves sockets lingering under repeated downloads.
    private static readonly HttpClient s_httpClient = new HttpClient();

    /// <summary>
    /// Downloads the page at the given URL and parses it into a document.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The document produced by <c>HtmlWeb.LoadFromWebAsync</c>.</returns>
    public static async ValueTask<HtmlDocument> LoadHtmlFromUrlAsync(string url)
    {
        HtmlWeb web = new HtmlWeb();
        return await web.LoadFromWebAsync(url);
    }

    /// <summary>
    /// Selects a single node from the document by XPath.
    /// </summary>
    /// <param name="htmlDocument">Document to query; may be null.</param>
    /// <param name="xPath">XPath expression.</param>
    /// <returns>
    /// The result of <c>SelectSingleNode</c>, or null when the document or its
    /// <c>DocumentNode</c> is null.
    /// </returns>
    public static HtmlNode GetSingleNode(this HtmlDocument htmlDocument, string xPath)
    {
        return htmlDocument?.DocumentNode?.SelectSingleNode(xPath);
    }

    /// <summary>
    /// Selects multiple nodes from the document by XPath.
    /// </summary>
    /// <param name="htmlDocument">Document to query; may be null.</param>
    /// <param name="xPath">XPath expression.</param>
    /// <returns>
    /// The result of <c>SelectNodes</c>, or null when the document or its
    /// <c>DocumentNode</c> is null.
    /// </returns>
    public static HtmlNodeCollection GetNodes(this HtmlDocument htmlDocument, string xPath)
    {
        return htmlDocument?.DocumentNode?.SelectNodes(xPath);
    }

    /// <summary>
    /// Selects multiple nodes by XPath, evaluated against the given node.
    /// </summary>
    /// <param name="htmlNode">Node to query; may be null.</param>
    /// <param name="xPath">XPath expression.</param>
    /// <returns>The result of <c>SelectNodes</c>, or null when the node is null.</returns>
    public static HtmlNodeCollection GetNodes(this HtmlNode htmlNode, string xPath)
    {
        return htmlNode?.SelectNodes(xPath);
    }

    /// <summary>
    /// Selects a single node by XPath, evaluated against the given node.
    /// </summary>
    /// <param name="htmlNode">Node to query; may be null.</param>
    /// <param name="xPath">XPath expression.</param>
    /// <returns>The result of <c>SelectSingleNode</c>, or null when the node is null.</returns>
    public static HtmlNode GetSingleNode(this HtmlNode htmlNode, string xPath)
    {
        return htmlNode?.SelectSingleNode(xPath);
    }

    /// <summary>
    /// Downloads the resource at a URL and writes its bytes to a file,
    /// creating the file or overwriting an existing one.
    /// </summary>
    /// <param name="url">Address of the image.</param>
    /// <param name="filpath">Path of the file to write.</param>
    /// <returns>True when the file exists after the write completes.</returns>
    /// <exception cref="Exception">
    /// Any failure during download or write, wrapped with the original as
    /// <c>InnerException</c>.
    /// </exception>
    public static async ValueTask<bool> DownloadImg(string url, string filpath)
    {
        try
        {
            byte[] bytes = await s_httpClient.GetByteArrayAsync(url);

            // useAsync: true so the write does not block a thread inside this async method.
            using (FileStream fs = new FileStream(
                filpath, FileMode.Create, FileAccess.Write, FileShare.None, 4096, useAsync: true))
            {
                await fs.WriteAsync(bytes, 0, bytes.Length);
            }

            return File.Exists(filpath);
        }
        catch (Exception ex)
        {
            // Exception type and message kept as-is so existing callers' catch blocks behave the same.
            throw new Exception("下載圖片異常", ex);
        }
    }
}
數據存儲層沒有實現,懶得寫了,靠你們嘍;我是把數據暫時存在了文件中。
GitHub地址:https://github.com/ZhangQueque/quewaner.Crawler.git