以前用 .NET 做網頁採集時,都是採用正則表達式去匹配解析,比較繁瑣,花費時間較多,如果 HTML 複雜的話真是欲哭無淚。
很早就聽說過 HtmlAgilityPack,它是 .NET 下用 XPath 來解析 HTML 的一個類庫(包)。可是一直沒時間嘗試,簡單瞭解了下 HtmlAgilityPack 的 API 後,發現真是 HTML 解析利器,於是花些時間做一個例子記錄下。
本次以下載博客園隨筆分類文章爲例,採用兩部分實現:第一部分是將採集到的文章放到集合變量中,第二部分是通過操作集合變量將文章下載到本地。
這樣做效率較低,因爲其實可以直接邊採集文章邊下載。暫時沒有考慮效率問題,僅僅只是實現功能。下面簡單闡述下。
獲取隨筆分類
根據輸入的博客名取得對應的隨筆分類。
![](http://static.javashuo.com/static/loading.gif)
/// <summary>
/// Fetches the post categories listed in the sidebar of a cnblogs.com blog.
/// </summary>
/// <param name="uname">Blog name; used both as the URL path segment and as the blogApp query value.</param>
/// <returns>
/// One <see cref="BlogType"/> per category link that has an href. Returns an empty list
/// (never null) when no category link is found, so callers can iterate the result directly.
/// </returns>
private static List<BlogType> GettBlogTypeList(string uname)
{
    string url = "http://www.cnblogs.com/" + uname + "/mvc/blog/sidecolumn.aspx?blogApp=" + uname;
    string htmlStr = CommonHelper.GetRequestStr(url);

    HtmlDocument doc = new HtmlDocument();
    // LoadHtml throws on null; an empty document simply yields no category nodes below.
    // NOTE(review): whether GetRequestStr can return null is not visible here - confirm.
    doc.LoadHtml(htmlStr ?? string.Empty);

    List<BlogType> list = new List<BlogType>();

    // Category links in the sidebar. SelectNodes returns null (not an empty collection) on no match.
    var nodes = doc.DocumentNode.SelectNodes("//div[@id='sidebar_postcategory']//a");
    if (nodes == null)
    {
        return list;
    }

    foreach (var node in nodes)
    {
        // An anchor without href cannot be crawled; skip it instead of throwing.
        var hrefAttr = node.Attributes["href"];
        if (hrefAttr == null || string.IsNullOrEmpty(hrefAttr.Value))
        {
            continue;
        }

        var name = node.InnerText;
        list.Add(new BlogType()
        {
            BlogTypeUrl = hrefAttr.Value,
            // Keep only the text before the first '(' (presumably a trailing post count such as
            // "(12)" - confirm) and trim it, because this value is later used as a folder name.
            BlogTypeName = name.Split('(')[0].Trim(),
            BlogTypeNameShow = name
        });
    }

    return list;
}

/// <summary>
/// A blog post category as scraped from the sidebar.
/// </summary>
public class BlogType
{
    /// <summary>Href of the category link.</summary>
    public string BlogTypeUrl { get; set; }

    /// <summary>Category name with everything from the first '(' onwards removed.</summary>
    public string BlogTypeName { get; set; }

    /// <summary>Raw link text of the category, as shown on the page.</summary>
    public string BlogTypeNameShow { get; set; }
}
例如,獲取到的隨筆分類如下:
![](http://static.javashuo.com/static/loading.gif)
採集分類的文章
採用兩步實現,第一步獲取只包含標題和url的文章,第二步再獲取文章內容。
![](http://static.javashuo.com/static/loading.gif)
/// <summary>
/// Collects the title and URL of every post listed on each category page.
/// </summary>
/// <param name="blogTypes">Categories to crawl; null is treated as an empty list.</param>
/// <param name="useTime">Receives the elapsed time of the whole call in milliseconds.</param>
/// <returns>A dictionary mapping each category to the posts found on its page (content not yet loaded).</returns>
public static Dictionary<BlogType, List<BlogInfo>> GetBlogsByType(List<BlogType> blogTypes, out long useTime)
{
    Stopwatch sw = Stopwatch.StartNew();
    Dictionary<BlogType, List<BlogInfo>> dic = new Dictionary<BlogType, List<BlogInfo>>();

    if (blogTypes != null)
    {
        foreach (var blogType in blogTypes)
        {
            List<BlogInfo> list = new List<BlogInfo>();

            HtmlDocument doc = new HtmlDocument();
            // LoadHtml throws on null; an empty document just produces no posts for this category.
            doc.LoadHtml(CommonHelper.GetRequestStr(blogType.BlogTypeUrl) ?? string.Empty);

            // The page heading carries the category title. SelectSingleNode returns null when the
            // heading is missing, so fall back to the name scraped from the sidebar.
            var typeNameNode = doc.DocumentNode.SelectSingleNode("//div[@class='entrylist']/h1");
            string typeName = typeNameNode == null ? blogType.BlogTypeName : typeNameNode.InnerText;

            var listPosttitleNodes = doc.DocumentNode.SelectNodes("//div[@class='entrylistPosttitle']/a");
            if (listPosttitleNodes != null)
            {
                foreach (var titleNode in listPosttitleNodes)
                {
                    // A title link without href cannot be downloaded later; skip it.
                    var hrefAttr = titleNode.Attributes["href"];
                    if (hrefAttr == null)
                    {
                        continue;
                    }

                    Console.WriteLine("正在爬取文章【{0}】...", titleNode.InnerText);
                    list.Add(new BlogInfo()
                    {
                        BlogUrl = hrefAttr.Value,
                        BlogTitle = titleNode.InnerText,
                        BlogTypeName = typeName
                    });
                }
            }

            dic.Add(blogType, list);
        }
    }

    sw.Stop();
    useTime = sw.ElapsedMilliseconds;
    return dic;
}

/// <summary>
/// Downloads every post in <paramref name="dic"/> and fills in its content, post date and author.
/// </summary>
/// <param name="dic">Posts grouped by category; the <see cref="BlogInfo"/> objects are updated in place.</param>
/// <param name="useTime">Receives the elapsed time of the whole call in milliseconds.</param>
/// <returns>The same dictionary instance that was passed in.</returns>
public static Dictionary<BlogType, List<BlogInfo>> GetBlogDetail(Dictionary<BlogType, List<BlogInfo>> dic, out long useTime)
{
    Stopwatch sw = Stopwatch.StartNew();
    HtmlDocument doc = new HtmlDocument();

    // Only the BlogInfo objects are mutated, never the dictionary itself, so enumerating the
    // values directly is safe and avoids the repeated Keys.ElementAt(k) scan.
    foreach (var list in dic.Values)
    {
        foreach (var blog in list)
        {
            Console.WriteLine("正在獲取文章【{0}】內容...", blog.BlogTitle);

            // With an empty document all three lookups below return null and the
            // "failed" placeholder texts are stored instead of throwing.
            doc.LoadHtml(CommonHelper.GetRequestStr(blog.BlogUrl) ?? string.Empty);

            var bodyNode = doc.DocumentNode.SelectSingleNode("//div[@id='cnblogs_post_body']");
            var dateNode = doc.DocumentNode.SelectSingleNode("//span[@id='post-date']");
            var userNode = doc.DocumentNode.SelectSingleNode("//div[@class='postDesc']/a[1]");

            blog.BlogContent = bodyNode == null ? "內容獲取失敗" : bodyNode.InnerHtml;
            blog.BlogPostTime = dateNode == null ? "發佈時間獲取失敗" : dateNode.InnerText;
            blog.BlogName = userNode == null ? "用戶獲取失敗" : userNode.InnerText;
        }
    }

    sw.Stop();
    useTime = sw.ElapsedMilliseconds;
    return dic;
}

/// <summary>
/// A single blog post: list-page data (URL, title, category) plus detail-page data.
/// </summary>
public class BlogInfo
{
    /// <summary>Href of the post link on the category page.</summary>
    public string BlogUrl { get; set; }

    /// <summary>Text of the first link inside the post's postDesc block (stored as the author).</summary>
    public string BlogName { get; set; }

    /// <summary>Link text of the post on the category page.</summary>
    public string BlogTitle { get; set; }

    /// <summary>Inner HTML of the post body, or a placeholder text when it could not be found.</summary>
    public string BlogContent { get; set; }

    /// <summary>Category title taken from the category page heading.</summary>
    public string BlogTypeName { get; set; }

    /// <summary>Text of the post-date element, or a placeholder text when it could not be found.</summary>
    public string BlogPostTime { get; set; }
}
下載到本地
根據上面採集到的文章再一步步下載到本地,期間分兩步,第一步下載圖片,第二步下載文章內容。
![](http://static.javashuo.com/static/loading.gif)
/// <summary>
/// Writes every collected post to disk as an .html file, one folder per category, and
/// downloads the images each post references into that category's "images" sub-folder.
/// </summary>
/// <param name="dic">Posts grouped by category, with BlogContent already filled in.</param>
/// <param name="uname">Blog name; used as the per-user folder name and in the page header.</param>
/// <param name="useTime">Receives the elapsed time of the whole call in milliseconds.</param>
/// <returns>The per-user output folder: base directory + "BlogFiles\" + <paramref name="uname"/>.</returns>
public static string DowanloadBlog(Dictionary<BlogType, List<BlogInfo>> dic, string uname, out long useTime)
{
    Stopwatch sw = new Stopwatch();
    sw.Start();

    string userPath = AppDomain.CurrentDomain.BaseDirectory + "BlogFiles\\" + uname;
    int countFlag = 0;

    foreach (var pair in dic)
    {
        var blogType = pair.Key;
        var blogList = pair.Value;

        // The category name becomes a directory name, so filter it the same way post titles are filtered.
        // NOTE(review): assumes FilterInvalidChar replaces characters that are invalid in file names - confirm.
        var dicPath = userPath + "\\" + FileHelper.FilterInvalidChar(blogType.BlogTypeName, "_");
        Console.WriteLine("<<開始處理分類【{0}】<<", blogType.BlogTypeName);
        FileHelper.CreatePath(dicPath);

        foreach (var blogModel in blogList)
        {
            countFlag++;
            try
            {
                // blogModel is the current post here, so the start message shows the right title.
                Console.WriteLine("~~~~開始處理文章{0}【{1}】~~~~", countFlag, blogModel.BlogTitle);

                var filePath = dicPath + "\\" + FileHelper.FilterInvalidChar(blogModel.BlogTitle, "_") + ".html";
                HtmlDocument doc = new HtmlDocument();
                doc.DocumentNode.InnerHtml = blogModel.BlogContent;

                // Download the images the post references.
                Console.WriteLine("~~開始處理圖片");
                var imgPath = dicPath + "\\images";
                FileHelper.CreatePath(imgPath);
                SaveImage(doc, imgPath);
                Console.WriteLine("~~處理圖片完成");

                // Neutralise hyperlinks so the offline copy does not navigate away.
                var aNodes = doc.DocumentNode.SelectNodes("//a");
                if (aNodes != null)
                {
                    foreach (var aNode in aNodes)
                    {
                        var hrefAttr = aNode.Attributes["href"];
                        if (hrefAttr != null && hrefAttr.Value != "#")
                        {
                            // void requires an operand; "void()" is a JavaScript syntax error.
                            hrefAttr.Value = "javascript:void(0)";
                        }
                    }
                }

                // Wrap the post in a header/title/body layout; every div is closed.
                doc.DocumentNode.InnerHtml =
                    "<div id='div_head'>" + uname + " " + blogType.BlogTypeName + "</div>"
                    + "<div id='div_title'>" + blogModel.BlogTitle + "</div>"
                    + "<div id='div_body'>" + doc.DocumentNode.InnerHtml + "</div>";
                doc.Save(filePath, Encoding.UTF8);

                Console.WriteLine("~~~~處理文章{0}【{1}】完畢~~~~", countFlag, blogModel.BlogTitle);
            }
            catch (Exception ex)
            {
                // A failing post is logged to errorLog.txt in the category folder and skipped.
                string errorMsg = DateTime.Now.ToString("yyyyMMdd HH:mm:ss") + "\r\n"
                    + "url=" + blogModel.BlogUrl + "\r\n"
                    + "title=" + blogModel.BlogTitle + "\r\n"
                    + "errorMsg=" + ex.Message + "\r\n"
                    + "stackTrace=" + ex.StackTrace + "\r\n\r\n\r\n";
                Console.WriteLine("error>>處理文章【{0}】出現錯誤,開始記錄錯誤信息~~", blogModel.BlogTitle);
                FileHelper.SaveTxtFile(dicPath, "errorLog.txt", errorMsg, false);
                Console.WriteLine("error>>處理文章【{0}】出現錯誤,記錄錯誤信息完成~~", blogModel.BlogTitle);
            }
        }

        Console.WriteLine("<<處理分類【{0}】完成<<", blogType.BlogTypeName);
    }

    sw.Stop();
    useTime = sw.ElapsedMilliseconds;
    return userPath;
}

/// <summary>
/// Downloads every image referenced by an img element in <paramref name="doc"/> into
/// <paramref name="filePath"/> and points the element's src at the saved local file.
/// </summary>
/// <param name="doc">Document whose img elements are processed; src attributes are rewritten in place.</param>
/// <param name="filePath">Existing directory the images are saved into.</param>
/// <exception cref="Exception">
/// Thrown with a "SaveImage_Error:" prefixed message when processing an image throws;
/// the original exception is attached as InnerException.
/// </exception>
public static void SaveImage(HtmlDocument doc, string filePath)
{
    var imgNodes = doc.DocumentNode.SelectNodes("//img");
    if (imgNodes == null)
    {
        return;
    }

    foreach (var imgNode in imgNodes)
    {
        try
        {
            // Skip images that have no src, have no path separator, or embed their data inline.
            var srcAttr = imgNode.Attributes["src"];
            if (srcAttr == null)
            {
                continue;
            }

            string src = srcAttr.Value;
            if (src == null || !src.Contains("/") || src.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // File name = last path segment, without any query string or fragment
            // ('?' is not a valid file-name character on Windows).
            string fileName = src.Substring(src.LastIndexOf('/') + 1);
            int suffixStart = fileName.IndexOfAny(new[] { '?', '#' });
            if (suffixStart >= 0)
            {
                fileName = fileName.Substring(0, suffixStart);
            }

            if (fileName.Length == 0)
            {
                continue;
            }

            Console.WriteLine("~~開始下載圖片【{0}】~~", fileName);
            string imgPath = filePath + "\\" + fileName;
            byte[] imgByte = CommonHelper.GetRequestByteArr(src);
            if (imgByte != null)
            {
                FileHelper.SaveImage(imgPath, imgByte);
                // Rewrite src only after the file exists, so a failed download keeps the remote link.
                srcAttr.Value = imgPath;
                Console.WriteLine("~~下載圖片【{0}】完成~~", fileName);
            }
            else
            {
                Console.WriteLine("~~下載圖片【{0}】失敗~~", fileName);
            }
        }
        catch (Exception ex)
        {
            // Same exception type and message as before; the cause is now kept as InnerException.
            throw new Exception("SaveImage_Error:" + ex.Message, ex);
        }
    }
}
程序入口
主要代碼如下:
![](http://static.javashuo.com/static/loading.gif)
// Program entry: fetch the categories, crawl the post lists, load each post's detail,
// then write everything to disk and report the timings.
var types = GettBlogTypeList(uname);
long time1 = 0;
long time2 = 0;
long timeDownload = 0;

if (types == null || types.Count == 0)
{
    // No category could be read (the lookup may yield null or an empty list);
    // passing null on would crash inside GetBlogsByType, so stop here with a message.
    Console.WriteLine("未獲取到隨筆分類,請檢查博客名是否正確");
}
else
{
    Console.WriteLine("正在爬取,請耐心等待...");
    var blogList = GetBlogsByType(types, out time1);
    var blogDetailList = GetBlogDetail(blogList, out time2);

    Console.WriteLine("爬取完畢,開始下載...");
    string filePath = DowanloadBlog(blogDetailList, uname, out timeDownload);

    Console.WriteLine("**處理完畢,爬取用時{0}ms,下載用時{1}ms,{2}", time1 + time2, timeDownload, filePath);
}

handlerRight = false;
演示效果
![](http://static.javashuo.com/static/loading.gif)
文件存儲在項目bin目錄下,一個用戶一個文件夾
![](http://static.javashuo.com/static/loading.gif)
![](http://static.javashuo.com/static/loading.gif)
按隨筆分類生成不同的文件夾
![](http://static.javashuo.com/static/loading.gif)
生成 .html 文件,一個分類的所有圖片都存放在該分類下的 images 文件夾中。
![](http://static.javashuo.com/static/loading.gif)
![](http://static.javashuo.com/static/loading.gif)
完整源碼放在 GitHub 上:
https://github.com/kungge/CommonTest/tree/dev/WebCollect
歡迎指出程序bug,提出優化意見,(●'◡'●)