C#備份博客園隨筆文章和圖片----使用HtmlAgilityPack解析html

以前用 .NET 做網頁採集時,都是採用正則表達式去匹配解析,比較繁瑣,花費時間較多,如果 HTML 複雜的話真是欲哭無淚。
很早就聽說過 HtmlAgilityPack,它是 .NET 下用 XPath 來解析 HTML 的一個類庫(包)。可是一直沒時間嘗試,簡單瞭解了下 HtmlAgilityPack 的 API 後,發現真是 HTML 解析利器,於是花些時間做一個例子記錄下。
 
本次以下載博客園隨筆分類文章爲例,分兩部分實現:第一部分是將採集到的文章放到集合變量中,第二部分是通過操作集合變量將文章下載到本地。
這樣做效率較低,因爲其實可以直接邊採集文章邊下載。暫時沒有考慮效率問題,僅僅只是實現功能。下面簡單闡述下。
 
獲取隨筆分類
 
根據輸入的博客名取得對應的隨筆分類。
 
複製代碼
   /// <summary>
        /// Fetches the post categories listed in the sidebar of a cnblogs blog.
        /// </summary>
        /// <param name="uname">Blog name as it appears in the cnblogs URL.</param>
        /// <returns>
        /// One <see cref="BlogType"/> per category link that has an href, or <c>null</c>
        /// when the sidebar could not be fetched or yields no usable category link.
        /// </returns>
        private static List<BlogType> GettBlogTypeList(string uname)
        {
            string url = "http://www.cnblogs.com/" + uname + "/mvc/blog/sidecolumn.aspx?blogApp=" + uname;
            string htmlStr = CommonHelper.GetRequestStr(url);

            // A failed request must not reach LoadHtml, which throws on null input.
            if (string.IsNullOrEmpty(htmlStr))
                return null;

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlStr);
            var nodes = doc.DocumentNode.SelectNodes("//div[@id='sidebar_postcategory']//a"); // category links
            if (nodes == null || nodes.Count <= 0)
                return null;

            List<BlogType> list = new List<BlogType>();
            foreach (var node in nodes)
            {
                // An anchor without href has no page to crawl; skip it instead of
                // dereferencing a missing attribute.
                string aUrl = node.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrEmpty(aUrl))
                    continue;

                string name = node.InnerText;
                list.Add(new BlogType()
                {
                    BlogTypeUrl = aUrl,
                    // Keep only the text before the first "(" as the plain name.
                    // Split returns the whole string when there is no "(".
                    BlogTypeName = name.Split('(')[0],
                    BlogTypeNameShow = name
                });
            }

            // Keep the "null means nothing found" contract of the early exits above.
            return list.Count > 0 ? list : null;
        }

 

  /// <summary>
  /// A post category scraped from a blog's sidebar.
  /// </summary>
  public class BlogType
    {
        /// <summary>Address of the category page (the href of the sidebar link).</summary>
        public string BlogTypeUrl { get; set; }

        /// <summary>Category name: the link text up to, but excluding, the first "(".</summary>
        public string BlogTypeName { get; set; }

        /// <summary>Full link text exactly as it appears in the sidebar.</summary>
        public string BlogTypeNameShow { get; set; }
    }
複製代碼

 

 例如獲取到的隨筆分類如下:
 
 
採集分類的文章
 
採用兩步實現,第一步獲取只包含標題和url的文章,第二步再獲取文章內容。
      
複製代碼
 /// <summary>
        /// Crawls each category page and collects the title and URL of every post it lists.
        /// Post content is not downloaded here.
        /// </summary>
        /// <param name="blogTypes">Categories to crawl; <c>null</c> is treated as an empty list.</param>
        /// <param name="useTime">Receives the elapsed time in milliseconds.</param>
        /// <returns>
        /// A map from category to the posts found on its page. A category whose page
        /// could not be fetched is present with an empty list.
        /// </returns>
        public static Dictionary<BlogType, List<BlogInfo>> GetBlogsByType(List<BlogType> blogTypes, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            Dictionary<BlogType, List<BlogInfo>> dic = new Dictionary<BlogType, List<BlogInfo>>();

            if (blogTypes != null)
            {
                foreach (var blogType in blogTypes)
                {
                    // Dictionary.Add throws on a null or repeated key; skip both.
                    if (blogType == null || dic.ContainsKey(blogType))
                        continue;

                    List<BlogInfo> list = new List<BlogInfo>();
                    dic.Add(blogType, list);

                    string html = CommonHelper.GetRequestStr(blogType.BlogTypeUrl);
                    if (string.IsNullOrEmpty(html))
                        continue; // fetch failed: keep the category with no posts

                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(html);

                    // Fall back to the sidebar name when the page has no heading,
                    // instead of dereferencing a null node.
                    var typeNameNode = doc.DocumentNode.SelectSingleNode("//div[@class='entrylist']/h1");
                    string typeName = typeNameNode != null ? typeNameNode.InnerText : blogType.BlogTypeName;

                    var listPosttitleNodes = doc.DocumentNode.SelectNodes("//div[@class='entrylistPosttitle']/a");
                    if (listPosttitleNodes == null)
                        continue;

                    foreach (var titleNode in listPosttitleNodes)
                    {
                        string postUrl = titleNode.GetAttributeValue("href", string.Empty);
                        if (string.IsNullOrEmpty(postUrl))
                            continue; // nothing to download for a link without href

                        Console.WriteLine("正在爬取文章【{0}】...", titleNode.InnerText);
                        list.Add(new BlogInfo()
                        {
                            BlogUrl = postUrl,
                            BlogTitle = titleNode.InnerText,
                            BlogTypeName = typeName
                        });
                    }
                }
            }

            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return dic;
        }

 

 

     /// <summary>
        /// Downloads every post page and fills in its content, publish time and author.
        /// </summary>
        /// <param name="dic">Category-to-posts map; the posts are updated in place.</param>
        /// <param name="useTime">Receives the elapsed time in milliseconds.</param>
        /// <returns>The same dictionary instance that was passed in.</returns>
        public static Dictionary<BlogType, List<BlogInfo>> GetBlogDetail(Dictionary<BlogType, List<BlogInfo>> dic, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            HtmlDocument doc = new HtmlDocument();

            // Only the posts inside the lists are mutated, so the values can be
            // enumerated directly: no per-index key lookup and no write-back needed.
            foreach (var list in dic.Values)
            {
                foreach (var blog in list)
                {
                    Console.WriteLine("正在獲取文章【{0}】內容...", blog.BlogTitle);

                    // A failed fetch is parsed as an empty page, so the three
                    // "failed" placeholders below apply instead of an exception.
                    doc.LoadHtml(CommonHelper.GetRequestStr(blog.BlogUrl) ?? string.Empty);

                    var bodyNode = doc.DocumentNode.SelectSingleNode("//div[@id='cnblogs_post_body']");
                    var dateNode = doc.DocumentNode.SelectSingleNode("//span[@id='post-date']");
                    var userNode = doc.DocumentNode.SelectSingleNode("//div[@class='postDesc']/a[1]");

                    blog.BlogContent = bodyNode == null ? "內容獲取失敗" : bodyNode.InnerHtml;
                    blog.BlogPostTime = dateNode == null ? "發佈時間獲取失敗" : dateNode.InnerText;
                    blog.BlogName = userNode == null ? "用戶獲取失敗" : userNode.InnerText;
                }
            }

            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return dic;
        }

 

    /// <summary>
    /// One crawled post: link and title from the category page, the rest from the post page.
    /// </summary>
    public class BlogInfo
    {
        /// <summary>Address of the post page.</summary>
        public string BlogUrl { get; set; }

        /// <summary>Author text taken from the post page.</summary>
        public string BlogName { get; set; }

        /// <summary>Post title as shown in the category list.</summary>
        public string BlogTitle { get; set; }

        /// <summary>Inner HTML of the post body.</summary>
        public string BlogContent { get; set; }

        /// <summary>Heading text of the category page the post was found on.</summary>
        public string BlogTypeName { get; set; }

        /// <summary>Publish time text exactly as shown on the post page.</summary>
        public string BlogPostTime { get; set; }
    }
複製代碼

 

 下載到本地
 
根據上面採集到的文章再一步步下載到本地,期間分兩步,第一步下載圖片,第二步下載文章內容。
 
 /// <summary>
        /// Writes every crawled post to disk as an .html file, one folder per category,
        /// after downloading its images and neutralising its links.
        /// </summary>
        /// <param name="dic">Category-to-posts map with post content already filled in.</param>
        /// <param name="uname">Blog name; used as the folder name under "BlogFiles".</param>
        /// <param name="useTime">Receives the elapsed time in milliseconds.</param>
        /// <returns>The root folder that holds the files of this blog.</returns>
        public static string DowanloadBlog(Dictionary<BlogType, List<BlogInfo>> dic, string uname, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            int countFlag = 0;
            string rootPath = AppDomain.CurrentDomain.BaseDirectory + "BlogFiles\\" + uname;

            foreach (var entry in dic)
            {
                var blogType = entry.Key;
                var blogList = entry.Value;
                var dicPath = rootPath + "\\" + blogType.BlogTypeName;
                Console.WriteLine("<<開始處理分類【{0}】<<", blogType.BlogTypeName);
                FileHelper.CreatePath(dicPath);

                foreach (var blogModel in blogList)
                {
                    countFlag++;
                    try
                    {
                        // Log the post being processed now, not the previous one.
                        Console.WriteLine("~~~~開始處理文章{0}【{1}】~~~~", countFlag, blogModel.BlogTitle);
                        var filePath = dicPath + "\\" + FileHelper.FilterInvalidChar(blogModel.BlogTitle, "_") + ".html";
                        HtmlDocument doc = new HtmlDocument();
                        doc.DocumentNode.InnerHtml = blogModel.BlogContent;

                        // Download the images and point them at the local copies.
                        Console.WriteLine("~~開始處理圖片");
                        var imgPath = dicPath + "\\images";
                        FileHelper.CreatePath(imgPath);
                        SaveImage(doc, imgPath);
                        Console.WriteLine("~~處理圖片完成");

                        // Neutralise links so the offline copy does not navigate away.
                        var aNodes = doc.DocumentNode.SelectNodes("//a");
                        if (aNodes != null)
                        {
                            foreach (var anchor in aNodes)
                            {
                                var href = anchor.Attributes["href"];
                                if (href != null && href.Value != "#")
                                {
                                    // void requires an operand; "void()" is a script syntax error.
                                    href.Value = "javascript:void(0)";
                                }
                            }
                        }

                        // The title div is closed before the body div opens.
                        doc.DocumentNode.InnerHtml = "<div id='div_head'>" + uname + " " + blogType.BlogTypeName + "</div><div id='div_title'>" + blogModel.BlogTitle + "</div><div id='div_body'>" + doc.DocumentNode.InnerHtml + "</div>";
                        doc.Save(filePath, Encoding.UTF8);
                        Console.WriteLine("~~~~處理文章{0}【{1}】完畢~~~~", countFlag, blogModel.BlogTitle);
                    }
                    catch (Exception ex)
                    {
                        // One bad post is recorded in the category's error log and must
                        // not stop the rest of the backup.
                        string errorMsg = DateTime.Now.ToString("yyyyMMdd HH:mm:ss") + "\r\n" + "url=" + blogModel.BlogUrl + "\r\n" + "title=" + blogModel.BlogTitle + "\r\n" + "errorMsg=" + ex.Message + "\r\n" + "stackTrace=" + ex.StackTrace + "\r\n\r\n\r\n";
                        Console.WriteLine("error>>處理文章【{0}】出現錯誤,開始記錄錯誤信息~~", blogModel.BlogTitle);
                        FileHelper.SaveTxtFile(dicPath, "errorLog.txt", errorMsg, false);
                        Console.WriteLine("error>>處理文章【{0}】出現錯誤,記錄錯誤信息完成~~", blogModel.BlogTitle);
                    }
                }

                Console.WriteLine("<<處理分類【{0}】完成<<", blogType.BlogTypeName);
            }

            // Stop (not restart) the timer before reading it.
            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return rootPath;
        }

 

 /// <summary>
        /// Downloads every image referenced by the document into <paramref name="filePath"/>
        /// and points each successfully downloaded image at its local copy.
        /// </summary>
        /// <param name="doc">Parsed post content; its img src attributes are rewritten in place.</param>
        /// <param name="filePath">Existing folder that receives the image files.</param>
        /// <exception cref="Exception">
        /// Any failure while handling an image, rethrown with the "SaveImage_Error:" prefix
        /// and the original exception attached as InnerException.
        /// </exception>
        public static void SaveImage(HtmlDocument doc, string filePath)
        {
            var imgNodes = doc.DocumentNode.SelectNodes("//img");
            if (imgNodes == null || imgNodes.Count <= 0)
                return;

            foreach (var img in imgNodes)
            {
                try
                {
                    // An img without src yields "" here rather than a null dereference.
                    string src = img.GetAttributeValue("src", string.Empty);
                    if (!src.Contains("/"))
                        continue;

                    string fileName = src.Substring(src.LastIndexOf("/") + 1);
                    Console.WriteLine("~~開始下載圖片【{0}】~~", fileName);
                    string imgPath = filePath + "\\" + fileName;

                    byte[] imgByte = CommonHelper.GetRequestByteArr(src);
                    if (imgByte != null)
                    {
                        FileHelper.SaveImage(imgPath, imgByte);
                        // Repoint only after the download worked, so a failed image
                        // keeps its original remote address instead of a dead local path.
                        img.Attributes["src"].Value = imgPath;
                        Console.WriteLine("~~下載圖片【{0}】完成~~", fileName);
                    }
                    else
                    {
                        Console.WriteLine("~~下載圖片【{0}】失敗~~", fileName);
                    }
                }
                catch (Exception ex)
                {
                    // Same type and message as before; the cause is now preserved.
                    throw new Exception("SaveImage_Error:" + ex.Message, ex);
                }
            }
        }
View Code

 

程序入口
 
主要代碼如下
                 
複製代碼
    // Excerpt of the program's entry point: crawl, then download, then report.
    // NOTE(review): `uname` and `handlerRight` are declared outside this excerpt.
    // Step 1: read the category list of the blog. GettBlogTypeList returns null
    // when no category is found, and the value is passed on unchecked below.
    var types = GettBlogTypeList(uname);
                    // Elapsed milliseconds reported by each stage through its out parameter.
                    long time1 = 0;
                    long time2 = 0;
                    long timeDownload = 0;
                    Console.WriteLine("正在爬取,請耐心等待..." );
                    // Step 2: collect post titles/URLs per category, then fetch each post's content.
                    var blogList = GetBlogsByType(types,out time1);
                    var blogDetailList = GetBlogDetail(blogList,out time2);
                    Console.WriteLine("爬取完畢,開始下載..." );
                    // Step 3: write posts and images to disk; the returned value is the output folder.
                    string filePath=DowanloadBlog(blogDetailList, uname,out timeDownload);
                    Console.WriteLine("**處理完畢,爬取用時{0}ms,下載用時{1}ms,{2}" , time1+time2, timeDownload, filePath);
                    // NOTE(review): presumably ends an enclosing retry loop — confirm against the full method.
                    handlerRight = false;
 
複製代碼

 

演示效果
 
 
 
文件存儲在項目bin目錄下,一個用戶一個文件夾
 
按隨筆分類生成不同的文件夾
 
 
 
生成.html文件,一個分類的全部圖片都存在該分類下的images下。
 
 

歡迎指出程序bug,提出優化意見,(●'◡'●)
 

出處:https://www.cnblogs.com/kungge/p/5956501.html

==========================================================================

上面獲取文章分類的方法已經無法使用了,查看博客園的 API 說明文檔,參考文章如下:

http://wcf.open.cnblogs.com/blog/helpnode

根據這個文檔裏面的提示,我們可以分兩步備份文章:先獲取文章標題和摘要,然後再下載每一篇文章,保存文章和圖片到本地文件夾。

首先我們定義一個博客文章類

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs.Models
{
    /// <summary>
    /// One post as described by an entry of the post feed, plus its downloaded body.
    /// Every value is kept as the raw text read from the feed.
    /// </summary>
    public class BlogInfo
    {
        /// <summary>Post identifier; used to request the post body.</summary>
        public string id { get; set; }

        /// <summary>Post title.</summary>
        public string title { get; set; }

        /// <summary>Short summary of the post.</summary>
        public string summary { get; set; }

        /// <summary>Publish time text.</summary>
        public string published { get; set; }

        /// <summary>Last-update time text.</summary>
        public string updated { get; set; }

        /// <summary>Address of the original post (the href of the entry's link element).</summary>
        public string link { get; set; }

        /// <summary>Text of the entry's diggs element.</summary>
        public string diggs { get; set; }

        /// <summary>Text of the entry's views element.</summary>
        public string views { get; set; }

        /// <summary>Text of the entry's comments element.</summary>
        public string comments { get; set; }

        /// <summary>Post content as HTML; filled in after the summary list is built.</summary>
        public string body { get; set; }
    }
}
View Code

再定義兩個工具類

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs.Common
{
    /// <summary>
    /// HTTP download and MD5 hashing helpers shared by the backup tool.
    /// The request helpers never throw: a failure is appended to Error.txt in the
    /// application directory and reported to the caller as <c>null</c>.
    /// </summary>
    public static class CommonHelper
    {
        #region HttpClient
        private static HttpClient _httpClient;

        /// <summary>
        /// Client shared by all requests, created on first use with a 4-minute timeout.
        /// Assigning a value replaces it; assigning <c>null</c> makes the next read create a new one.
        /// </summary>
        public static HttpClient httpClient
        {
            get
            {
                if (_httpClient == null)
                {
                    _httpClient = new HttpClient();
                    _httpClient.Timeout = new TimeSpan(0, 4, 0);
                }
                return _httpClient;
            }
            set { _httpClient = value; }
        }

        #endregion

        #region GET requests
        /// <summary>
        /// Sends a GET request and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <returns>The response body (whatever the status code), or <c>null</c> if the request failed.</returns>
        public static string GetRequestStr(string url)
        {
            try
            {
                // GetAwaiter().GetResult() surfaces the real exception; .Result would wrap
                // it in an AggregateException whose message says nothing useful in the log.
                using (var response = httpClient.GetAsync(new Uri(url)).GetAwaiter().GetResult())
                {
                    return response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
                }
            }
            catch (Exception ex)
            {
                LogRequestError(url, ex);
                return null;
            }
        }

        /// <summary>
        /// Returns the bytes behind an image reference: downloads an absolute URL, or
        /// decodes an inline base64 <c>data:image</c> URI.
        /// </summary>
        /// <param name="url">Absolute URL or <c>data:image</c> URI.</param>
        /// <returns>
        /// The bytes, or <c>null</c> when the reference is of neither kind, the server
        /// answers with an error status, or anything else goes wrong.
        /// </returns>
        public static byte[] GetRequestByteArr(string url)
        {
            try
            {
                if (url.Contains("://"))
                {
                    using (var response = httpClient.GetAsync(new Uri(url)).GetAwaiter().GetResult())
                    {
                        // An error page must not be handed back as if it were image data;
                        // the exception thrown here is logged by the catch below.
                        response.EnsureSuccessStatusCode();
                        return response.Content.ReadAsByteArrayAsync().GetAwaiter().GetResult();
                    }
                }

                if (url.StartsWith("data:image", StringComparison.Ordinal))
                {
                    // The payload is everything after the first comma, however many
                    // ";"-separated parameters precede it.
                    int comma = url.IndexOf(',');
                    return comma < 0 ? null : Convert.FromBase64String(url.Substring(comma + 1));
                }

                return null;
            }
            catch (Exception ex)
            {
                LogRequestError(url, ex);
                return null;
            }
        }
        #endregion

        #region POST requests
        /// <summary>
        /// Sends a POST request with an empty form-encoded body and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <returns>The response body, or <c>null</c> if the request failed.</returns>
        public static string PostRequestStr(string url)
        {
            try
            {
                string contentStr = "";
                using (StringContent sc = new StringContent(contentStr))
                {
                    sc.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("application/x-www-form-urlencoded");//todo
                    using (var response = httpClient.PostAsync(new Uri(url), sc).GetAwaiter().GetResult())
                    {
                        return response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
                    }
                }
            }
            catch (Exception ex)
            {
                LogRequestError(url, ex);
                return null;
            }
        }
        #endregion

        #region MD5 hashing

        /// <summary>
        /// Computes the MD5 hash of a string as lower-case hex, in its 32-character form
        /// or the 16-character form (characters 8..23 of the full hash).
        /// </summary>
        /// <param name="strWord">Text to hash.</param>
        /// <param name="bit">Length of the result: 16 or 32.</param>
        /// <returns>The hash, or an empty string for any other <paramref name="bit"/>.</returns>
        public static string MD5Encrypt(string strWord, int bit)
        {
            string hash = MD5Encrypt(strWord);
            if (bit == 16)
                return hash.Substring(8, 16);
            if (bit == 32)
                return hash; // the full hash
            return string.Empty;
        }

        /// <summary>
        /// Computes the MD5 hash of a string (encoded as gb2312) as 32 lower-case hex characters.
        /// Despite the method name this is a one-way hash, not encryption.
        /// </summary>
        /// <param name="strWord">Text to hash.</param>
        /// <returns>The hash as lower-case hex.</returns>
        public static string MD5Encrypt(string strWord)
        {
            // The encoding is part of the result and therefore left as it was.
            using (var md5Hasher = System.Security.Cryptography.MD5.Create())
            {
                byte[] hashedDataBytes = md5Hasher.ComputeHash(Encoding.GetEncoding("gb2312").GetBytes(strWord));
                StringBuilder tmp = new StringBuilder(hashedDataBytes.Length * 2);
                foreach (byte b in hashedDataBytes)
                {
                    tmp.Append(b.ToString("x2"));
                }
                return tmp.ToString();
            }
        }
        #endregion

        /// <summary>Appends one failed-request record to Error.txt in the application directory.</summary>
        private static void LogRequestError(string url, Exception ex)
        {
            string logFile = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Error.txt");
            FileHelper.SaveTxtFile(logFile, $"[{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")}]  請求URL發生異常:" + url + Environment.NewLine + ex.Message + Environment.NewLine, false);
        }
    }
}
View Code
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs.Common
{
    /// <summary>
    /// Small file-system helpers used by the backup tool. The save methods are
    /// best-effort: a failure is written to the console and not rethrown.
    /// </summary>
    public static class FileHelper
    {
        #region Create directory
        /// <summary>
        /// Creates the directory if it does not exist yet.
        /// </summary>
        /// <param name="path">Directory to create.</param>
        /// <returns><c>true</c> if the directory was created, <c>false</c> if it already existed.</returns>
        public static bool CreatePath(string path)
        {
            if (Directory.Exists(path))
                return false;

            Directory.CreateDirectory(path);
            return true;
        }
        #endregion

        #region Save image
        /// <summary>
        /// Writes image bytes to a file, replacing any existing file and creating the
        /// containing directory when needed.
        /// </summary>
        /// <param name="filePath">Destination file.</param>
        /// <param name="bt">File content.</param>
        public static void SaveImage(string filePath, byte[] bt)
        {
            try
            {
                string dir = Path.GetDirectoryName(filePath);
                if (!string.IsNullOrEmpty(dir))
                    CreatePath(dir);
                File.WriteAllBytes(filePath, bt);
            }
            catch (Exception ex)
            {
                Console.WriteLine("SaveImage 方法發生異常:" + ex.Message);
            }
        }
        #endregion

        #region Save text file
        /// <summary>
        /// Writes or appends text to a file, creating its directory when needed.
        /// </summary>
        /// <param name="filePath">Destination file.</param>
        /// <param name="txtStr">Text to write.</param>
        /// <param name="isCover"><c>true</c> to overwrite the file, <c>false</c> to append to it.</param>
        public static void SaveTxtFile(string filePath, string txtStr, bool isCover = true)
        {
            try
            {
                // A bare file name has no directory part; trying to create "" would
                // throw and the text would never be written.
                string dir = Path.GetDirectoryName(filePath);
                if (!string.IsNullOrEmpty(dir))
                    CreatePath(dir);

                // Encoding.Default is kept so that appends match files written earlier.
                if (isCover)
                    File.WriteAllText(filePath, txtStr, Encoding.Default);
                else
                    File.AppendAllText(filePath, txtStr, Encoding.Default);
            }
            catch (Exception ex)
            {
                Console.WriteLine("SaveTxtFile 方法發生異常:" + ex.Message);
            }
        }
        #endregion

        #region Sanitize file name
        /// <summary>
        /// Replaces every character that is not allowed in a file name, and every space,
        /// with <paramref name="replaceStr"/>.
        /// </summary>
        /// <param name="fileName">Candidate file name; null or empty is returned unchanged.</param>
        /// <param name="replaceStr">Replacement text.</param>
        /// <returns>The sanitized name.</returns>
        public static string FilterInvalidChar(string fileName, string replaceStr)
        {
            if (string.IsNullOrEmpty(fileName))
                return fileName;

            // Splitting on the invalid characters and joining with the replacement
            // swaps each of them in a single pass.
            string cleaned = string.Join(replaceStr, fileName.Split(Path.GetInvalidFileNameChars()));
            return cleaned.Replace(" ", replaceStr);
        }
        #endregion
    }
}
View Code

最後,再來個主要的類

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs
{
    using Common;
    using HtmlAgilityPack;
    using Models;
    using System.Diagnostics;

    class Program
    {
        // Output root of the current run; Main sets it to
        // <base dir>\cnblogFiles\<blog name>_<yyyyMMdd>. The log files are written here.
        private static string userPath = "";
        // Folder for the saved post pages; Main sets it to userPath + "\Blogs".
        private static string blogPath = "";
        // Folder for the downloaded images; Main sets it to userPath + "\Images".
        private static string imgPath = "";
        // Skeleton page loaded as the starting document for both the index page
        // and every saved post page.
        private static string htmlTemp = @"
<!DOCTYPE HTML>
<html>
<head>
<title>C#備份博客園文章列表</title>
</head>
<body>
</body>
</html>";

        /// <summary>
        /// Console entry point: asks for a blog name, fetches the post list, writes the
        /// index page, downloads every post and saves posts and images to disk.
        /// </summary>
        /// <param name="args">Not used.</param>
        static void Main(string[] args)
        {
            // Ask until a non-blank name is entered. Each prompt is followed by exactly
            // one read, so no input is thrown away.
            string uname;
            do
            {
                ShowLog("--請輸入要下載的博客名稱--");
                uname = Console.ReadLine();
                if (uname == null)
                    return; // input stream closed: nothing more can be asked
            } while (string.IsNullOrWhiteSpace(uname));

            // Fetch the post list; ask for another name as long as nothing comes back.
            List<BlogInfo> blogList;
            while (true)
            {
                uname = uname.Trim();
                userPath = AppDomain.CurrentDomain.BaseDirectory + "cnblogFiles\\" + uname + "_" + DateTime.Now.ToString("yyyyMMdd");
                blogPath = userPath + "\\Blogs";
                FileHelper.CreatePath(blogPath);
                imgPath = userPath + "\\Images";
                FileHelper.CreatePath(imgPath);

                blogList = GettBlogSummaryList(uname);
                if (blogList != null && blogList.Count > 0)
                    break;

                // The list is fetched once per name, at the top of the loop.
                ShowLog("--未獲取到文章列表,請從新輸入要下載的博客名稱--", true);
                uname = Console.ReadLine();
                if (uname == null)
                    return;
            }

            // Write the index page.
            SaveBlogsSummary(blogList);

            // Download the post bodies, then save posts and images.
            long crawlTime;
            long timeDownload;
            ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
            ShowLog($"--正在爬取博客文章,共計 {blogList.Count} 篇,請耐心等待...", true);
            var blogDetailCount = GetBlogDetail(blogList, out crawlTime);
            ShowLog($"--爬取完畢,成功下載了【{blogDetailCount}/{blogList.Count}】篇博客文章.", true);
            ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
            int saveCount = SaveBlogDetail(blogList, uname, out timeDownload);
            ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
            ShowLog(saveCount == blogList.Count ? "保存所有文章成功!" : $"保存文章【{saveCount}】條成功");
            ShowLog($"--處理完畢,爬取用時{crawlTime}ms,下載用時{timeDownload}ms,\r\n保存路徑:{userPath}", true);

            Console.ReadKey();
        }



        #region 獲取博客標題
        /// <summary>
        /// Pages through the post feed of a blog and collects the summary of every post.
        /// </summary>
        /// <param name="uname">Blog name used in the feed URL.</param>
        /// <returns>
        /// The summaries found; empty when the feed has no entries or the first request fails.
        /// Paging stops at the first page that fails to load or has no entries.
        /// </returns>
        private static List<BackupBlogs.Models.BlogInfo> GettBlogSummaryList(string uname)
        {
            const string msgTitle = "第1階段:";
            const int pageSize = 10;
            List<BlogInfo> list = new List<BlogInfo>();
            ShowLog(msgTitle + $"獲取{uname}的隨筆以下:", true);
            HtmlDocument doc = new HtmlDocument();

            for (int pageNum = 1; ; pageNum++)
            {
                string url = "http://wcf.open.cnblogs.com/blog/u/" + uname + $"/posts/{pageNum}/{pageSize}";
                string htmlStr = CommonHelper.GetRequestStr(url);

                // A failed request must not reach LoadHtml, which throws on null input.
                if (string.IsNullOrEmpty(htmlStr))
                    break;

                doc.LoadHtml(htmlStr);
                var nodes = doc.DocumentNode.SelectNodes("//entry");
                if (nodes == null || nodes.Count <= 0)
                    break; // past the last page

                int currPageBlogCount = 0;
                foreach (var item in nodes)
                {
                    currPageBlogCount++;
                    // Only entries with exactly ten child nodes are used.
                    if (item.ChildNodes.Count != 10)
                        continue;

                    // Reads the text of a named child, or "" when that child is missing.
                    Func<string, string> text = name =>
                    {
                        var child = item.ChildNodes[name];
                        return child == null ? string.Empty : child.InnerText;
                    };

                    string id = text("id");
                    if (id.Length == 0)
                        continue; // the body is requested by id, so an entry without one is unusable

                    var linkNode = item.ChildNodes["link"];
                    BlogInfo blogSummary = new BlogInfo()
                    {
                        id = id,
                        comments = text("comments"),
                        diggs = text("diggs"),
                        link = linkNode == null ? string.Empty : linkNode.GetAttributeValue("href", string.Empty),
                        published = text("published"),
                        summary = text("summary"),
                        title = text("title"),
                        updated = text("updated"),
                        views = text("views")
                    };
                    list.Add(blogSummary);
                    ShowLog(msgTitle + $"【{currPageBlogCount + (pageNum - 1) * pageSize}】獲取文章標題【{blogSummary.title}】", true);
                }
            }

            return list;
        }


        /// <summary>
        /// Writes index.html into the output root: one table row per post with publish
        /// time, view count, comment count and a link to the saved post page.
        /// </summary>
        /// <param name="blogSummaries">Posts to list; <c>null</c> is treated as empty.</param>
        private static void SaveBlogsSummary(List<BackupBlogs.Models.BlogInfo> blogSummaries)
        {
            if (blogSummaries == null)
                blogSummaries = new List<BackupBlogs.Models.BlogInfo>();

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlTemp);
            HtmlNode nList = doc.DocumentNode.SelectSingleNode("/html/body");
            nList.AppendChild(HtmlNode.CreateNode("<table border='1' cellpadding='0' cellspacing='0' width='98%'><tr>" +
                "<th width='150'>發佈時間</th>" +
                "<th width='100'>閱讀數</th>" +
                "<th width='100'>評論數</th>" +
                "<th width='*'>博文標題</th>" +
                "</tr></table>"));
            // Look the table up once instead of once per row.
            HtmlNode table = nList.SelectSingleNode("table");

            ShowLog(Environment.NewLine + "開始保存博客文章標題");
            foreach (var item in blogSummaries)
            {
                // An unparsable date is shown as received rather than aborting the index.
                DateTime publishedAt;
                string publishedText = DateTime.TryParse(item.published, out publishedAt)
                    ? publishedAt.ToString("yyyy-MM-dd HH:mm")
                    : item.published;

                // The link target uses the same file name that the saved post page gets.
                string divTR = "<tr>";
                divTR += "<td>" + publishedText + "</td>";
                divTR += "<td align='right'>" + item.views + "</td>";
                divTR += "<td align='right'>" + item.comments + "</td>";
                divTR += "<td><a href='.\\Blogs\\" + System.Web.HttpUtility.UrlEncode(FileHelper.FilterInvalidChar(item.title, "_")) + ".html" + "' target='_blank'>" + item.title + "</a></td>";
                divTR += "</tr>";
                table.AppendChild(HtmlNode.CreateNode(divTR));
            }
            doc.Save(userPath + "\\index.html", Encoding.UTF8);
            ShowLog($"共保存【{blogSummaries.Count}】篇博客標題保存完成", true);
        }

        #endregion


        #region 獲取博客詳細信息
        /// <summary>
        /// Downloads the body of every post and stores it in <c>body</c>. A post whose
        /// body cannot be fetched gets the placeholder text "內容獲取失敗" instead.
        /// </summary>
        /// <param name="blogs">Posts to complete; updated in place.</param>
        /// <param name="useTime">Receives the elapsed time in milliseconds.</param>
        /// <returns>Number of posts whose body was downloaded successfully.</returns>
        private static int GetBlogDetail(List<BlogInfo> blogs, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            const string msgTitle = "第2階段:";
            const string failedBody = "內容獲取失敗";
            HtmlDocument doc = new HtmlDocument();

            for (int k = 0; k < blogs.Count; k++)
            {
                string url = $"http://wcf.open.cnblogs.com/blog/post/body/{blogs[k].id}";
                ShowLog(msgTitle + string.Format("【{0}/{1}】正在獲取文章【{2}】", k + 1, blogs.Count, blogs[k].title), true);
                string blogBody = CommonHelper.GetRequestStr(url);

                // A failed request marks this post as failed and moves on; it must not
                // reach LoadHtml, which throws on null input and would end the run.
                HtmlNode bodyNode = null;
                if (!string.IsNullOrEmpty(blogBody))
                {
                    doc.LoadHtml(blogBody);
                    bodyNode = doc.DocumentNode.SelectSingleNode("//string");
                }
                blogs[k].body = bodyNode == null ? failedBody : System.Web.HttpUtility.HtmlDecode(bodyNode.InnerHtml);
            }

            ShowLog("下載失敗的文章以下:", true);
            // Materialize once: the result is both listed and counted.
            var errBlogs = blogs.Where(x => x.body == failedBody).ToList();
            foreach (var item in errBlogs)
            {
                ShowLog(Newtonsoft.Json.JsonConvert.SerializeObject(item), true);
            }

            int getDetailCount = blogs.Count - errBlogs.Count;
            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return getDetailCount;
        }
        #endregion


        #region 保存博客
        /// <summary>
        /// Downloads the images of one post into <paramref name="filePath"/> and rewrites
        /// the src of each downloaded image in <c>blog.body</c> to the relative path of its
        /// local copy. Images that cannot be downloaded keep their original src.
        /// </summary>
        /// <param name="blog">Post whose body is scanned and updated.</param>
        /// <param name="filePath">Folder that receives the image files.</param>
        /// <param name="uname">Not used.</param>
        private static void SaveImage(BlogInfo blog, string filePath, string uname)
        {
            const string msgTitle = "第3階段:";
            if (string.IsNullOrEmpty(blog.body))
                return;

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(blog.body);
            var imgNodes = doc.DocumentNode.SelectNodes("//img");
            if (imgNodes == null || imgNodes.Count <= 0)
                return;

            foreach (var img in imgNodes)
            {
                try
                {
                    // An img without src yields "" here rather than a null dereference.
                    string src = img.GetAttributeValue("src", string.Empty);

                    // Protocol-relative address ("//host/path"): give it a scheme.
                    if (src.StartsWith("//", StringComparison.Ordinal))
                        src = "http:" + src;

                    if (!src.Contains("/"))
                        continue;

                    string fileName;
                    if (src.StartsWith("data:image", StringComparison.Ordinal))
                    {
                        // Inline image: name it by post id and content hash, with the
                        // media subtype as extension.
                        fileName = blog.id + "_" + CommonHelper.MD5Encrypt(src) + "." + src.Split(';')[0].Split('/')[1];
                    }
                    else
                    {
                        fileName = src.Substring(src.LastIndexOf("/") + 1);
                        // A query string or fragment is not part of the file name.
                        int cut = fileName.IndexOfAny(new[] { '?', '#' });
                        if (cut >= 0)
                            fileName = fileName.Substring(0, cut);
                    }

                    fileName = FileHelper.FilterInvalidChar(fileName, "_");
                    if (string.IsNullOrEmpty(fileName))
                        continue; // e.g. an address ending in "/"

                    byte[] imgByte = CommonHelper.GetRequestByteArr(src);
                    if (imgByte != null)
                    {
                        FileHelper.SaveImage(filePath + "\\" + fileName, imgByte);
                        // Repoint only after the download worked; the path is relative
                        // to the folder holding the saved post pages.
                        img.Attributes["src"].Value = filePath.Replace(userPath, "..") + "\\" + fileName;
                    }
                    else
                    {
                        ShowLog(msgTitle + $"下載圖片失敗!   下載URL:{src}", true);
                        ShowLog(msgTitle + $"下載圖片失敗詳細博客:【id:{blog.id};title:{blog.title};url:{blog.link};】");
                    }
                }
                catch (Exception ex)
                {
                    // One bad image must not stop the remaining ones.
                    ShowErrorLog(msgTitle + " SaveImage 方法發生異常:" + ex.Message, true);
                }
            }

            blog.body = doc.DocumentNode.InnerHtml;
        }

        /// <summary>
        /// Saves every post as an .html page in the posts folder, downloading its images first.
        /// A post that fails is logged and skipped.
        /// </summary>
        /// <param name="blogs">Posts with their bodies already downloaded.</param>
        /// <param name="uname">Blog name; passed on to the image step.</param>
        /// <param name="useTime">Receives the elapsed time in milliseconds.</param>
        /// <returns>Number of posts that were saved successfully.</returns>
        public static int SaveBlogDetail(List<BlogInfo> blogs, string uname, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            const string msgTitle = "第3階段:";
            int countFlag = 0;   // posts attempted; used for progress output
            int savedCount = 0;  // posts actually written

            foreach (var item in blogs)
            {
                countFlag++;
                try
                {
                    ShowLog(string.Format(msgTitle + "【{0}/{1}】開始處理【{2}】", countFlag, blogs.Count, item.title), true);
                    var filePath = blogPath + "\\" + FileHelper.FilterInvalidChar(item.title, "_") + ".html";

                    // Download the images and point the body at the local copies.
                    SaveImage(item, imgPath, uname);

                    // An unparsable date is shown as received rather than failing the post.
                    DateTime publishedAt;
                    string publishedText = DateTime.TryParse(item.published, out publishedAt)
                        ? publishedAt.ToString("yyyy-MM-dd HH:mm")
                        : item.published;

                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(htmlTemp);
                    doc.DocumentNode.SelectSingleNode("/html/head/title").InnerHtml = item.title;
                    var n1 = HtmlNode.CreateNode("<div id='div_head'><h1>" + item.title + "</h1><br />" +
                        "【發佈:" + publishedText + "】" +
                        "【閱讀數:" + item.views + "】" +
                        "【評論數:" + item.comments + "】" +
                        "<a href='" + item.link + "' target='_blank' >閱讀原文</a>" +
                        "</div>");
                    var n2 = HtmlNode.CreateNode("<div id='div_body'>" + item.body + "</div>");
                    var body = doc.DocumentNode.SelectSingleNode("/html/body");
                    body.AppendChild(n1);
                    body.AppendChild(n2);
                    doc.Save(filePath, Encoding.UTF8);
                    savedCount++;
                    ShowLog(msgTitle + string.Format("【{0}/{1}】處理文章【{2}】完畢", countFlag, blogs.Count, item.title), true);
                }
                catch (Exception ex)
                {
                    string errorMsg = DateTime.Now.ToString("yyyyMMdd HH:mm:ss") + "\r\n" + "url=" + item.link + "\r\n" + "title=" + item.title + "\r\n" + "errorMsg=" + ex.Message + "\r\n" + "stackTrace=" + ex.StackTrace + "\r\n\r\n\r\n";
                    ShowErrorLog(msgTitle + $"error>>處理文章【{item.title}】出現錯誤:{ex.Message}" + Environment.NewLine + errorMsg, true);
                }
            }

            // Stop (not restart) the timer before reading it.
            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            // Report saved posts, not attempts, so the caller can tell full from partial success.
            return savedCount;
        }


        #endregion

        /// <summary>
        /// Prints a message to the console and, on request, appends it with a timestamp
        /// to Log.txt in the output root.
        /// </summary>
        /// <param name="msg">Message text.</param>
        /// <param name="isSaveLog"><c>true</c> to also append the message to the log file.</param>
        private static void ShowLog(string msg, bool isSaveLog = false)
        {
            Console.WriteLine(msg);
            if (!isSaveLog)
            {
                return;
            }

            string stamp = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff");
            string line = "[" + stamp + "]  " + msg + Environment.NewLine;
            FileHelper.SaveTxtFile(userPath + "\\Log.txt", line, false);
        }
        /// <summary>
        /// Prints an error message to the console and, on request, appends it with a
        /// timestamp to Error.txt in the output root.
        /// </summary>
        /// <param name="msg">Message text.</param>
        /// <param name="isSaveLog"><c>true</c> to also append the message to the error file.</param>
        private static void ShowErrorLog(string msg, bool isSaveLog = false)
        {
            Console.WriteLine(msg);
            if (!isSaveLog)
            {
                return;
            }

            string stamp = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff");
            string line = "[" + stamp + "]  " + msg + Environment.NewLine;
            FileHelper.SaveTxtFile(userPath + "\\Error.txt", line, false);
        }

    }
}
View Code

 

執行試試吧

這裏只是給個代碼輪廓和思想,代碼中同樣沒有考慮多線程、內存資源的釋放等問題,難免會有異常和錯誤,後續繼續優化。

相關文章
相關標籤/搜索