通過連結獲取 HTML 源碼內容

/// <summary>
        /// Fetches the resource at <paramref name="url"/> and returns its body decoded as UTF-8 text.
        /// </summary>
        /// <param name="url">Absolute URI of the page to fetch; it is passed to the <see cref="Uri"/> constructor.</param>
        /// <returns>The complete response body as a string.</returns>
        /// <remarks>
        /// The body is always decoded as UTF-8. A page served in another encoding (for example GB2312)
        /// would need <c>Encoding.GetEncoding("gb2312")</c> passed to the reader instead.
        /// Exceptions raised while parsing the URI or performing the request are not caught here.
        /// </remarks>
        public static string GetContenFrommUrl(string url)
        {
            Uri uri = new Uri(url);
            // WebRequest lives in System.Net.
            WebRequest request = WebRequest.Create(uri);

            // The using blocks release the response, its stream and the reader even when
            // reading fails part-way, which the previous manual Close() calls did not guarantee.
            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts the <c>&lt;div id="listLeft"&gt;</c> element, together with any divs nested inside it, from an HTML string.
        /// </summary>
        /// <param name="strHTML">HTML text to search.</param>
        /// <returns>
        /// The markup of the first matching div, from its opening tag through its balanced closing tag,
        /// or an empty string when no such div is found.
        /// </returns>
        public static string GetDivFromStr(string strHTML)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // The balancing group "o" tracks nested <div>...</div> pairs so the match stops at the
            // closing tag that belongs to the listLeft div rather than at the first </div>.
            Match listLeftDiv = Regex.Match(strHTML, @"<div[^>]*?id=""listLeft""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>", RegexOptions.IgnoreCase);
            return listLeftDiv.Success ? listLeftDiv.Value : string.Empty;
        }

        /// <summary>
        /// Downloads an image and saves it into the fixed local image directory.
        /// </summary>
        /// <param name="URL">Address of the image to download.</param>
        /// <returns>The string <c>"2016/12/22/"</c> followed by the file name taken from <paramref name="URL"/>.</returns>
        public static string DowmLoadImage(string URL)
        {
            string saveDirectory = "D:/MyJob/HtmlToData/Images/";
            string fileName = System.IO.Path.GetFileName(URL);

            // WebClient lives in System.Net and is IDisposable; the using block releases it
            // even when the download throws.
            using (WebClient client = new System.Net.WebClient())
            {
                // Source is the image URL; destination is the save directory plus the image's file name.
                client.DownloadFile(URL, saveDirectory + fileName);
            }

            // NOTE(review): the returned prefix is a hard-coded date and does not match the save
            // directory above; presumably it is the relative path stored elsewhere - confirm with callers.
            return "2016/12/22/" + fileName;
        }

        /// <summary>
        /// Replaces one specific separator image with a horizontal rule.
        /// </summary>
        /// <param name="Content">HTML markup to process.</param>
        /// <returns>
        /// The markup with every occurrence of the first matching separator <c>img</c> tag replaced by
        /// <c>&lt;hr /&gt;</c>, or the input unchanged when no <c>img</c> tag points at the separator image.
        /// </returns>
        public static string ReplaceImage(string Content)
        {
            const string separatorImageUrl = "http://en.shio.gov.cn/file/images/split-e5.gif";

            // Regex lives in System.Text.RegularExpressions. The named group "imgUrl" captures the src value.
            Regex imgTag = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

            // Walk the img tags in document order and stop at the first one that is the separator image.
            for (Match tag = imgTag.Match(Content); tag.Success; tag = tag.NextMatch())
            {
                if (tag.Groups["imgUrl"].Value != separatorImageUrl)
                {
                    continue;
                }

                // string.Replace swaps every identical copy of the matched tag text, not just this one.
                return Content.Replace(tag.Value, "<hr />");
            }

            return Content;
        }

        /// <summary>
        /// Replaces the <c>&lt;div id="pages"&gt;</c> element with a horizontal rule.
        /// </summary>
        /// <param name="Content">HTML markup in which the replacement is made.</param>
        /// <param name="strHTML">HTML markup that is searched for the <c>pages</c> div.</param>
        /// <returns>
        /// <paramref name="Content"/> with every occurrence of the matched div markup replaced by
        /// <c>&lt;hr /&gt;</c>, or <paramref name="Content"/> unchanged when <paramref name="strHTML"/>
        /// contains no such div.
        /// </returns>
        public static string ReplaceDiv(string Content, string strHTML)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // The balancing group "o" tracks nested <div>...</div> pairs so the match covers the whole pages div.
            Match pagesDiv = Regex.Match(strHTML, @"<div[^>]*?id=""pages""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>", RegexOptions.IgnoreCase);

            if (!pagesDiv.Success)
            {
                // A failed match has an empty Value, and string.Replace throws ArgumentException
                // for an empty search string, so there is nothing to replace.
                return Content;
            }

            return Content.Replace(pagesDiv.Value, "<hr />");
        }

        /// <summary>
        /// Finds the first <c>img</c> tag in the markup, downloads the image it points to and returns
        /// the value produced by <c>DowmLoadImage</c>.
        /// </summary>
        /// <param name="strHTML">HTML markup to search.</param>
        /// <returns>
        /// The result of <c>DowmLoadImage</c> for the first image's <c>src</c>, or an empty string when
        /// there is no <c>img</c> tag or its <c>src</c> is empty.
        /// </returns>
        public static string GetImageSrc(string strHTML)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // IgnoreCase keeps this consistent with ReplaceImage, so tags written as <IMG SRC=...> are found too.
            Match imgTag = Regex.Match(strHTML, @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
            if (!imgTag.Success)
            {
                return string.Empty;
            }

            string imgUrl = imgTag.Groups["imgUrl"].Value;
            if (imgUrl.Length == 0)
            {
                // The pattern allows an empty src (e.g. src=""); there is nothing to download in that case.
                return string.Empty;
            }

            // Download the title image and hand back the path reported by the downloader.
            return DowmLoadImage(imgUrl);
        }

        /// <summary>
        /// Extracts the inner content and the <c>href</c> of the first <c>&lt;a&gt;</c> tag in the markup.
        /// </summary>
        /// <param name="AStr">HTML markup to search.</param>
        /// <returns>
        /// A two-element array: index 0 is the anchor's inner content, index 1 is its <c>href</c> value.
        /// Both elements are <c>null</c> when no quoted-href anchor is found.
        /// </returns>
        public static string[] GetHref(string AStr)
        {
            string[] ListStr = new string[2];
            // Match and Regex live in System.Text.RegularExpressions.
            // The content group is lazy (.+?) so it ends at the first </a>; a greedy group would run on
            // to the last </a> and swallow any following anchors when the input holds more than one.
            Match anchor = Regex.Match(AStr, @"(?is)<a[^>]+?href=(['""])([^'""]*)\1[^>]*>(.+?)</a>");
            if (anchor.Success)
            {
                ListStr[0] = anchor.Groups[3].Value; // inner content
                ListStr[1] = anchor.Groups[2].Value; // hyperlink
            }
            return ListStr;
        }

        /// <summary>
        /// Extracts the inner content of the first <c>&lt;p class="auxiInfo"&gt;</c> element in the markup.
        /// </summary>
        /// <param name="PStr">HTML markup to search.</param>
        /// <returns>The markup between the element's opening and closing tags, or an empty string when no such element is found.</returns>
        public static string GetTargetPContent(string PStr)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // Group 1 holds the inner content; the balancing group "o" keeps nested <p>...</p> pairs matched.
            Match auxiInfo = Regex.Match(PStr, @"<p[^>]*?class=""auxiInfo""[^>]*>((?>(?<o><p[^>]*>)|(?<-o></p>)|(?:(?!</?p)[\s\S]))*)(?(o)(?!))</p>", RegexOptions.IgnoreCase);
            if (!auxiInfo.Success)
            {
                return "";
            }

            return auxiInfo.Groups[1].Value;
        }

        /// <summary>
        /// Extracts the inner content of the first attribute-less <c>&lt;p&gt;</c> element in the markup.
        /// </summary>
        /// <param name="PStr">HTML markup to search.</param>
        /// <returns>The text between the first <c>&lt;p&gt;</c> and the nearest following <c>&lt;/p&gt;</c>, or an empty string when there is none.</returns>
        public static string GetPContent(string PStr)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // (?is) makes the pattern case-insensitive and lets '.' span line breaks.
            Match paragraph = Regex.Match(PStr, @"(?is)<p>(.*?)</p>");
            return paragraph.Success ? paragraph.Groups[1].Value : "";
        }
相關文章
相關標籤/搜索