/// <summary>
/// Matches an &lt;img&gt; tag and captures the value of its src attribute in the
/// named group "imgUrl". Shared by <see cref="ReplaceImage"/> and
/// <see cref="GetImageSrc"/> so both treat tag/attribute casing the same way.
/// </summary>
private static readonly Regex ImgTagRegex = new Regex(
    @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Downloads the resource at the given URL and returns its body decoded as UTF-8 text.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The full response body as a string.</returns>
public static string GetContenFrommUrl(string url)
{
    Uri uri = new Uri(url);
    // WebRequest lives in System.Net.
    WebRequest request = WebRequest.Create(uri);

    // The using statements release the response, stream and reader even when
    // reading fails part-way through.
    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Extracts the &lt;div&gt; whose id is "listLeft", including any nested &lt;div&gt; elements.
/// </summary>
/// <param name="strHTML">HTML text to search.</param>
/// <returns>
/// The whole matched element (opening tag through its balancing closing tag),
/// or an empty string when the input is null/empty or no such div is found.
/// </returns>
public static string GetDivFromStr(string strHTML)
{
    if (string.IsNullOrEmpty(strHTML))
    {
        return string.Empty;
    }

    // The balancing group "o" counts nested <div> tags so the match ends at the
    // closing tag that pairs with the opening "listLeft" div.
    Match divMatch = Regex.Match(
        strHTML,
        @"<div[^>]*?id=""listLeft""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>",
        RegexOptions.IgnoreCase);

    return divMatch.Success ? divMatch.Value : string.Empty;
}

/// <summary>
/// Downloads an image and saves it into the local image directory under the
/// file name taken from the URL.
/// </summary>
/// <param name="URL">URL of the image to download.</param>
/// <returns>The fixed prefix "2016/12/22/" followed by the image file name.</returns>
public static string DowmLoadImage(string URL)
{
    const string saveDirectory = "D:/MyJob/HtmlToData/Images/";
    // NOTE(review): the returned prefix is a fixed string and is not derived from
    // saveDirectory or the current date - confirm this is what callers expect.
    const string returnedPrefix = "2016/12/22/";

    string fileName = System.IO.Path.GetFileName(URL);

    // WebClient lives in System.Net; dispose it once the download has finished.
    using (WebClient client = new System.Net.WebClient())
    {
        client.DownloadFile(URL, saveDirectory + fileName);
    }

    return returnedPrefix + fileName;
}

/// <summary>
/// Replaces the separator image (split-e5.gif) with an &lt;hr /&gt; element.
/// </summary>
/// <param name="Content">HTML text to process.</param>
/// <returns>
/// The HTML with every occurrence of the first matching separator image tag
/// replaced by "&lt;hr /&gt;"; the input unchanged when it is null/empty or
/// contains no such image.
/// </returns>
public static string ReplaceImage(string Content)
{
    const string separatorImageUrl = "http://en.shio.gov.cn/file/images/split-e5.gif";

    if (string.IsNullOrEmpty(Content))
    {
        return Content;
    }

    foreach (Match imgMatch in ImgTagRegex.Matches(Content))
    {
        if (string.Equals(imgMatch.Groups["imgUrl"].Value, separatorImageUrl, StringComparison.Ordinal))
        {
            // Only the first matching tag is inspected; string.Replace then swaps
            // every textually identical copy of that tag.
            return Content.Replace(imgMatch.Value, "<hr />");
        }
    }

    return Content;
}

/// <summary>
/// Replaces the &lt;div id="pages"&gt; element found in <paramref name="strHTML"/>
/// with an &lt;hr /&gt; inside <paramref name="Content"/>.
/// </summary>
/// <param name="Content">HTML text in which the replacement is made.</param>
/// <param name="strHTML">HTML text that is searched for the "pages" div.</param>
/// <returns>
/// <paramref name="Content"/> with the matched div text replaced by "&lt;hr /&gt;";
/// <paramref name="Content"/> unchanged when either argument is null/empty or
/// no "pages" div is found.
/// </returns>
public static string ReplaceDiv(string Content, string strHTML)
{
    if (string.IsNullOrEmpty(Content) || string.IsNullOrEmpty(strHTML))
    {
        return Content;
    }

    // The balancing group "o" counts nested <div> tags so the match ends at the
    // closing tag that pairs with the opening "pages" div.
    Match pagesMatch = Regex.Match(
        strHTML,
        @"<div[^>]*?id=""pages""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>",
        RegexOptions.IgnoreCase);

    // A failed match has an empty Value, and string.Replace throws
    // ArgumentException for an empty search string, so bail out first.
    if (!pagesMatch.Success)
    {
        return Content;
    }

    return Content.Replace(pagesMatch.Value, "<hr />");
}

/// <summary>
/// Finds the first &lt;img&gt; tag in the HTML, downloads the image it points to
/// and returns the value produced by <see cref="DowmLoadImage"/>.
/// </summary>
/// <param name="strHTML">HTML text to search.</param>
/// <returns>
/// The result of <see cref="DowmLoadImage"/> for the first image, or an empty
/// string when the input is null/empty, no img tag is found, or its src is empty.
/// </returns>
public static string GetImageSrc(string strHTML)
{
    if (string.IsNullOrEmpty(strHTML))
    {
        return "";
    }

    Match imgMatch = ImgTagRegex.Match(strHTML);
    if (!imgMatch.Success)
    {
        return "";
    }

    // The pattern allows an empty src; there is nothing to download in that case.
    string imageUrl = imgMatch.Groups["imgUrl"].Value;
    if (imageUrl.Length == 0)
    {
        return "";
    }

    return DowmLoadImage(imageUrl);
}

/// <summary>
/// Extracts the inner content and the href of an &lt;a&gt; tag.
/// </summary>
/// <param name="AStr">HTML text to search.</param>
/// <returns>
/// A two-element array: index 0 is the inner content, index 1 is the href value.
/// Both elements are null when the input is null/empty or no anchor is found.
/// </returns>
public static string[] GetHref(string AStr)
{
    string[] result = new string[2];
    if (string.IsNullOrEmpty(AStr))
    {
        return result;
    }

    // The content group (.+) is greedy, so with several anchors in the input it
    // runs up to the last </a>.
    Match anchorMatch = Regex.Match(AStr, @"(?is)<a[^>]+?href=(['""])([^'""]*)\1[^>]*>(.+)</a>");
    if (anchorMatch.Success)
    {
        result[0] = anchorMatch.Groups[3].Value; // inner content
        result[1] = anchorMatch.Groups[2].Value; // hyperlink
    }

    return result;
}

/// <summary>
/// Extracts the inner content of the &lt;p class="auxiInfo"&gt; element.
/// </summary>
/// <param name="PStr">HTML text to search.</param>
/// <returns>
/// The content between the opening and balancing closing tag, or an empty
/// string when the input is null/empty or no such element is found.
/// </returns>
public static string GetTargetPContent(string PStr)
{
    if (string.IsNullOrEmpty(PStr))
    {
        return "";
    }

    // The balancing group "o" counts nested <p> tags so the match ends at the
    // closing tag that pairs with the opening "auxiInfo" paragraph.
    Match paragraphMatch = Regex.Match(
        PStr,
        @"<p[^>]*?class=""auxiInfo""[^>]*>((?>(?<o><p[^>]*>)|(?<-o></p>)|(?:(?!</?p)[\s\S]))*)(?(o)(?!))</p>",
        RegexOptions.IgnoreCase);

    return paragraphMatch.Success ? paragraphMatch.Groups[1].Value : "";
}

/// <summary>
/// Extracts the inner content of the first attribute-less &lt;p&gt; element.
/// </summary>
/// <param name="PStr">HTML text to search.</param>
/// <returns>
/// The text between the first "&lt;p&gt;" and the next "&lt;/p&gt;" (case-insensitive,
/// may span lines), or an empty string when the input is null/empty or nothing matches.
/// </returns>
public static string GetPContent(string PStr)
{
    if (string.IsNullOrEmpty(PStr))
    {
        return "";
    }

    Match paragraphMatch = Regex.Match(PStr, @"(?is)<p>(.*?)</p>");
    return paragraphMatch.Success ? paragraphMatch.Groups[1].Value : "";
}