可以寫成這樣:
// Regex pattern with two alternatives: the first character class matches one character in
// the range U+4E00–U+9FA5 (Chinese characters), the second matches one of the listed
// punctuation characters (each written with a backslash escape).
string strRegex = @"[\u4e00-\u9fa5]|[\(\)\《\》\——\;\,\。\「\」\<\>\!]";
其中前半部分表示匹配中文字符,後半部分為需要匹配的標點符號。
另外,
對於 HTML 源碼的處理,建議使用 HtmlAgilityPack,用下面的代碼去掉其中的腳本、樣式或者註釋內容。
/// <summary>
/// Parses an HTML string into an <see cref="HtmlDocument"/> and removes every descendant
/// node named <c>script</c>, <c>style</c> or <c>#comment</c>.
/// </summary>
/// <param name="htmlString">Raw HTML markup to load.</param>
/// <returns>
/// The loaded document with those nodes removed, or <c>null</c> when
/// <paramref name="htmlString"/> is null or empty.
/// </returns>
public static HtmlDocument InitializeHtmlDoc(string htmlString)
{
    if (string.IsNullOrEmpty(htmlString))
    {
        return null;
    }

    var document = new HtmlDocument();
    document.LoadHtml(htmlString);

    // Snapshot the matches into a list first so nodes are not removed while the
    // descendant sequence is still being enumerated.
    var unwantedNodes = (from candidate in document.DocumentNode.Descendants()
                         where candidate.Name == "script"
                               || candidate.Name == "style"
                               || candidate.Name == "#comment"
                         select candidate).ToList();

    foreach (var unwanted in unwantedNodes)
    {
        unwanted.Remove();
    }

    return document;
}
HtmlAgilityPack 使用 XPath 語法,"//comment()" 在 XPath 中表示「所有註釋節點」;「#comment」不好用的話,需要替換為前者。參考:http://www.cnblogs.com/rupeng/archive/2012/02/07/2342012.html
從 Url 讀取網頁內容(靜態),可以用下面的代碼:
/// <summary>
/// Requests a URL and returns the response body as text.
/// </summary>
/// <param name="url">Address to request.</param>
/// <returns>
/// The decoded response body when an HTTP response with status 200 OK is received;
/// otherwise <see cref="string.Empty"/> (null or empty <paramref name="url"/>,
/// non-HTTP response, non-OK status, or any exception during the request).
/// </returns>
public static string GetHtmlStr(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    const int timeoutMilliseconds = 30 * 1000;

    try
    {
        WebRequest webRequest = WebRequest.Create(url);
        webRequest.Timeout = timeoutMilliseconds;

        using (WebResponse webResponse = webRequest.GetResponse())
        {
            // A non-HTTP response (e.g. file:// or ftp://) yields null here and is
            // treated like any other unsuccessful request.
            HttpWebResponse httpResponse = webResponse as HttpWebResponse;
            if (httpResponse == null || httpResponse.StatusCode != HttpStatusCode.OK)
            {
                return string.Empty;
            }

            // Dispose the stream and reader deterministically instead of relying on
            // the response's disposal alone.
            using (Stream stream = httpResponse.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, ResolveResponseEncoding(httpResponse.CharacterSet)))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: the request may time out or fail for other reasons,
        // and callers receive an empty string rather than an exception.
        return string.Empty;
    }
}

/// <summary>
/// Maps a response charset name to an <see cref="Encoding"/>, falling back to
/// <see cref="Encoding.Default"/> when the name is missing or not recognized.
/// </summary>
/// <param name="charset">Charset name reported by the response; may be null or empty.</param>
/// <returns>The matching encoding, or <see cref="Encoding.Default"/> as a fallback.</returns>
private static Encoding ResolveResponseEncoding(string charset)
{
    if (string.IsNullOrEmpty(charset))
    {
        return Encoding.Default;
    }

    try
    {
        // Some servers send the charset wrapped in quotes (charset="utf-8"); strip them
        // so the name can be resolved.
        return Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: decode with the default encoding rather
        // than discarding a successfully downloaded page.
        return Encoding.Default;
    }
}