這兩天在做數據採集,所以整理了一下數據採集要用到的一些方法。由於我採集的數據比較簡單,因此沒有用到框架。比較有名的兩個框架是 HtmlAgilityPack 和 Jumony,感興趣的可以研究一下。當然,火車頭採集工具也很方便,不過要付費。下面是整理的代碼:
/// <summary>
/// Regex-based helpers for extracting fragments from scraped HTML text.
/// </summary>
public class HtmlRegex
{
    /// <summary>
    /// Matches any single HTML tag, opening or closing: a '&lt;', one or more
    /// characters other than '&gt;', then '&gt;'. A closing tag such as
    /// "&lt;/p&gt;" is already covered because '/' is not '&gt;', so no separate
    /// alternative is needed. This is a textual match, not an HTML parser: a
    /// '&gt;' inside an attribute value ends the match early.
    /// </summary>
    private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]+>");

    /// <summary>
    /// Removes every HTML tag from the given text.
    /// </summary>
    /// <param name="content">Source HTML.</param>
    /// <returns>
    /// The text with all tags removed, or an empty string when
    /// <paramref name="content"/> is null.
    /// </returns>
    public static string RemoveAllHtml(string content)
    {
        if (content == null)
        {
            return string.Empty;
        }

        return HtmlTagRegex.Replace(content, string.Empty);
    }

    /// <summary>
    /// Returns the first match of a regular expression in the given text.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Source HTML.</param>
    /// <param name="hashtml">
    /// True to return the match as-is; false to strip HTML tags from it first.
    /// </param>
    /// <returns>
    /// The first match, or an empty string when nothing matches or
    /// <paramref name="content"/> is null.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="regStr"/> is null or not a valid pattern.
    /// </exception>
    public static string GetStrByRegex(string regStr, string content, bool hashtml = true)
    {
        // Build the regex before looking at the content so that an invalid
        // pattern is always reported, even when there is nothing to search.
        Regex regex = new Regex(regStr);
        if (content == null)
        {
            return string.Empty;
        }

        Match match = regex.Match(content);
        if (!match.Success)
        {
            return string.Empty;
        }

        return hashtml ? match.Value : RemoveAllHtml(match.Value);
    }

    /// <summary>
    /// Returns the first span of text that begins with <paramref name="start"/>
    /// and ends with the nearest following <paramref name="end"/>, both
    /// delimiters included. Matching ignores case and spans line breaks.
    /// </summary>
    /// <param name="start">
    /// Start delimiter. It is inserted into the pattern unescaped, so regex
    /// metacharacters in it are interpreted as regex syntax.
    /// </param>
    /// <param name="end">End delimiter; also inserted unescaped.</param>
    /// <param name="content">Source HTML.</param>
    /// <param name="hasHtml">
    /// True to return the match as-is; false to strip HTML tags from it first.
    /// </param>
    /// <returns>
    /// The matched span, or an empty string when nothing matches or
    /// <paramref name="content"/> is null.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// The delimiters do not form a valid pattern.
    /// </exception>
    public static string GetStrByRegex(string start, string end, string content, bool hasHtml = true)
    {
        return GetStrByRegex(BuildSpanPattern(start, end), content, hasHtml);
    }

    /// <summary>
    /// Returns every match of a regular expression in the given text.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Source HTML.</param>
    /// <returns>
    /// The matched strings in order of appearance, or null when there is no
    /// match, the pattern is invalid, an argument is null, or matching times out.
    /// </returns>
    public static List<string> GetStrListByRegex(string regStr, string content)
    {
        try
        {
            MatchCollection matches = new Regex(regStr).Matches(content);

            // Reading Count forces the lazily evaluated collection to run, so
            // any matching failure surfaces inside this try block.
            if (matches.Count == 0)
            {
                return null;
            }

            List<string> values = new List<string>(matches.Count);
            foreach (Match match in matches)
            {
                values.Add(match.Value);
            }

            return values;
        }
        catch (ArgumentException)
        {
            // Invalid pattern or null argument: callers get the "no result" value.
            return null;
        }
        catch (RegexMatchTimeoutException)
        {
            // A configured match timeout elapsed; treated the same as no result.
            return null;
        }
    }

    /// <summary>
    /// Returns every span of text that begins with <paramref name="start"/> and
    /// ends with the nearest following <paramref name="end"/>, both delimiters
    /// included. Matching ignores case and spans line breaks.
    /// </summary>
    /// <param name="start">
    /// Start delimiter. It is inserted into the pattern unescaped, so regex
    /// metacharacters in it are interpreted as regex syntax.
    /// </param>
    /// <param name="end">End delimiter; also inserted unescaped.</param>
    /// <param name="content">Source HTML.</param>
    /// <returns>
    /// The matched spans in order of appearance, or null when there is no
    /// match, the resulting pattern is invalid, <paramref name="content"/> is
    /// null, or matching times out.
    /// </returns>
    public static List<string> GetStrListByRegex(string start, string end, string content)
    {
        return GetStrListByRegex(BuildSpanPattern(start, end), content);
    }

    /// <summary>
    /// Builds the pattern shared by the start/end overloads: inline options
    /// "i" (ignore case) and "s" (dot also matches newline), then the start
    /// delimiter, a lazy ".*?" so the match stops at the first end delimiter,
    /// and the end delimiter.
    /// </summary>
    private static string BuildSpanPattern(string start, string end)
    {
        return @"(?is)(" + start + ").*?(" + end + ")";
    }
}