Current status of SupportYun:
The SupportYun system has basically wrapped up the underlying-service development planned for phase one~~~ allow me a small moment of pride.
If the system's background and features are new to you, start with the first two articles in this series:
1. Notes on upgrading an enterprise crawler system (Part 1)
2. Notes on upgrading an enterprise crawler system (Part 2): a scraping service built on AngleSharp
Once again, here is the brief overall plan for the system:
Phase one covers the standalone crawler service plus the data normalization engine and content classification.
Put bluntly: crawl large numbers of pages like the one below, then analyze the data to hand the business team the event city, venue, hotline, event date, category, and plain-text content.
This article is about how the generic data normalization works once a page has been crawled.
First, the current structure of the project code:
Yes, you read that right: in just three weeks I've written this much code single-handedly~~ my hands ache just thinking about it! My heart too~
The way you live each ordinary day decides where you'll stand in five or ten years; life is lived one day at a time, and what accumulates is a lifetime.
A beautiful encounter: JieBa.Net
The first feature I wanted to build: analyze the semantic content of an article or activity's text and sort it into the major categories we defined.
How the plain text is actually pulled out of the crawled HTML fragments is covered further down.
Take an article like this one: "Why are America's elite secondary schools drifting further and further away from Chinese kids?"
Reading it, or even just its title, we can tell right away that it belongs under [Overseas Education], though you could stretch a little and also tag it [Overseas Life]. That's easy for a human. But when we crawl pages like this in bulk, how does the code classify them accurately?
Here's the final approach, after plenty of deliberation:
1. The basic normalization service, which covers data cleaning and such, is introduced later.
2. The XML rule set is a collection I compiled together with the business team: the hot, classic keywords for each major category.
3. For keyword extraction I weighed the TF-IDF algorithm against TextRank and went with TextRank, implemented on top of JieBa.Net. (A quick sketch of the scoring idea follows this list.)
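For reference, TextRank scores each word by running a PageRank-style update over a word co-occurrence graph (Mihalcea & Tarau, 2004). With damping factor d (typically 0.85) and edge weights w, the score of vertex V_i is:

WS(V_i) = (1 - d) + d \cdot \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} \, WS(V_j)

Words are vertices, an edge links two words that co-occur within a small window (Span = 5 in the code below), and the top-scoring vertices become the keywords.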
First, why JieBa for word segmentation?
The next phase brings full-text indexing, which inevitably means segmentation. In the past I'd reach for Lucene.Net plus the PanGu segmenter out of sheer habit, barely writing any new code... plenty of old stock to reuse!
But as I said, this time I want something fresh instead of living off old code... we're software engineers, not code monkeys!!!
Somehow I landed on JieBa.Net, found a few bugs, and filed them on GitHub... a little tiring, but open-source authors work hard and deserve the support!
Since the author couldn't push an updated NuGet package right away, I patched a local copy and started using that.
/// <summary>
/// Keyword extraction
/// </summary>
public class KeyWordExtractor
{
    /// <summary>
    /// Keyword extraction via the TF-IDF algorithm
    /// </summary>
    public List<string> TFIDFExtractor(string content, int count = 10)
    {
        var extractor = new TFIDFKeywordExtractor();
        var keywords = extractor.ExtractTags(content, count, Constants.NounAndVerbPos);
        return keywords.ToList();
    }

    /// <summary>
    /// Keyword extraction via the TextRank algorithm
    /// </summary>
    public List<string> TextRankExtractor(string content, int count = 10)
    {
        var extractor = new TextRankKeywordExtractor();
        var keywords = extractor.ExtractTags(content, count, Constants.NounAndVerbPos);
        return keywords.ToList();
    }
}
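For context, a quick usage sketch of the wrapper above (the content variable, assumed to hold the cleaned plain text, is hypothetical):

// Hypothetical usage: pull the top 10 noun/verb keywords
// from a cleaned plain-text article.
var extractor = new KeyWordExtractor();
List<string> keywords = extractor.TextRankExtractor(content, 10);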
Normally the two extractor instantiations above (new TFIDFKeywordExtractor() and new TextRankKeywordExtractor()) could simply use JieBa's own packaged classes, but as mentioned, the NuGet package hadn't been updated, so I reimplemented the buggy logic myself:
/// <summary>
/// TF-IDF keyword extractor
/// </summary>
public class TFIDFKeywordExtractor : BaseKeywordExtractor
{
    private static readonly string DefaultIdfFile = Path.Combine(ConfigurationManager.AppSettings["JiebaConfigFileDir"] ?? HttpContext.Current.Server.MapPath("/Resources/"), "idf.txt");
    private static readonly int DefaultWordCount = 20;

    private JiebaSegmenter Segmenter { get; set; }
    private PosSegmenter PosSegmenter { get; set; }
    private IDFLoader Loader { get; set; }

    private IDictionary<string, double> IdfFreq { get; set; }
    private double MedianIdf { get; set; }

    public TFIDFKeywordExtractor(JiebaSegmenter segmenter = null)
    {
        Segmenter = segmenter.IsNull() ? new JiebaSegmenter() : segmenter;
        PosSegmenter = new PosSegmenter(Segmenter);
        SetStopWords(StopWordsIdfFile);
        if (StopWords.IsEmpty())
        {
            StopWords.UnionWith(DefaultStopWords);
        }

        Loader = new IDFLoader(DefaultIdfFile);
        IdfFreq = Loader.IdfFreq;
        MedianIdf = Loader.MedianIdf;
    }

    public void SetIdfPath(string idfPath)
    {
        Loader.SetNewPath(idfPath);
        IdfFreq = Loader.IdfFreq;
        MedianIdf = Loader.MedianIdf;
    }

    // Cut the text with POS tagging and keep only the allowed parts of speech.
    private IEnumerable<string> FilterCutByPos(string text, IEnumerable<string> allowPos)
    {
        var posTags = PosSegmenter.Cut(text).Where(p => allowPos.Contains(p.Flag));
        return posTags.Select(p => p.Word);
    }

    // Term frequency weighted by inverse document frequency.
    private IDictionary<string, double> GetWordIfidf(string text, IEnumerable<string> allowPos)
    {
        IEnumerable<string> words = allowPos.IsNotEmpty()
            ? FilterCutByPos(text, allowPos)
            : Segmenter.Cut(text);

        var freq = new Dictionary<string, double>();
        foreach (var word in words)
        {
            var w = word;
            if (string.IsNullOrEmpty(w) || w.Trim().Length < 2 || StopWords.Contains(w.ToLower()))
            {
                continue;
            }
            freq[w] = freq.GetDefault(w, 0.0) + 1.0;
        }
        var total = freq.Values.Sum();
        foreach (var k in freq.Keys.ToList())
        {
            freq[k] *= IdfFreq.GetDefault(k, MedianIdf) / total;
        }

        return freq;
    }

    public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
    {
        if (count <= 0) { count = DefaultWordCount; }

        var freq = GetWordIfidf(text, allowPos);
        return freq.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
    }

    public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
    {
        if (count <= 0) { count = DefaultWordCount; }

        var freq = GetWordIfidf(text, allowPos);
        return freq.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
        {
            Word = p.Key,
            Weight = p.Value
        }).Take(count);
    }
}
/// <summary>
/// TextRank keyword extractor
/// </summary>
public class TextRankKeywordExtractor : BaseKeywordExtractor
{
    // Default POS filter: nouns and verbs only.
    private static readonly IEnumerable<string> DefaultPosFilter = new List<string>()
    {
        "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "v", "vd", "vg", "vi", "vn", "vq"
    };

    private JiebaSegmenter Segmenter { get; set; }
    private PosSegmenter PosSegmenter { get; set; }

    // Co-occurrence window size.
    public int Span { get; set; }

    public bool PairFilter(Pair wp)
    {
        return DefaultPosFilter.Contains(wp.Flag)
               && wp.Word.Trim().Length >= 2
               && !StopWords.Contains(wp.Word.ToLower());
    }

    public TextRankKeywordExtractor()
    {
        Span = 5;

        Segmenter = new JiebaSegmenter();
        PosSegmenter = new PosSegmenter(Segmenter);
        SetStopWords(StopWordsIdfFile);
        if (StopWords.IsEmpty())
        {
            StopWords.UnionWith(DefaultStopWords);
        }
    }

    public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
    {
        var rank = ExtractTagRank(text, allowPos);
        if (count <= 0) { count = 20; }
        return rank.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
    }

    public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
    {
        var rank = ExtractTagRank(text, allowPos);
        if (count <= 0) { count = 20; }
        return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
        {
            Word = p.Key,
            Weight = p.Value
        }).Take(count);
    }

    #region Private Helpers

    private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<string> allowPos)
    {
        if (allowPos.IsEmpty())
        {
            allowPos = DefaultPosFilter;
        }

        var g = new UndirectWeightedGraph();
        var cm = new Dictionary<string, int>();
        var words = PosSegmenter.Cut(text).ToList();

        // Count co-occurrences of filtered word pairs inside the window.
        for (var i = 0; i < words.Count; i++)
        {
            var wp = words[i];
            if (PairFilter(wp))
            {
                for (var j = i + 1; j < i + Span; j++)
                {
                    if (j >= words.Count)
                    {
                        break;
                    }
                    if (!PairFilter(words[j]))
                    {
                        continue;
                    }

                    var key = wp.Word + "$" + words[j].Word;
                    if (!cm.ContainsKey(key))
                    {
                        cm[key] = 0;
                    }
                    cm[key] += 1;
                }
            }
        }

        // Build the undirected weighted graph and run the rank iteration.
        foreach (var p in cm)
        {
            var terms = p.Key.Split('$');
            g.AddEdge(terms[0], terms[1], p.Value);
        }

        return g.Rank();
    }

    #endregion
}
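UndirectWeightedGraph.Rank() is where the actual TextRank iteration happens; it's part of JieBa.Net, so it isn't reproduced here. Purely for illustration, a simplified rank loop over such a graph might look like the sketch below (this is my own sketch of the idea, not JieBa.Net's source; it assumes System.Linq and System.Collections.Generic):

// Illustration only, NOT JieBa.Net's implementation: a PageRank-style
// update over an undirected, weighted word graph.
// graph: word -> (neighbour word -> co-occurrence weight)
public static IDictionary<string, double> RankSketch(
    IDictionary<string, IDictionary<string, double>> graph,
    int iterations = 10,
    double d = 0.85)
{
    // Every vertex starts with the same score.
    var ws = graph.Keys.ToDictionary(w => w, w => 1.0);
    // Total edge weight leaving each vertex, used to normalise contributions.
    var outSum = graph.ToDictionary(p => p.Key, p => p.Value.Values.Sum());

    for (var iter = 0; iter < iterations; iter++)
    {
        foreach (var node in graph.Keys.ToList())
        {
            // WS(v) = (1 - d) + d * sum over neighbours u of w(u,v) / outSum(u) * WS(u)
            var s = graph[node].Sum(e => e.Value / outSum[e.Key] * ws[e.Key]);
            ws[node] = (1 - d) + d * s;
        }
    }
    return ws;
}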
/// <summary>
/// Base class for keyword extractors,
/// built on JieBa segmentation
/// </summary>
public abstract class BaseKeywordExtractor
{
    protected static readonly string StopWordsIdfFile = Path.Combine(ConfigurationManager.AppSettings["JiebaConfigFileDir"] ?? HttpContext.Current.Server.MapPath("/Resources/"), "stopwords.txt");

    // Fallback English stop words, used when no stop-word file is found.
    protected static readonly List<string> DefaultStopWords = new List<string>()
    {
        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
        "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
        "this", "then", "at", "have", "all", "not", "one", "has", "or"
    };

    protected virtual ISet<string> StopWords { get; set; }

    public void SetStopWords(string stopWordsFile)
    {
        StopWords = new HashSet<string>();
        var path = Path.GetFullPath(stopWordsFile);
        if (File.Exists(path))
        {
            var lines = File.ReadAllLines(path);
            foreach (var line in lines)
            {
                StopWords.Add(line.Trim());
            }
        }
    }

    public abstract IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null);
    public abstract IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null);
}
Almost all of this is JieBa.NET's own source; do head over to GitHub and support the author~~~
As a Beijing drifter: if a plain, steady life is what you're after, you're honestly better off heading home early for a stable job, close to family.
The data normalization engine service, in detail:
1. Create a Windows service that starts the normalization engine on a polling timer (every 5 minutes by default).
public partial class Service1 : ServiceBase
{
    private RuleEngineService.RuleEngineService ruleEngineService = new RuleEngineService.RuleEngineService();
    // Keep the timer as a field so it isn't garbage-collected after OnStart returns.
    private Timer timer;

    public Service1()
    {
        InitializeComponent();
    }

    protected override void OnStart(string[] args)
    {
        try
        {
            EventLog.WriteEntry("【SupportYun data normalization engine service started】");
            CommonTools.WriteLog("【SupportYun data normalization engine service started】");

            XmlConfigurator.Configure();

            timer = new Timer();
            // Polling interval (default 5 minutes)
            timer.Interval = StringHelper.StrToInt(ConfigurationManager.AppSettings["TimerInterval"].ToString(), 300) * 1000;
            // Fire the callback on every elapse, not just once
            timer.AutoReset = true;
            // Register the callback
            timer.Elapsed += new ElapsedEventHandler(TimedTask);
            // Start the timer
            timer.Enabled = true;
        }
        catch (Exception ex)
        {
            CommonTools.WriteLog("【Service OnStart error: " + ex + "】");
        }
    }

    private void TimedTask(object source, ElapsedEventArgs e)
    {
        System.Threading.ThreadPool.QueueUserWorkItem(delegate
        {
            ruleEngineService.Main();
        });
    }

    protected override void OnStop()
    {
        CommonTools.WriteLog("【SupportYun data normalization engine service stopped】");
        EventLog.WriteEntry("【SupportYun data normalization engine service stopped】");
    }
}
I'll skip the deployment details; plenty of fellow bloggers have covered them.
2. The engine class RuleEngineService checks each category (articles, activities) for unprocessed data and, when it finds any, dispatches the matching handler.
/// <summary>
/// Data normalization engine service
/// </summary>
public class RuleEngineService
{
    private readonly ArticleRuleEngineService articleRuleEngineService = new ArticleRuleEngineService();
    private readonly ActivityRuleEngineService activityRuleEngineService = new ActivityRuleEngineService();

    public void Main()
    {
        try
        {
            HandleArticleData();
            HandleActivityData();
        }
        catch (Exception ex)
        {
            LogUtils.ErrorLog(ex);
        }
    }

    /// <summary>
    /// Process article data
    /// </summary>
    private void HandleArticleData()
    {
        using (var context = new SupportYunDBContext())
        {
            // Fetch unprocessed articles; iterating an empty list is a no-op,
            // so a separate Any() pre-check (and its extra query) isn't needed.
            var articleDatas = context.CollectionInitialData.Where(
                t =>
                    !t.IsDelete && t.ProcessingProgress != ProcessingProgress.已處理 &&
                    t.CollectionType == CollectionType.文章).ToList();
            foreach (var article in articleDatas)
            {
                articleRuleEngineService.RuleArticle(article.Id);
            }
        }
    }

    /// <summary>
    /// Process activity data
    /// </summary>
    private void HandleActivityData()
    {
        using (var context = new SupportYunDBContext())
        {
            var activityDatas = context.CollectionInitialData.Where(
                t =>
                    !t.IsDelete && t.ProcessingProgress != ProcessingProgress.已處理 &&
                    t.CollectionType == CollectionType.活動).ToList();
            foreach (var activity in activityDatas)
            {
                activityRuleEngineService.RuleActivity(activity.Id);
            }
        }
    }
}
3. Let's start with the article handler, ArticleRuleEngineService. Its core logic:
a) Query the crawled data that hasn't been processed yet (at this point the content is still an HTML fragment).
b) Strip the HTML tags from the content to get plain text. (I'm very fond of XML config files, as readers of my earlier series know, so the plan is: one XML file of regex patterns, grouped into common HTML tag nodes, common separator nodes, and irregular nodes such as style/script. The code loads the XML and applies each pattern as a regex replacement, which leaves plain text. If an unhandled node ever turns up, adding it to the XML is all it takes. Simple and convenient.)
<?xml version="1.0" encoding="utf-8" ?>
<filterStr>
  <HtmlLabels>
    <element><![CDATA[&#[^>]*;]]></element>
    <element><![CDATA[</?marquee[^>]*>]]></element>
    <element><![CDATA[</?object[^>]*>]]></element>
    <element><![CDATA[</?param[^>]*>]]></element>
    <element><![CDATA[</?embed[^>]*>]]></element>
    <element><![CDATA[</?table[^>]*>]]></element>
    <element><![CDATA[</?tbody[^>]*>]]></element>
    <element><![CDATA[</?tr[^>]*>]]></element>
    <element><![CDATA[</?th[^>]*>]]></element>
    <element><![CDATA[</?td[^>]*>]]></element>
    <element><![CDATA[</?h1[^>]*>]]></element>
    <element><![CDATA[</?h2[^>]*>]]></element>
    <element><![CDATA[</?h3[^>]*>]]></element>
    <element><![CDATA[</?h4[^>]*>]]></element>
    <element><![CDATA[</?h5[^>]*>]]></element>
    <element><![CDATA[</?h6[^>]*>]]></element>
    <element><![CDATA[</?p[^>]*>]]></element>
    <element><![CDATA[</?a[^>]*>]]></element>
    <element><![CDATA[</?img[^>]*>]]></element>
    <element><![CDATA[</?li[^>]*>]]></element>
    <element><![CDATA[</?span[^>]*>]]></element>
    <element><![CDATA[</?div[^>]*>]]></element>
    <element><![CDATA[</?font[^>]*>]]></element>
    <element><![CDATA[</?b[^>]*>]]></element>
    <element><![CDATA[</?u[^>]*>]]></element>
    <element><![CDATA[</?i[^>]*>]]></element>
    <element><![CDATA[</?strong[^>]*>]]></element>
    <element><![CDATA[</?hr[^>]*>]]></element>
    <element><![CDATA[</?title[^>]*>]]></element>
    <element><![CDATA[</?form[^>]*>]]></element>
    <element><![CDATA[</?em[^>]*>]]></element>
    <element><![CDATA[</?dfn[^>]*>]]></element>
    <element><![CDATA[</?ins[^>]*>]]></element>
    <element><![CDATA[</?strike[^>]*>]]></element>
    <element><![CDATA[</?s[^>]*>]]></element>
    <element><![CDATA[</?del[^>]*>]]></element>
    <element><![CDATA[</?tt[^>]*>]]></element>
    <element><![CDATA[</?xmp[^>]*>]]></element>
    <element><![CDATA[</?plaintext[^>]*>]]></element>
    <element><![CDATA[</?listing[^>]*>]]></element>
    <element><![CDATA[</?center[^>]*>]]></element>
    <element><![CDATA[</?base[^>]*>]]></element>
    <element><![CDATA[</?bgsound[^>]*>]]></element>
    <element><![CDATA[</?frameset[^>]*>]]></element>
    <element><![CDATA[</?body[^>]*>]]></element>
    <element><![CDATA[</?dd[^>]*>]]></element>
    <element><![CDATA[</?dl[^>]*>]]></element>
    <element><![CDATA[</?dt[^>]*>]]></element>
    <element><![CDATA[</?frame[^>]*>]]></element>
    <element><![CDATA[</?input[^>]*>]]></element>
    <element><![CDATA[</?ol[^>]*>]]></element>
    <element><![CDATA[</?select[^>]*>]]></element>
    <element><![CDATA[</?option[^>]*>]]></element>
    <element><![CDATA[</?pre[^>]*>]]></element>
    <element><![CDATA[</?small[^>]*>]]></element>
    <element><![CDATA[</?textarea[^>]*>]]></element>
    <element><![CDATA[</?button[^>]*>]]></element>
    <element><![CDATA[</?o:p[^>]*>]]></element>
  </HtmlLabels>
  <Separators>
    <element><![CDATA[&nbsp;]]></element>
    <element><![CDATA[&quot;]]></element>
    <element><![CDATA[&amp;]]></element>
    <element><![CDATA[&lt;]]></element>
    <element><![CDATA[&gt;]]></element>
    <element><![CDATA[\n]]></element>
    <element><![CDATA[\t]]></element>
    <element><![CDATA[\r]]></element>
    <element><![CDATA[<br />]]></element>
    <element><![CDATA[<br/>]]></element>
    <element><![CDATA[<br>]]></element>
    <element><![CDATA[<header>]]></element>
  </Separators>
  <Other>
    <element><![CDATA[</?script[^>]*>]]></element>
    <element><![CDATA[(javascript|jscript|vbscript|vbs):]]></element>
    <element><![CDATA[on(mouse|exit|error|click|key)]]></element>
    <element><![CDATA[<\?xml[^>]*>]]></element>
    <element><![CDATA[<\/?[a-z]+:[^>]*>]]></element>
    <element><![CDATA[<(style)>.*?</\1>]]></element>
    <element><![CDATA[<!--.*?-->]]></element>
  </Other>
</filterStr>
This is the XML file I put together for stripping HTML down to plain text; so far it cleans every page I've crawled without a hitch.
XML config files are used all over the system, and since they rarely change, the data read from them is cached.
c) First, the generic cache class (a short usage sketch follows it):
public class MemoryCache<TK, TV>
{
    private readonly ObjectCache _memoryCache;

    public static MemoryCache<TK, TV> Instance
    {
        get { return SingletonCacheProvider<MemoryCache<TK, TV>>.Instance; }
    }

    public MemoryCache() : this(null) { }

    public MemoryCache(string name)
    {
        // Fully qualified so it resolves to System.Runtime.Caching.MemoryCache
        // rather than this generic wrapper class of the same name.
        _memoryCache = new System.Runtime.Caching.MemoryCache(
            string.Format("{0}_{1}_{2}", typeof(TK).Name, typeof(TV).Name, name));
    }

    public TV Get(TK cacheKey)
    {
        if (_memoryCache.Contains(ParseKey(cacheKey)))
        {
            return (TV)_memoryCache[ParseKey(cacheKey)];
        }
        return default(TV);
    }

    public void Set(TK cacheKey, TV cacheValues, TimeSpan timeSpan)
    {
        _memoryCache.Set(ParseKey(cacheKey), cacheValues, new DateTimeOffset(DateTime.UtcNow + timeSpan));
    }

    public void Remove(TK cacheKey)
    {
        _memoryCache.Remove(ParseKey(cacheKey));
    }

    private string ParseKey(TK key)
    {
        return key.GetHashCode().ToString();
    }
}
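A quick usage sketch (the cache key and the LoadHotKeywords helper are made up for illustration):

// Hypothetical usage: keep a keyword list cached for three days.
var cache = MemoryCache<string, List<string>>.Instance;
var hotWords = cache.Get("hot-keywords");
if (hotWords == null)
{
    hotWords = LoadHotKeywords();   // hypothetical loader
    cache.Set("hot-keywords", hotWords, TimeSpan.FromDays(3));
}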
Every module that wants caching implements the cache interface below:
/// <summary>
/// Cache interface,
/// implemented by every concrete cache class
/// </summary>
/// <typeparam name="TK">cache key type</typeparam>
/// <typeparam name="TV">cache value type</typeparam>
public interface ICache<TK, TV>
{
    /// <summary>
    /// Get a cached value
    /// </summary>
    /// <param name="cacheKey">cache key</param>
    TV Get(TK cacheKey);

    /// <summary>
    /// Remove a cached value
    /// </summary>
    /// <param name="cacheKey">cache key</param>
    void Remove(TK cacheKey);
}
For the XML-to-object module we just discussed, the concrete caching looks like this:
public class XmlModelCacheManager<TXmlModel> : ICache<string, TXmlModel>
{
    private readonly string xmlPath;

    public XmlModelCacheManager(string xmlPath)
    {
        this.xmlPath = xmlPath;
    }

    private readonly MemoryCache<string, TXmlModel> cacheManager = MemoryCache<string, TXmlModel>.Instance;

    public TXmlModel Get(string cacheKey)
    {
        var result = cacheManager.Get(cacheKey);
        if (result == null)
        {
            var xmlDoc = XMLHelper.LoadXml(xmlPath);
            result = GetXmlValues(xmlDoc);
            cacheManager.Set(cacheKey, result, TimeSpan.FromDays(3));
        }
        return result;
    }

    public void Remove(string cacheKey)
    {
        cacheManager.Remove(cacheKey);
    }

    // Reflect over the model's public list properties and fill each one
    // from the XML node of the same name.
    private TXmlModel GetXmlValues(XmlDocument xmlDoc)
    {
        var model = (TXmlModel)Activator.CreateInstance(typeof(TXmlModel));
        var properties = model.GetType().GetProperties(BindingFlags.Instance | BindingFlags.Public);
        foreach (var property in properties)
        {
            if (property.PropertyType.IsGenericType)
            {
                // Guard against properties that have no matching node in the file.
                var node = xmlDoc.SelectSingleNode(@"filterStr/" + property.Name);
                if (node == null)
                {
                    continue;
                }
                var xmlNodes = node.ChildNodes;
                if (xmlNodes.Count > 0)
                {
                    var propListValues = (from XmlNode n in xmlNodes select n.InnerText).ToList();
                    property.SetValue(model, propListValues, null);
                }
            }
        }
        return model;
    }
}
Because the system has quite a few XML file objects, GetXmlValues uses reflection to read and assign the values generically.
Take the HTML-cleaning model from earlier as an example:
public class ClearHtmlXmlModel
{
    private readonly string xmlFileName = "ClearHtml.xml";

    // The bool parameter only selects this data-loading constructor;
    // the parameterless one below is used by the reflection-based loader.
    public ClearHtmlXmlModel(bool hasData)
    {
        var model = new XmlModelCacheManager<ClearHtmlXmlModel>(
            ConfigurationManager.AppSettings["XMLTemplatesRoute"] + xmlFileName).Get("ClearHtml");
        this.HtmlLabels = model.HtmlLabels;
        this.Separators = model.Separators;
        this.Other = model.Other;
    }

    public ClearHtmlXmlModel()
    {
    }

    /// <summary>
    /// HTML tags
    /// </summary>
    public List<string> HtmlLabels { get; set; }

    /// <summary>
    /// Separators
    /// </summary>
    public List<string> Separators { get; set; }

    /// <summary>
    /// Other special patterns
    /// </summary>
    public List<string> Other { get; set; }
}
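The cleaning pass itself belongs to the service code covered later, but roughly, the replacement step could look like this minimal sketch (CleanHtml is a hypothetical helper, and it assumes System.Text.RegularExpressions):

// Rough sketch, not the actual service code: apply the configured
// pattern groups in order, irregular nodes first so script bodies and
// comments disappear before the individual tags are stripped.
public static string CleanHtml(string html)
{
    var config = new ClearHtmlXmlModel(true);   // cached patterns from ClearHtml.xml

    foreach (var pattern in config.Other)
        html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    foreach (var pattern in config.HtmlLabels)
        html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
    foreach (var pattern in config.Separators)
        html = html.Replace(pattern, " ");      // separators treated as plain strings here

    return html.Trim();
}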
d) Run keyword extraction on the plain text, then match the results against the configured category keyword XML to work out the article's categories (a hypothetical sample of that rule file follows the code).
/// <summary>
/// Determine which categories an article belongs to
/// </summary>
private List<ArticleType> GetArticleTypes(string content)
{
    var result = new List<ArticleType>();
    if (string.IsNullOrEmpty(content))
    {
        return result;
    }

    var keys = keyWordExtractor.TextRankExtractor(content, 10);
    if (!keys.Any())
    {
        return result;
    }

    var rules = new ArticleTypeRuleXmlModel(true);

    // Map each configured keyword list to its category, then apply the
    // same matching rule to all of them instead of eight copied blocks.
    var ruleSets = new Dictionary<ArticleType, List<string>>
    {
        { ArticleType.OverseasEducation, rules.OverseasEducation },
        { ArticleType.OverseasFinancial, rules.OverseasFinancial },
        { ArticleType.OverseasHome, rules.OverseasHome },
        { ArticleType.OverseasLaw, rules.OverseasLaw },
        { ArticleType.OverseasLife, rules.OverseasLife },
        { ArticleType.OverseasMedical, rules.OverseasMedical },
        { ArticleType.OverseasMigration, rules.OverseasMigration },
        { ArticleType.OverseasTax, rules.OverseasTax }
    };

    foreach (var ruleSet in ruleSets)
    {
        // A category matches when any extracted keyword contains
        // any of the category's configured words.
        if (ruleSet.Value != null && ruleSet.Value.Any(word => keys.Any(k => k.Contains(word))))
        {
            result.Add(ruleSet.Key);
        }
    }

    return result.Distinct().ToList();
}
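The category rule file itself isn't listed in this article. Given that the reflection loader above expects a filterStr root with child nodes named after the model's properties, it presumably looks something like this (the sample keywords are made up purely for illustration):

<?xml version="1.0" encoding="utf-8" ?>
<filterStr>
  <OverseasEducation>
    <element><![CDATA[留學]]></element>
    <element><![CDATA[藤校]]></element>
    <element><![CDATA[SAT]]></element>
  </OverseasEducation>
  <OverseasMigration>
    <element><![CDATA[移民]]></element>
    <element><![CDATA[綠卡]]></element>
  </OverseasMigration>
  <!-- ...one node per category, matching the model's property names -->
</filterStr>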
e) Finally, update the crawl record's status and commit the article to the normalized database.
Hmm~~ I thought all this read clearly enough, but a colleague said it's a bit scattered and hard to follow...
So I'll stop here for now. Here's a detailed end-to-end flowchart to tie everything together (the remaining parts of the normalization engine get a full write-up in the next article):
Live every day as if it were your last; if there's no tomorrow, there's certainly no "someday"!
That's it for this article~~ Next time: how the activity data, specifically, gets normalized.
Where there are people there are programmers, and where there are programmers there are bugs!
Onward, together!
Original article; all the code is pasted straight from my own project. Please credit the source when reposting, dears~~~