Frequently used skills (continuously updated): http://www.cnblogs.com/dunitian/p/4822808.html#skill
Skills overview (continuously updated): http://www.cnblogs.com/dunitian/p/5493793.html
Online demo: http://cppjieba-webdemo.herokuapp.com
Complete demo: https://github.com/dunitian/TempCode/tree/master/2016-09-05
My tweaked build: https://github.com/dunitian/TempCode/blob/master/2016-09-05/jieba.NET.0.38.2.zip
A couple of caveats first: jieba.NET does not deduplicate the segmented words, so we have to do that ourselves, and the dictionaries have to be configured manually or set to be copied to the bin directory.
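To illustrate the dedup point, here is a minimal sketch using the same Distinct() approach the helper class later in this post relies on:

using System;
using System.Linq;
using JiebaNet.Segmenter;

class DedupDemo
{
    static void Main()
    {
        // jieba.NET may emit the same token more than once; Distinct() drops the repeats
        var words = new JiebaSegmenter()
            .Cut("bootstrap-datetimepicker 進一步跟進~~~開始時間和結束時間的樣式顯示")
            .Distinct();
        Console.WriteLine(string.Join(",", words));
    }
}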
Example use cases (everyone knows about search, so here are some other scenarios):
——————————————————————————————————————————————————
Back to the topic. Have a look at a set of community benchmark figures (these cover the official version, not the .NET port):
The .NET ports IKAnalyzer and PanGu haven't been updated in years, which is why I picked jieba this time (the name suits word segmentation rather well, too: 結巴 means "stutter", and isn't stuttering itself a kind of word splitting?)
A quick demonstration follows:
1. Add a reference to the package:
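If you prefer NuGet over referencing the DLLs directly, the Package Manager command would presumably be as follows (the package id jieba.NET is an assumption here):

PM> Install-Package jieba.NET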
2. Set up the dictionaries:
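A minimal sketch of the app.config entry, assuming the JiebaConfigFileDir appSettings key that jieba.NET reads to locate its dictionary folder; alternatively, mark the dictionary files as "Copy to Output Directory" so they land under bin:

<appSettings>
  <!-- Assumed key: points jieba.NET at the folder containing dict.txt, idf.txt, etc. -->
  <add key="JiebaConfigFileDir" value="Resources" />
</appSettings>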
3. A simple helper class wrapper:
using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;

namespace LoTLib.Word.Split
{
    #region Segmentation modes
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Accurate mode --- the most basic and natural mode; tries to cut the sentence as precisely as possible, suitable for text analysis
        /// </summary>
        Default,
        /// <summary>
        /// Full mode --- scans out every word that can form a term; faster, but cannot resolve ambiguity
        /// </summary>
        CutAll,
        /// <summary>
        /// Search-engine mode --- on top of accurate mode, cuts long words again to improve recall; suitable for search-engine indexing
        /// </summary>
        CutForSearch,
        /// <summary>
        /// Accurate mode without HMM
        /// </summary>
        Other
    }
    #endregion

    /// <summary>
    /// jieba word segmentation
    /// </summary>
    public static partial class WordSplitHelper
    {
        /// <summary>
        /// Get the collection of segmented words
        /// </summary>
        /// <param name="objStr"></param>
        /// <param name="type"></param>
        /// <returns></returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                // accurate mode with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);  // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);       // search-engine mode
                default:
                    return jieba.Cut(objStr, false, false);  // accurate mode without HMM
            }
        }

        /// <summary>
        /// Get the segmented words as a single string
        /// </summary>
        /// <param name="objStr"></param>
        /// <param name="type"></param>
        /// <returns></returns>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var words = GetSplitWords(objStr, type);
            // Return an empty string if there are no results
            if (words == null || words.Count() < 1)
            {
                return string.Empty;
            }
            words = words.Distinct();       // words are sometimes duplicated, so dedupe them ourselves
            return string.Join(",", words); // join however your own needs dictate
        }
    }
}

Calling it is simple:
string str = "bootstrap-datetimepicker 進一步跟進~~~開始時間和結束時間的樣式顯示";
Console.WriteLine("\nAccurate mode with HMM:\n");
Console.WriteLine(str.GetSplitWordStr());
Console.WriteLine("\nFull mode:\n");
Console.WriteLine(str.GetSplitWordStr(JiebaTypeEnum.CutAll));
Console.WriteLine("\nSearch-engine mode:\n");
Console.WriteLine(str.GetSplitWordStr(JiebaTypeEnum.CutForSearch));
Console.WriteLine("\nAccurate mode without HMM:\n");
Console.WriteLine(str.GetSplitWordStr(JiebaTypeEnum.Other));
Console.ReadKey();

The output:
--------------------------
Some of you may be asking: what about extracting keywords from content? Don't worry, see below:
The dictionary this approach relies on is idf.txt.
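For context: TfidfExtractor scores each candidate word by TF-IDF, and idf.txt supplies the pre-computed idf values. The standard formula (jieba's exact smoothing may differ) is:

$$\mathrm{tfidf}(w, d) = \mathrm{tf}(w, d) \cdot \mathrm{idf}(w), \qquad \mathrm{idf}(w) = \log\frac{N}{\mathrm{df}(w)}$$

where $\mathrm{tf}(w, d)$ is the word's frequency in the document, $N$ is the number of documents in the reference corpus, and $\mathrm{df}(w)$ is the number of documents containing $w$.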
A quick note on Constants:
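As a rough standalone sketch of how it is used (the same call as in the full helper class below; Constants.NounAndVerbPos restricts the results to noun and verb parts of speech):

using System;
using JiebaNet.Segmenter;
using JiebaNet.Analyser;

class KeywordDemo
{
    static void Main()
    {
        var extractor = new TfidfExtractor();
        // Top 10 keywords, filtered to noun/verb parts of speech
        var keywords = extractor.ExtractTags("your article text here", 10, Constants.NounAndVerbPos);
        Console.WriteLine(string.Join(",", keywords));
    }
}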
The output:
Complete helper class (see GitHub for the latest version): https://github.com/dunitian/TempCode/tree/master/2016-09-05
using System.Linq;
using JiebaNet.Segmenter;
using System.Collections.Generic;
using JiebaNet.Analyser;

namespace LoTLib.Word.Split
{
    #region Segmentation modes
    public enum JiebaTypeEnum
    {
        /// <summary>
        /// Accurate mode --- the most basic and natural mode; tries to cut the sentence as precisely as possible, suitable for text analysis
        /// </summary>
        Default,
        /// <summary>
        /// Full mode --- scans out every word that can form a term; faster, but cannot resolve ambiguity
        /// </summary>
        CutAll,
        /// <summary>
        /// Search-engine mode --- on top of accurate mode, cuts long words again to improve recall; suitable for search-engine indexing
        /// </summary>
        CutForSearch,
        /// <summary>
        /// Accurate mode without HMM
        /// </summary>
        Other
    }
    #endregion

    /// <summary>
    /// jieba word segmentation
    /// </summary>
    public static partial class WordSplitHelper
    {
        #region Shared methods
        /// <summary>
        /// Get the collection of segmented words
        /// </summary>
        /// <param name="objStr"></param>
        /// <param name="type"></param>
        /// <returns></returns>
        public static IEnumerable<string> GetSplitWords(string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var jieba = new JiebaSegmenter();
            switch (type)
            {
                case JiebaTypeEnum.Default:
                    return jieba.Cut(objStr);                // accurate mode with HMM
                case JiebaTypeEnum.CutAll:
                    return jieba.Cut(objStr, cutAll: true);  // full mode
                case JiebaTypeEnum.CutForSearch:
                    return jieba.CutForSearch(objStr);       // search-engine mode
                default:
                    return jieba.Cut(objStr, false, false);  // accurate mode without HMM
            }
        }

        /// <summary>
        /// Extract the collection of keywords from an article
        /// </summary>
        /// <param name="objStr"></param>
        /// <returns></returns>
        public static IEnumerable<string> GetArticleKeywords(string objStr)
        {
            var idf = new TfidfExtractor();
            return idf.ExtractTags(objStr, 10, Constants.NounAndVerbPos); // nouns and verbs
        }

        /// <summary>
        /// Return the words joined into a single string
        /// </summary>
        /// <param name="words"></param>
        /// <returns></returns>
        public static string JoinKeyWords(IEnumerable<string> words)
        {
            // Return an empty string if there are no results
            if (words == null || words.Count() < 1)
            {
                return string.Empty;
            }
            words = words.Distinct();       // words are sometimes duplicated, so dedupe them ourselves
            return string.Join(",", words); // join however your own needs dictate
        }
        #endregion

        #region Extension methods
        /// <summary>
        /// Get the segmented words as a single string
        /// </summary>
        /// <param name="objStr"></param>
        /// <param name="type"></param>
        /// <returns></returns>
        public static string GetSplitWordStr(this string objStr, JiebaTypeEnum type = JiebaTypeEnum.Default)
        {
            var words = GetSplitWords(objStr, type);
            return JoinKeyWords(words);
        }

        /// <summary>
        /// Extract an article's keywords as a single string
        /// </summary>
        /// <param name="objStr"></param>
        /// <returns></returns>
        public static string GetArticleKeywordStr(this string objStr)
        {
            var words = GetArticleKeywords(objStr);
            return JoinKeyWords(words);
        }
        #endregion
    }
}
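Calling the keyword extraction is just as simple as the segmentation calls earlier, for example:

string str = "bootstrap-datetimepicker 進一步跟進~~~開始時間和結束時間的樣式顯示";
Console.WriteLine(str.GetArticleKeywordStr());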
If you've stayed patient this far (or you skip straight to endings), you're in luck~
Configuring the dictionaries on the web side is a real pain, so I tweaked the source code a bit.
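The exact tweak isn't reproduced here; the usual pain point is that a relative dictionary path resolves against the IIS worker process's working directory rather than the site root. One common workaround (again assuming the JiebaConfigFileDir key) is an absolute path in web.config:

<appSettings>
  <!-- Assumption: an absolute path sidesteps the working-directory problem under IIS -->
  <add key="JiebaConfigFileDir" value="D:\WebSite\Resources" />
</appSettings>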
Usage is the same as above.
Web version demo:
Related jieba Chinese word segmentation links:
https://github.com/fxsjy/jieba