實現效果:
上一篇文章有附全文搜索結果的設計圖,下面截一張開發完成上線後的實圖:
基本風格是模仿的百度搜索結果,綠色的分頁略顯小清新。
目前已採集並建立索引的文章約3W多篇,索引文件不算太大,查詢速度非常棒。
刀不磨要生鏽,人不學要落後。每天都要學一些新東西。
基本技術介紹:
還記得上一次做全文搜索是在2013年,主要核心設計與代碼均是當時的架構師寫的,自己只能算是全程參與。
當時使用的是經典搭配:盤古分詞+Lucene.net。
前幾篇文章有說到,盤古分詞已經很多年不更新了,我在SupportYun系統一直引用的JieBaNet來做分詞技術。
那麼是否也有成型的JieBaNet+Lucene.Net的全文搜索方案呢?
通過多番尋找,在GitHub上面找到一個簡易的例子:https://github.com/anderscui/jiebaForLuceneNet
博主下面要講的實現方案就是從這個demo獲得的啓發,大家有興趣可以去看看這個demo。
博主使用的具體版本:Lucene.net 3.0.3.0 ,JieBaNet 0.38.3.0(做過簡易的調整與擴展,前面文章有講到)
首先咱們對Lucene.Net的分詞器Tokenizer、分析器Analyzer作一個基於JieBaNet的擴展。
1.基於LuceneNet擴展的JieBa分析器JiebaForLuceneAnalyzer
/// <summary>
/// Lucene.Net analyzer that segments text with JiebaNet, lower-cases every token
/// and then removes stop words.
/// </summary>
public class JiebaForLuceneAnalyzer : Analyzer
{
    /// <summary>
    /// Fallback stop words (Lucene's English stop-word set), used when the JiebaNet
    /// stop-word file is missing or cannot be read.
    /// </summary>
    protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    private static readonly ISet<string> StopWords;

    static JiebaForLuceneAnalyzer()
    {
        StopWords = LoadStopWords();
    }

    /// <summary>
    /// Loads the stop words from the file configured in JiebaNet, one word per line.
    /// </summary>
    /// <returns>
    /// The trimmed, non-empty lines of the stop-word file, or
    /// <see cref="DefaultStopWords"/> when the file does not exist or cannot be read.
    /// </returns>
    private static ISet<string> LoadStopWords()
    {
        try
        {
            var stopWordsFile = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
            if (!File.Exists(stopWordsFile))
            {
                return DefaultStopWords;
            }

            var words = new HashSet<string>();
            foreach (var line in File.ReadAllLines(stopWordsFile))
            {
                var word = line.Trim();
                // Blank lines carry no stop word; do not add an empty entry for them.
                if (word.Length > 0)
                {
                    words.Add(word);
                }
            }
            return words;
        }
        catch (IOException)
        {
            // An exception escaping a static constructor makes the type unusable for
            // the rest of the process, so degrade to the defaults instead.
            return DefaultStopWords;
        }
        catch (UnauthorizedAccessException)
        {
            return DefaultStopWords;
        }
    }

    /// <summary>
    /// Builds the token pipeline: Jieba tokenizer, then lower-casing, then stop-word removal.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>The filtered token stream.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var seg = new JiebaSegmenter();
        TokenStream result = new JiebaForLuceneTokenizer(seg, reader);
        result = new LowerCaseFilter(result);
        result = new StopFilter(true, result, StopWords);
        return result;
    }
}
2.基於LuceneNet擴展的JieBa分詞器:JiebaForLuceneTokenizer
/// <summary>
/// Lucene.Net tokenizer backed by JiebaNet. The whole input is segmented once in the
/// constructor (search mode) and the resulting tokens are replayed by
/// <see cref="IncrementToken"/>.
/// </summary>
public class JiebaForLuceneTokenizer : Tokenizer
{
    private readonly JiebaSegmenter segmenter;
    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;
    private readonly ITypeAttribute typeAtt;

    private readonly List<Token> tokens;
    private int position = -1;

    /// <summary>
    /// Creates a tokenizer over the full content of <paramref name="input"/>.
    /// </summary>
    /// <param name="seg">Segmenter used to split the text.</param>
    /// <param name="input">Reader that is consumed to its end immediately.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public JiebaForLuceneTokenizer(JiebaSegmenter seg, TextReader input)
        : base(input) // hand the reader to the base Tokenizer so disposing this stream closes it
    {
        if (seg == null)
        {
            throw new ArgumentNullException("seg");
        }
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        segmenter = seg;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
        tokens = segmenter.Tokenize(input.ReadToEnd(), TokenizerMode.Search).ToList();
    }

    /// <summary>
    /// Creates a tokenizer over an in-memory string.
    /// </summary>
    /// <param name="seg">Segmenter used to split the text.</param>
    /// <param name="input">Text to tokenize; null is treated as empty text.</param>
    /// <exception cref="ArgumentNullException"><paramref name="seg"/> is null.</exception>
    public JiebaForLuceneTokenizer(JiebaSegmenter seg, string input)
    {
        if (seg == null)
        {
            throw new ArgumentNullException("seg");
        }

        segmenter = seg;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
        tokens = segmenter.Tokenize(input ?? string.Empty, TokenizerMode.Search).ToList();
    }

    /// <summary>
    /// Advances to the next Jieba token and publishes its text, offsets and type.
    /// </summary>
    /// <returns>True when a token was produced; false once all tokens are consumed.</returns>
    public override bool IncrementToken()
    {
        ClearAttributes();
        if (position + 1 >= tokens.Count)
        {
            // End() is deliberately not called here: the stream's consumer invokes it
            // after the last token, and the inherited implementation is not overridden.
            return false;
        }

        position++;
        var token = tokens[position];
        termAtt.SetTermBuffer(token.Word);
        offsetAtt.SetOffset(token.StartIndex, token.EndIndex);
        typeAtt.Type = "Jieba";
        return true;
    }

    /// <summary>
    /// Rewinds the stream so the already-segmented tokens can be replayed from the start.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        position = -1;
    }

    /// <summary>
    /// Segments arbitrary text with this tokenizer's segmenter, independent of the
    /// token stream state.
    /// </summary>
    /// <param name="text">Text to segment.</param>
    /// <param name="mode">Jieba tokenizer mode; defaults to search mode.</param>
    /// <returns>The tokens produced by the segmenter.</returns>
    public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    {
        return segmenter.Tokenize(text, mode);
    }
}
理想若是不向現實作一點點屈服,那麼理想也將歸於塵土。
實現方案設計:
咱們作全文搜索的設計時必定會考慮的一個問題就是:咱們系統是分不少模塊的,不一樣模塊的字段差別很大,怎麼才能實現同一個索引,既能夠單個模塊搜索又能夠全站搜索,甚至按一些字段作條件來搜索呢?
這些也是SupportYun系統須要考慮的問題,由於目前的數據就自然的拆分紅了活動、文章兩個類別,字段也大有不一樣。博主想實現的是一個能夠全站搜索(結果包括活動、文章),也能夠在文章欄目/活動欄目分別搜索,而且能夠按幾個指定字段來作搜索條件。
要作一個這樣的全文搜索功能,咱們須要從程序設計上來下功夫。下面就介紹一下博主的設計方案:
1、索引建立
1.咱們設計一個IndexManager來處理最基本的索引建立、更新、刪除操做。
/// <summary>
/// Adds, updates and deletes documents in the Lucene index stored at
/// <see cref="IndexStorePath"/>. Every operation opens its own writer, commits,
/// optimizes and closes it again. Failures are logged through LogUtils and never
/// propagate to the caller.
/// </summary>
public class IndexManager
{
    /// <summary>
    /// Index directory, read from the "IndexStorePath" application setting.
    /// </summary>
    public static readonly string IndexStorePath = ConfigurationManager.AppSettings["IndexStorePath"];

    // Dedicated lock object: locking on the public path string would share the monitor
    // with any other code that happens to lock the same string instance.
    private static readonly object WriterLock = new object();

    /// <summary>
    /// Adds one document per item to the index.
    /// </summary>
    /// <param name="indexContents">Items to index; an empty list is a no-op.</param>
    public void BuildIndex(List<IndexContent> indexContents)
    {
        if (indexContents == null)
        {
            LogUtils.ErrorLog(new ArgumentNullException("indexContents"));
            return;
        }
        if (indexContents.Count == 0)
        {
            return;
        }

        ExecuteWithWriter(writer =>
        {
            foreach (var indexContent in indexContents)
            {
                writer.AddDocument(GetDocument(indexContent));
            }
        });
    }

    /// <summary>
    /// Deletes the documents identified by module, row id and (optionally) table name.
    /// </summary>
    /// <param name="moduleType">Module the document belongs to.</param>
    /// <param name="tableName">Table name; may be null or empty to match any table.</param>
    /// <param name="rowID">Row id; compared case-insensitively (stored lower-cased).</param>
    public void DeleteIndex(string moduleType, string tableName, string rowID)
    {
        if (moduleType == null || rowID == null)
        {
            LogUtils.ErrorLog(new ArgumentNullException(moduleType == null ? "moduleType" : "rowID"));
            return;
        }

        ExecuteWithWriter(writer =>
            writer.DeleteDocuments(BuildIdentityQuery(moduleType, tableName, rowID)));
    }

    /// <summary>
    /// Replaces the indexed document of <paramref name="indexContent"/> with a fresh one.
    /// </summary>
    /// <param name="indexContent">New content for the document.</param>
    public void UpdateIndex(IndexContent indexContent)
    {
        if (indexContent == null)
        {
            LogUtils.ErrorLog(new ArgumentNullException("indexContent"));
            return;
        }

        ExecuteWithWriter(writer =>
        {
            // Build the replacement first so a failure here cannot leave the old
            // document deleted without a successor.
            var document = GetDocument(indexContent);
            writer.DeleteDocuments(BuildIdentityQuery(
                indexContent.ModuleType, indexContent.TableName, indexContent.RowId.ToString()));
            writer.AddDocument(document);
        });
    }

    /// <summary>
    /// Opens the index directory and a writer, runs <paramref name="work"/>, then commits
    /// and optimizes. Opening happens inside the lock so only one writer is open at a
    /// time within this process, and both resources are disposed exactly once.
    /// </summary>
    private static void ExecuteWithWriter(Action<IndexWriter> work)
    {
        try
        {
            lock (WriterLock)
            {
                using (var directory = FSDirectory.Open(new DirectoryInfo(IndexStorePath)))
                using (var writer = new IndexWriter(directory, new JiebaForLuceneAnalyzer(),
                    IndexWriter.MaxFieldLength.LIMITED))
                {
                    work(writer);
                    writer.Commit();
                    writer.Optimize();
                }
            }
        }
        catch (Exception exception)
        {
            LogUtils.ErrorLog(exception);
        }
    }

    /// <summary>
    /// Builds the query that identifies a document by ModuleType + RowId (+ TableName).
    /// </summary>
    private static BooleanQuery BuildIdentityQuery(string moduleType, string tableName, string rowId)
    {
        var query = new BooleanQuery();
        query.Add(new TermQuery(new Term("ModuleType", moduleType ?? string.Empty)), Occur.MUST);
        query.Add(new TermQuery(new Term("RowId", NormalizeRowId(rowId))), Occur.MUST);
        if (!string.IsNullOrEmpty(tableName))
        {
            query.Add(new TermQuery(new Term("TableName", tableName)), Occur.MUST);
        }
        return query;
    }

    /// <summary>
    /// Row ids are stored lower-cased and not analyzed, so lookups must use the same form.
    /// </summary>
    private static string NormalizeRowId(string rowId)
    {
        return (rowId ?? string.Empty).ToLowerInvariant();
    }

    /// <summary>
    /// Converts an <see cref="IndexContent"/> into a Lucene document. Null string values
    /// are stored as empty strings instead of failing the whole batch.
    /// </summary>
    private Document GetDocument(IndexContent indexContent)
    {
        var doc = new Document();
        doc.Add(new Field("ModuleType", indexContent.ModuleType ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("TableName", indexContent.TableName ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("RowId", NormalizeRowId(indexContent.RowId.ToString()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("Title", indexContent.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("IndexTextContent", ReplaceIndexSensitiveWords(indexContent.IndexTextContent), Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("CollectTime", indexContent.CollectTime.ToString("yyyy-MM-dd HH:mm:ss"), Field.Store.YES, Field.Index.NO));

        // Reserved text fields Tag1..Tag8, each with its own store/index configuration.
        var stringTags = new[]
        {
            indexContent.Tag1, indexContent.Tag2, indexContent.Tag3, indexContent.Tag4,
            indexContent.Tag5, indexContent.Tag6, indexContent.Tag7, indexContent.Tag8
        };
        for (var i = 0; i < stringTags.Length; i++)
        {
            AddStringTag(doc, "Tag" + (i + 1), stringTags[i]);
        }

        // Reserved numeric fields.
        AddFloatTag(doc, "FloatTag9", indexContent.FloatTag9);
        AddFloatTag(doc, "FloatTag10", indexContent.FloatTag10);
        return doc;
    }

    /// <summary>
    /// Adds one reserved text field. A null tag is treated like a default-constructed one.
    /// </summary>
    private void AddStringTag(Document doc, string name, IndexContentStringValue tag)
    {
        var value = tag ?? new IndexContentStringValue();
        if (!value.Store && value.Index == IndexEnum.NotIndex)
        {
            // Lucene rejects a field that is neither stored nor indexed; there is
            // nothing to persist for it, so skip it.
            return;
        }
        doc.Add(new Field(name, value.Value ?? string.Empty, GetStoreEnum(value.Store), GetIndexEnum(value.Index)));
    }

    /// <summary>
    /// Adds one reserved numeric field. A null tag is treated like a default-constructed one.
    /// </summary>
    private void AddFloatTag(Document doc, string name, IndexContentFloatValue tag)
    {
        var value = tag ?? new IndexContentFloatValue();
        var field = new NumericField(name, GetStoreEnum(value.Store), value.Index != IndexEnum.NotIndex);
        doc.Add(field.SetFloatValue(value.Value));
    }

    /// <summary>
    /// Stop-gap helper: strips spaces and line feeds from text before it is indexed.
    /// </summary>
    /// <param name="str">Raw text; null yields an empty string.</param>
    /// <returns>The text without the removed characters.</returns>
    private string ReplaceIndexSensitiveWords(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }

        // string.Replace already removes every occurrence, so a single pass is enough.
        // NOTE(review): the two space literals look identical in this listing; they were
        // presumably different originally (e.g. "&nbsp;" or a full-width space) - confirm.
        return str.Replace(" ", "").Replace(" ", "").Replace("\n", "");
    }

    /// <summary>Maps the project's index mode to Lucene's <c>Field.Index</c>.</summary>
    private Field.Index GetIndexEnum(IndexEnum index)
    {
        switch (index)
        {
            case IndexEnum.NotUseAnalyzerButIndex:
                return Field.Index.NOT_ANALYZED;
            case IndexEnum.UseAnalyzerIndex:
                return Field.Index.ANALYZED;
            default:
                // IndexEnum.NotIndex and any unknown value: do not index.
                return Field.Index.NO;
        }
    }

    /// <summary>Maps the "store" flag to Lucene's <c>Field.Store</c>.</summary>
    private Field.Store GetStoreEnum(bool store)
    {
        return store ? Field.Store.YES : Field.Store.NO;
    }
}
2.建立、更新使用到的標準數據類:IndexContent。
咱們設計TableName(對應DB表名)、RowId(對應DB主鍵)、CollectTime(對應DB數據建立時間)、ModuleType(所屬系統模塊)、Title(檢索標題)、IndexTextContent(檢索文本)等六個基礎字段,全部模塊須要建立索引必須構建該6個字段(你們可據具體狀況擴展)。
而後設計10個預留字段Tag1-Tag10,用以兼容各大模塊其餘不一樣字段。
預留字段的存儲、索引方式可獨立配置。
/// <summary>
/// Index content with ten reserved fields on top of the base fields:
/// eight text fields (Tag1-Tag8) and two numeric fields (FloatTag9, FloatTag10).
/// </summary>
public class IndexContent : BaseIndexContent
{
    /// <summary>
    /// Initializes every reserved field with its default value object.
    /// </summary>
    public IndexContent()
    {
        Tag1 = new IndexContentStringValue();
        Tag2 = new IndexContentStringValue();
        Tag3 = new IndexContentStringValue();
        Tag4 = new IndexContentStringValue();
        Tag5 = new IndexContentStringValue();
        Tag6 = new IndexContentStringValue();
        Tag7 = new IndexContentStringValue();
        Tag8 = new IndexContentStringValue();
        FloatTag9 = new IndexContentFloatValue();
        FloatTag10 = new IndexContentFloatValue();
    }

    /// <summary>Reserved text field 1.</summary>
    public IndexContentStringValue Tag1 { get; set; }

    /// <summary>Reserved text field 2.</summary>
    public IndexContentStringValue Tag2 { get; set; }

    /// <summary>Reserved text field 3.</summary>
    public IndexContentStringValue Tag3 { get; set; }

    /// <summary>Reserved text field 4.</summary>
    public IndexContentStringValue Tag4 { get; set; }

    /// <summary>Reserved text field 5.</summary>
    public IndexContentStringValue Tag5 { get; set; }

    /// <summary>Reserved text field 6.</summary>
    public IndexContentStringValue Tag6 { get; set; }

    /// <summary>Reserved text field 7.</summary>
    public IndexContentStringValue Tag7 { get; set; }

    /// <summary>Reserved text field 8.</summary>
    public IndexContentStringValue Tag8 { get; set; }

    /// <summary>Reserved numeric field 9.</summary>
    public IndexContentFloatValue FloatTag9 { get; set; }

    /// <summary>Reserved numeric field 10.</summary>
    public IndexContentFloatValue FloatTag10 { get; set; }
}

/// <summary>
/// A text value together with how it is stored and indexed.
/// </summary>
public class IndexContentStringValue
{
    /// <summary>
    /// Defaults: empty value, stored, not indexed.
    /// </summary>
    public IndexContentStringValue()
    {
        Value = "";
        Store = true;
        Index = IndexEnum.NotIndex;
    }

    /// <summary>The text value.</summary>
    public string Value { get; set; }

    /// <summary>Whether the value is stored in the index.</summary>
    public bool Store { get; set; }

    /// <summary>Indexing / analysis mode.</summary>
    public IndexEnum Index { get; set; }
}

/// <summary>
/// A numeric value together with how it is stored and indexed.
/// </summary>
public class IndexContentFloatValue
{
    /// <summary>
    /// Defaults: zero, stored, not indexed.
    /// </summary>
    public IndexContentFloatValue()
    {
        Value = 0;
        Store = true;
        Index = IndexEnum.NotIndex;
    }

    /// <summary>The numeric value.</summary>
    public float Value { get; set; }

    /// <summary>Whether the value is stored in the index.</summary>
    public bool Store { get; set; }

    /// <summary>Indexing mode.</summary>
    public IndexEnum Index { get; set; }
}
其中BaseIndexContent含有六個基礎字段。
3.建立一個子模塊索引構建器的接口:IIndexBuilder。
各子模塊經過繼承實現IIndexBuilder,來實現索引的操做。
/// <summary>
/// Contract for a module-specific index builder.
/// </summary>
/// <typeparam name="TIndexContent">Module-specific content type to be indexed.</typeparam>
public interface IIndexBuilder<TIndexContent>
{
    /// <summary>
    /// Adds the given items to the index.
    /// </summary>
    /// <param name="indexContents">Items to index.</param>
    void BuildIndex(List<TIndexContent> indexContents);

    /// <summary>
    /// Removes an item from the index.
    /// </summary>
    /// <param name="tableName">Name of the table the item comes from.</param>
    /// <param name="rowID">Row id of the item.</param>
    void DeleteIndex(string tableName, string rowID);

    /// <summary>
    /// Updates the index entries of the given items.
    /// </summary>
    /// <param name="indexContents">Items whose index entries are updated.</param>
    void UpdateIndex(List<TIndexContent> indexContents);
}
4.下面咱們以活動模塊爲例,來實現索引建立。
a)首先建立一個基於活動模塊的數據類:ActivityIndexContent,能夠將咱們須要索引或存儲的字段都設計在內。
/// <summary>
/// Data of one activity that is to be indexed or stored.
/// </summary>
public class ActivityIndexContent
{
    /// <summary>Name of the related table.</summary>
    public string TableName { get; set; }

    /// <summary>Row id in the related table.</summary>
    public Guid RowId { get; set; }

    /// <summary>Time the item was collected and analyzed.</summary>
    public DateTime CollectTime { get; set; }

    /// <summary>Title of the activity.</summary>
    public string Title { get; set; }

    /// <summary>Detail text.</summary>
    public string InformationContent { get; set; }

    /// <summary>Activity categories.</summary>
    public List<ActivityType> ActivityTypes { get; set; }

    /// <summary>Id of the city.</summary>
    public Guid CityId { get; set; }

    /// <summary>Activity address.</summary>
    public string Address { get; set; }

    /// <summary>Activity date, if known.</summary>
    public DateTime? ActivityDate { get; set; }

    /// <summary>Source link.</summary>
    public string Url { get; set; }

    /// <summary>Name of the collection source.</summary>
    public string SourceName { get; set; }

    /// <summary>Main site address of the collection source.</summary>
    public string SourceUrl { get; set; }

    /// <summary>Official hotline of the collection source.</summary>
    public string SourceOfficialHotline { get; set; }
}
b)咱們再建立ActivityIndexBuilder並繼承IIndexBuilder,實現其建立、更新、刪除方法。
/// <summary>
/// Index builder for the activity module: maps <see cref="ActivityIndexContent"/>
/// items onto the generic <see cref="IndexContent"/> and delegates to
/// <see cref="IndexManager"/>.
/// </summary>
public class ActivityIndexBuilder : IIndexBuilder<ActivityIndexContent>
{
    /// <summary>Module type value written to every activity document.</summary>
    public const string MODULETYPE = "活動";

    /// <summary>
    /// Adds the given activities to the index.
    /// </summary>
    /// <param name="activityIndexContents">Activities to index; null or empty is a no-op.</param>
    public void BuildIndex(List<ActivityIndexContent> activityIndexContents)
    {
        if (activityIndexContents == null || activityIndexContents.Count == 0)
        {
            return;
        }

        var indexContents = activityIndexContents
            .Where(activity => activity != null)
            .Select(activity => ToIndexContent(activity))
            .ToList();
        new IndexManager().BuildIndex(indexContents);
    }

    /// <summary>
    /// Removes one activity from the index.
    /// </summary>
    /// <param name="tableName">Name of the table the activity comes from.</param>
    /// <param name="rowID">Row id of the activity; matched case-insensitively.</param>
    public void DeleteIndex(string tableName, string rowID)
    {
        // Row ids are indexed lower-cased (UpdateIndex below already lower-cases them),
        // so normalise here as well to make direct calls match.
        var normalizedRowId = rowID == null ? null : rowID.ToLowerInvariant();
        new IndexManager().DeleteIndex(MODULETYPE, tableName, normalizedRowId);
    }

    /// <summary>
    /// Re-indexes the given activities: existing entries are deleted first, then all
    /// items are added again.
    /// </summary>
    /// <param name="indexContents">Activities to update; null or empty is a no-op.</param>
    public void UpdateIndex(List<ActivityIndexContent> indexContents)
    {
        if (indexContents == null || indexContents.Count == 0)
        {
            return;
        }

        foreach (var indexContent in indexContents)
        {
            // Only items that can be identified in the index are deleted beforehand.
            if (indexContent != null &&
                indexContent.RowId != Guid.Empty &&
                indexContent.TableName != null)
            {
                DeleteIndex(indexContent.TableName, indexContent.RowId.ToString());
            }
        }

        BuildIndex(indexContents);
    }

    /// <summary>
    /// Maps one activity onto the generic index content. Null strings become empty
    /// strings so a single missing value cannot make the whole batch fail.
    /// </summary>
    private static IndexContent ToIndexContent(ActivityIndexContent activity)
    {
        return new IndexContent
        {
            ModuleType = MODULETYPE,
            TableName = activity.TableName,
            RowId = activity.RowId,
            Title = activity.Title ?? string.Empty,
            IndexTextContent = activity.InformationContent ?? string.Empty,
            CollectTime = activity.CollectTime,
            // Activity categories
            Tag1 = new IndexContentStringValue { Value = activity.GetActivityTypeStr() ?? string.Empty },
            // Source link
            Tag2 = new IndexContentStringValue { Value = activity.Url ?? string.Empty },
            // Collection source name (analyzed)
            Tag3 = new IndexContentStringValue
            {
                Value = activity.SourceName ?? string.Empty,
                Index = IndexEnum.UseAnalyzerIndex
            },
            // Collection source official hotline
            Tag4 = new IndexContentStringValue { Value = activity.SourceOfficialHotline ?? string.Empty },
            // Collection source main site address
            Tag5 = new IndexContentStringValue { Value = activity.SourceUrl ?? string.Empty },
            // City id (indexed as a single term)
            Tag6 = new IndexContentStringValue
            {
                Value = activity.CityId.ToString().ToLower(),
                Index = IndexEnum.NotUseAnalyzerButIndex
            },
            // Activity address
            Tag7 = new IndexContentStringValue { Value = activity.Address ?? string.Empty },
            // Activity date, formatted for display
            Tag8 = new IndexContentStringValue
            {
                Value = activity.ActivityDate.HasValue
                    ? activity.ActivityDate.Value.ToString("yyyy年MM月dd日")
                    : ""
            }
        };
    }
}
代碼就不解釋了,很簡單。主要就是調用IndexManager來執行操做。
咱們只須要在須要建立活動數據索引的業務點,構建ActivityIndexBuilder對象,並構建ActivityIndexContent集合做爲參數,調用BuildIndex方法便可。
2、全文搜索
全文搜索咱們採用一樣的設計方式。
1.設計一個抽象的搜索類:BaseIndexSearch,全部搜索模塊(包括全站)均需繼承它來實現搜索效果。
/// <summary>
/// Base class of all index searches (site-wide and per module). Subclasses choose the
/// result item type, add module-specific filters and fill module-specific result fields.
/// </summary>
/// <typeparam name="TIndexSearchResultItem">Type of a single search hit.</typeparam>
public abstract class BaseIndexSearch<TIndexSearchResultItem>
    where TIndexSearchResultItem : IndexSearchResultItem
{
    /// <summary>
    /// Index directory, read from the "IndexStorePath" application setting.
    /// </summary>
    private static readonly string IndexStorePath = ConfigurationManager.AppSettings["IndexStorePath"];

    // Guards creation and reopening of the shared searcher. Static members of a generic
    // class exist once per closed type, so searcher and lock are per result item type.
    private static readonly object SearcherLock = new object();

    // Characters removed from user input before it is tokenized.
    // NOTE(review): the original listing replaced most of these twice with identical
    // literals; the second of each pair was presumably the full-width variant - confirm.
    private static readonly char[] QuerySyntaxChars =
    {
        '+', '-', '!', '(', ')', ':', '^', '[', ']', '【', '】', '{', '}', '~', '*', '?'
    };

    private readonly string[] fieldsToSearch;
    protected static readonly SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
    private static IndexSearcher indexSearcher = null;

    /// <summary>
    /// Size of the highlighted fragment taken from a hit.
    /// </summary>
    public int FragmentSize { get; set; }

    /// <summary>
    /// Creates the search.
    /// </summary>
    /// <param name="fieldsToSearch">Text fields the user query is matched against.</param>
    protected BaseIndexSearch(string[] fieldsToSearch)
    {
        FragmentSize = 100;
        this.fieldsToSearch = fieldsToSearch;
    }

    /// <summary>
    /// Creates an empty result item.
    /// </summary>
    protected abstract TIndexSearchResultItem CreateIndexSearchResultItem();

    /// <summary>
    /// Lets the subclass fill module-specific properties (mainly the tag fields).
    /// </summary>
    /// <param name="indexSearchResultItem">Result item being built.</param>
    /// <param name="content">Space-separated search keywords.</param>
    /// <param name="docIndex">Document number inside the index.</param>
    /// <param name="doc">Stored document of the hit.</param>
    protected abstract void ModifyIndexSearchResultItem(ref TIndexSearchResultItem indexSearchResultItem, string content, int docIndex, Document doc);

    /// <summary>
    /// Lets the subclass add or change filter conditions.
    /// </summary>
    /// <param name="filter">Field/value pairs every hit must match.</param>
    protected abstract void ModifySearchFilter(ref Dictionary<string, string> filter);

    /// <summary>
    /// Searches the index.
    /// </summary>
    /// <param name="content">User search text.</param>
    /// <param name="filter">Field/value pairs every hit must match; null means no restriction.</param>
    /// <param name="fieldSorts">Fields to sort by; null or empty sorts by relevance.</param>
    /// <param name="pageIndex">1-based page number.</param>
    /// <param name="pageSize">Hits per page.</param>
    /// <returns>The requested page, or null when the input is invalid or the search fails
    /// (the failure is logged).</returns>
    public PagedIndexSearchResult<TIndexSearchResultItem> Search(string content
        , Dictionary<string, string> filter = null, List<FieldSort> fieldSorts = null
        , int pageIndex = 1, int pageSize = 20)
    {
        try
        {
            if (!string.IsNullOrEmpty(content))
            {
                content = ReplaceIndexSensitiveWords(content);
                content = GetKeywordsSplitBySpace(content, new JiebaSegmenter());
            }
            if (string.IsNullOrEmpty(content) || pageIndex < 1 || pageSize < 1)
            {
                throw new ArgumentException("輸入參數不符合要求(用戶輸入爲空,頁碼或每頁結果數小於1)");
            }

            var stopWatch = Stopwatch.StartNew();

            Analyzer analyzer = new JiebaForLuceneAnalyzer();
            var query = MakeSearchQuery(content, analyzer);

            // Work on a copy so the caller's dictionary is never modified.
            filter = filter == null ? new Dictionary<string, string>() : new Dictionary<string, string>(filter);
            ModifySearchFilter(ref filter);
            Filter luceneFilter = MakeSearchFilter(filter);

            // Everything up to the end of the requested page has to be collected.
            int totalCollectCount = checked(pageIndex * pageSize);
            Sort sort = GetSortByFieldSorts(fieldSorts);

            var result = new PagedIndexSearchResult<TIndexSearchResultItem>
            {
                PageIndex = pageIndex,
                PageSize = pageSize
            };

            IndexSearcher searcher = AcquireSearcher();
            try
            {
                TopDocs topDocs = searcher.Search(query, luceneFilter, totalCollectCount, sort ?? Sort.RELEVANCE);
                result.TotalRecords = topDocs.TotalHits;

                ScoreDoc[] hits = topDocs.ScoreDocs;
                int first = (pageIndex - 1) * pageSize;
                int last = Math.Min(totalCollectCount, hits.Length);
                for (var i = first; i < last; i++)
                {
                    var scoreDoc = hits[i];
                    var doc = searcher.Doc(scoreDoc.Doc);
                    var item = BuildResultItem(scoreDoc.Doc, doc, content);
                    ModifyIndexSearchResultItem(ref item, content, scoreDoc.Doc, doc);
                    result.Add(item);
                }
            }
            finally
            {
                ReleaseSearcher(searcher);
            }

            stopWatch.Stop();
            result.Elapsed = stopWatch.ElapsedMilliseconds * 1.0 / 1000;
            return result;
        }
        catch (Exception exception)
        {
            LogUtils.ErrorLog(exception);
            return null;
        }
    }

    /// <summary>
    /// Returns the shared searcher, creating it on first use and reopening it when the
    /// index has changed. The reader's reference count is incremented for the caller,
    /// who must hand the searcher back through <see cref="ReleaseSearcher"/>.
    /// </summary>
    private static IndexSearcher AcquireSearcher()
    {
        lock (SearcherLock)
        {
            if (indexSearcher == null)
            {
                FSDirectory entityDirectory = FSDirectory.Open(new DirectoryInfo(IndexStorePath));
                indexSearcher = new IndexSearcher(IndexReader.Open(entityDirectory, true));
            }
            else
            {
                IndexReader currentReader = indexSearcher.IndexReader;
                if (!currentReader.IsCurrent())
                {
                    IndexReader reopened = currentReader.Reopen();
                    if (!ReferenceEquals(reopened, currentReader))
                    {
                        // The old searcher is not disposed here because searches on other
                        // threads may still use it. Dropping our reference to the old
                        // reader lets it close once those searches have released it.
                        indexSearcher = new IndexSearcher(reopened);
                        currentReader.DecRef();
                    }
                }
            }

            indexSearcher.IndexReader.IncRef();
            return indexSearcher;
        }
    }

    /// <summary>
    /// Gives back a searcher obtained from <see cref="AcquireSearcher"/>.
    /// </summary>
    private static void ReleaseSearcher(IndexSearcher searcher)
    {
        searcher.IndexReader.DecRef();
    }

    /// <summary>
    /// Fills the base properties of a hit, including highlighted title and content.
    /// </summary>
    private TIndexSearchResultItem BuildResultItem(int docIndex, Document doc, string keywords)
    {
        var item = CreateIndexSearchResultItem();
        item.DocIndex = docIndex;
        item.ModuleType = doc.Get("ModuleType");
        item.TableName = doc.Get("TableName");

        // A single malformed stored value must not fail the whole search.
        Guid rowId;
        if (Guid.TryParse(doc.Get("RowId"), out rowId))
        {
            item.RowId = rowId;
        }
        DateTime collectTime;
        if (DateTime.TryParse(doc.Get("CollectTime"), out collectTime))
        {
            item.CollectTime = collectTime;
        }

        string storedTitle = doc.Get("Title");
        var title = GetHighlighter(formatter, FragmentSize).GetBestFragment(keywords, storedTitle ?? string.Empty);
        item.Title = string.IsNullOrEmpty(title) ? storedTitle : title;

        string storedText = doc.Get("IndexTextContent") ?? string.Empty;
        var text = GetHighlighter(formatter, FragmentSize).GetBestFragment(keywords, storedText);
        if (string.IsNullOrEmpty(text))
        {
            // No highlighted fragment: fall back to the first 100 characters.
            text = storedText.Length > 100 ? storedText.Substring(0, 100) : storedText;
        }
        item.Content = text;
        return item;
    }

    /// <summary>
    /// Converts the requested sorts into a Lucene sort; all sort fields are treated as floats.
    /// </summary>
    /// <returns>The sort, or null when no sort fields were given.</returns>
    private Sort GetSortByFieldSorts(List<FieldSort> fieldSorts)
    {
        if (fieldSorts == null || fieldSorts.Count == 0)
        {
            return null;
        }
        return new Sort(fieldSorts.Select(fieldSort => new SortField(fieldSort.FieldName, SortField.FLOAT, !fieldSort.Ascend)).ToArray());
    }

    /// <summary>
    /// Builds a filter that requires an exact term match for every field/value pair.
    /// </summary>
    /// <returns>The filter, or null when there are no conditions.</returns>
    private static Filter MakeSearchFilter(Dictionary<string, string> filter)
    {
        if (filter == null || filter.Count == 0)
        {
            return null;
        }

        var booleanQuery = new BooleanQuery();
        foreach (KeyValuePair<string, string> keyValuePair in filter)
        {
            booleanQuery.Add(new TermQuery(new Term(keyValuePair.Key, keyValuePair.Value)), Occur.MUST);
        }
        return new QueryWrapperFilter(booleanQuery);
    }

    /// <summary>
    /// Parses the space-separated keywords into a query over the configured fields.
    /// </summary>
    private Query MakeSearchQuery(string content, Analyzer analyzer)
    {
        var query = new BooleanQuery();
        if (!string.IsNullOrEmpty(content))
        {
            QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_30, fieldsToSearch, analyzer);
            Query queryObj;
            try
            {
                queryObj = parser.Parse(EscapeKeywords(content));
            }
            catch (ParseException parseException)
            {
                throw new InvalidOperationException("在BaseIndexSearch中構造Query時出錯。", parseException);
            }
            query.Add(queryObj, Occur.MUST);
        }
        return query;
    }

    /// <summary>
    /// Makes the keywords safe for the query parser: remaining syntax characters are
    /// escaped, and bare boolean operators are lower-cased so they are parsed as
    /// ordinary terms (the analyzer lower-cases terms anyway).
    /// </summary>
    private static string EscapeKeywords(string keywords)
    {
        var escaped = new StringBuilder();
        foreach (var keyword in keywords.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            var term = keyword;
            if (term == "AND" || term == "OR" || term == "NOT")
            {
                term = term.ToLowerInvariant();
            }
            if (escaped.Length > 0)
            {
                escaped.Append(' ');
            }
            escaped.Append(QueryParser.Escape(term));
        }
        return escaped.ToString();
    }

    /// <summary>
    /// Segments the user input with Jieba (search mode) and joins the non-blank words
    /// with single spaces.
    /// </summary>
    private string GetKeywordsSplitBySpace(string keywords, JiebaSegmenter segmenter)
    {
        var result = new StringBuilder();
        foreach (var word in segmenter.Tokenize(keywords, TokenizerMode.Search))
        {
            if (string.IsNullOrWhiteSpace(word.Word))
            {
                continue;
            }
            result.AppendFormat("{0} ", word.Word);
        }
        return result.ToString().Trim();
    }

    /// <summary>
    /// Removes the characters listed in <see cref="QuerySyntaxChars"/> from the user input.
    /// </summary>
    private string ReplaceIndexSensitiveWords(string str)
    {
        var cleaned = new StringBuilder(str.Length);
        foreach (char c in str)
        {
            if (Array.IndexOf(QuerySyntaxChars, c) < 0)
            {
                cleaned.Append(c);
            }
        }
        return cleaned.ToString();
    }

    /// <summary>
    /// Creates a highlighter that marks hits with the given formatter.
    /// </summary>
    /// <param name="formatter">Formatter that wraps matched keywords.</param>
    /// <param name="fragmentSize">Size of the fragment to return.</param>
    protected Highlighter GetHighlighter(Formatter formatter, int fragmentSize)
    {
        var highlighter = new Highlighter(formatter, new Segment()) { FragmentSize = fragmentSize };
        return highlighter;
    }
}
幾個protected abstract方法,是須要繼承的子類來實現的。
其中爲了實現搜索結果對命中關鍵詞進行高亮顯示,特引用了盤古分詞的Highlighter。原則上此處應該參照盤古分詞的源碼,自己使用JieBaNet來做實現,但由於工期較緊,直接引用了盤古。
2.咱們設計一個IndexSearchResultItem,表示搜索結果的基類。
/// <summary>
/// A single hit of an index search; base class of all module-specific hits.
/// </summary>
public class IndexSearchResultItem
{
    /// <summary>Document number inside the index.</summary>
    public int DocIndex { get; set; }

    /// <summary>Module the document belongs to.</summary>
    public string ModuleType { get; set; }

    /// <summary>Name of the table the document comes from.</summary>
    public string TableName { get; set; }

    /// <summary>Row id of the document.</summary>
    public Guid RowId { get; set; }

    /// <summary>Title of the document.</summary>
    public string Title { get; set; }

    /// <summary>Fragment of the document content.</summary>
    public string Content { get; set; }

    /// <summary>Collection time, if available.</summary>
    public DateTime? CollectTime { get; set; }
}
3.咱們來看看具體的實現,先來看全站搜索的SearchService
/// <summary>
/// Site-wide search over the "IndexTextContent" and "Title" fields. Hits are returned
/// as plain <see cref="IndexSearchResultItem"/> objects and no extra filter is applied.
/// </summary>
public class IndexSearch : BaseIndexSearch<IndexSearchResultItem>
{
    /// <summary>
    /// Creates the site-wide search.
    /// </summary>
    public IndexSearch()
        : base(new[] { "IndexTextContent", "Title" })
    {
    }

    /// <summary>
    /// Creates an empty base result item.
    /// </summary>
    protected override IndexSearchResultItem CreateIndexSearchResultItem()
    {
        var item = new IndexSearchResultItem();
        return item;
    }

    /// <summary>
    /// Nothing to add: the base fields are all a site-wide hit carries.
    /// </summary>
    protected override void ModifyIndexSearchResultItem(ref IndexSearchResultItem indexSearchResultItem, string content,
        int docIndex, Document doc)
    {
    }

    /// <summary>
    /// Nothing to add: a site-wide search is not restricted to a module.
    /// </summary>
    protected override void ModifySearchFilter(ref Dictionary<string, string> filter)
    {
    }
}
是不是非常簡單。由於我們此處搜索的是全站,結果展現直接用基類,取出基本字段即可。
4.再列舉一個活動的搜索實現。
a)咱們首先建立一個活動搜索結果類ActivityIndexSearchResultItem,繼承自結果基類IndexSearchResultItem
/// <summary>
/// A search hit of the activity module: the base hit plus the activity-specific fields.
/// </summary>
public class ActivityIndexSearchResultItem : IndexSearchResultItem
{
    /// <summary>Activity categories.</summary>
    public string ActivityTypes { get; set; }

    /// <summary>Id of the city.</summary>
    public Guid CityId { get; set; }

    /// <summary>Activity address.</summary>
    public string Address { get; set; }

    /// <summary>Activity date as stored text.</summary>
    public string ActivityDate { get; set; }

    /// <summary>Source link.</summary>
    public string Url { get; set; }

    /// <summary>Name of the collection source.</summary>
    public string SourceName { get; set; }

    /// <summary>Main site address of the collection source.</summary>
    public string SourceUrl { get; set; }

    /// <summary>Official hotline of the collection source.</summary>
    public string SourceOfficialHotline { get; set; }
}
b)而後建立活動模塊的搜索服務:ActivityIndexSearch,一樣須要繼承BaseIndexSearch,這時候ActivityIndexSearch只須要相對全站搜索修改幾個參數便可。
/// <summary>
/// Search restricted to the activity module. Hits are returned as
/// <see cref="ActivityIndexSearchResultItem"/> with the activity fields read from Tag1-Tag8.
/// </summary>
public class ActivityIndexSearch : BaseIndexSearch<ActivityIndexSearchResultItem>
{
    /// <summary>
    /// Creates the activity search over the "IndexTextContent" and "Title" fields.
    /// </summary>
    public ActivityIndexSearch()
        : base(new[] { "IndexTextContent", "Title" })
    {
    }

    /// <summary>
    /// Creates an empty activity result item.
    /// </summary>
    protected override ActivityIndexSearchResultItem CreateIndexSearchResultItem()
    {
        return new ActivityIndexSearchResultItem();
    }

    /// <summary>
    /// Copies the activity-specific stored fields into the result item.
    /// </summary>
    /// <param name="indexSearchResultItem">Result item being built.</param>
    /// <param name="content">Search keywords (not used here).</param>
    /// <param name="docIndex">Document number inside the index (not used here).</param>
    /// <param name="doc">Stored document of the hit.</param>
    protected override void ModifyIndexSearchResultItem(ref ActivityIndexSearchResultItem indexSearchResultItem, string content,
        int docIndex, Document doc)
    {
        indexSearchResultItem.ActivityTypes = doc.Get("Tag1");
        indexSearchResultItem.Url = doc.Get("Tag2");
        indexSearchResultItem.SourceName = doc.Get("Tag3");
        indexSearchResultItem.SourceOfficialHotline = doc.Get("Tag4");
        indexSearchResultItem.SourceUrl = doc.Get("Tag5");

        // A missing or malformed city id leaves CityId at Guid.Empty instead of throwing.
        Guid cityId;
        if (Guid.TryParse(doc.Get("Tag6"), out cityId))
        {
            indexSearchResultItem.CityId = cityId;
        }

        indexSearchResultItem.Address = doc.Get("Tag7");
        indexSearchResultItem.ActivityDate = doc.Get("Tag8");
    }

    /// <summary>
    /// Restricts the search to documents of the activity module.
    /// </summary>
    /// <param name="filter">Filter conditions; a caller-supplied "ModuleType" is overridden.</param>
    protected override void ModifySearchFilter(ref Dictionary<string, string> filter)
    {
        if (filter == null)
        {
            filter = new Dictionary<string, string>();
        }

        // The indexer is used instead of Add: Add throws when the caller already passed
        // a "ModuleType" entry, and this search must always be limited to activities.
        filter["ModuleType"] = "活動";
    }
}
篩選條件加上模塊=活動,返回結果數據類指定,活動特有字段返回賦值。
業務調用就很是簡單了。
全站全文搜索:咱們直接new IndexSearch(),而後調用其Search()方法
活動全文搜索:咱們直接new ActivityIndexSearch(),而後調用其Search()方法
Search()方法幾個參數:
///<param name="content">搜索文本內容</param>
/// <param name="filter">查詢內容限制條件,默認爲null,不限制條件.</param>
/// <param name="fieldSorts">對字段進行排序</param>
/// <param name="pageIndex">查詢結果當前頁,默認爲1</param>
/// <param name="pageSize">查詢結果每頁結果數,默認爲20</param>
若是咱們用軟能力而不是用技術能力來區分程序員的好壞 – 是否是有那麼點反常和變態。
不少思路均來源於13年那次作全文搜索,跟當時的架構師學習的。
在此表示感謝。
原創文章,代碼都是從本身項目裏貼出來的。轉載請註明出處哦,親~~~