Elasticsearch源碼分析十三--高亮顯示highlight

  • 簡介
  • 查詢語法
  • 源碼分析

簡介

高亮顯示是在結果文檔中顯示查詢中的哪一個或哪些單詞被匹配的過程。
Elasticsearch底層使用Apache Lucene。 Lucene提供了三種類型的高亮實現:
第一種是標準類型(本文例子);第二種叫FastVectorHighlighter,
它需要詞向量和位置才能工作;第三種叫PostingsHighlighter。
Elasticsearch自動選擇正確的高亮實現方式:若是字段的配置中,
term_vector屬性設成了with_positions_offsets,則將使用FastVectorHighlighter。
使用詞向量將致使索引變大,但高亮顯示的執行須要更少的時間。此外,
對於存儲了大量數據的字段來講,推薦使用FastVectorHighlighter。

查詢語法

例如:高亮顯示在title字段中匹配的單詞,注意highlight部分和query部分位於JSON中的同一層,
也能夠看作第一層。在Elasticsearch代碼中,位於JSON第一層的query、highlight等叫作Element。

{
    "query" : {
       "term" : {
            "title" : "crime"
        }
    },
    "highlight" : {
        "pre_tags" : [ "<b>" ],
        "post_tags" : [ "</b>" ],
        "fields" : {
            "title" : {}
        }
    }
}

該查詢的結果如下,結果中除標準返回信息外,還有一個highlight部分,
該部分使用<b>這個HTML標籤來包含高亮部分,高亮標籤由pre_tags和post_tags屬性指定,
默認使用<em>標籤。

{
    "took" : 2,
    "timed_out" : false,
    "_shards" : {
        "total" : 5,
        "successful" : 5,
        "failed" : 0
    },
    "hits" : {
        "total" : 1,
        "max_score" : 0.19178301,
        "hits" : [ {
            "_index" : "library",
            "_type" : "book",
            "_id" : "4",
            "_score" : 0.19178301,
            "_source" : {
                "title" : "Crime and Punishment",
                "characters" : [ "Raskolnikov" ],
                "tags" : [ ],
                "copies" : 0,
                "available" : true
            },
            "highlight" : {
                "title" : [ "<b>Crime</b> and Punishment" ]
            }
        } ]
    }
}

源碼分析

'''(1)Elasticsearch code:在SearchService中註冊全部元素(包括fetchPhase中元素)的解析方法'''
/**
 * Excerpt of Elasticsearch's {@code SearchService}, shown to illustrate how the
 * constructor builds the registry that maps a top-level search-request element
 * name to the {@code SearchParseElement} that parses it.
 *
 * <p>This is an abridged quotation: several fields and helper types used below
 * (for example {@code threadPool}, {@code keepAliveReaper}, {@code indicesWarmer},
 * {@code Reaper}) are not declared in the visible excerpt.
 */
public class SearchService extends AbstractLifecycleComponent<SearchService> {

    /** Immutable lookup table: element name -> parser for that element. */
    private final ImmutableMap<String, SearchParseElement> elementParsers;

    /**
     * Wires up the service and registers the parsers for all search elements.
     *
     * <p>Parsers are gathered from the dfs, query and fetch phases (in that order),
     * then a {@code "stats"} parser is added, and the result is frozen into
     * {@link #elementParsers}.
     */
    public SearchService(Settings settings, ClusterService clusterService, IndicesService indicesService, IndicesLifecycle indicesLifecycle, IndicesWarmer indicesWarmer, ThreadPool threadPool,
                         ScriptService scriptService, CacheRecycler cacheRecycler, DfsPhase dfsPhase, QueryPhase queryPhase, FetchPhase fetchPhase) {
        super(settings);
        this.threadPool = threadPool;
        // The two article annotations below read: "omitted..." and
        // "all element parsers are registered here".
        '''省略....'''
        '''在此註冊全部元素的解析方法'''
        // Collect into a mutable HashMap first; with putAll/put a later entry
        // replaces an earlier one that has the same key.
        Map<String, SearchParseElement> elementParsers = new HashMap<String, SearchParseElement>();
        elementParsers.putAll(dfsPhase.parseElements());
        elementParsers.putAll(queryPhase.parseElements());
        // The fetch phase contributes the "highlight" element discussed in this article.
        elementParsers.putAll(fetchPhase.parseElements());
        elementParsers.put("stats", new StatsGroupsParseElement());
        // Freeze the registry so it cannot change after construction.
        this.elementParsers = ImmutableMap.copyOf(elementParsers);
        indicesLifecycle.addListener(indicesLifecycleListener);

        // Schedules a Reaper task at a fixed delay of keepAliveInterval
        // (both are declared in the omitted part of the class).
        this.keepAliveReaper = threadPool.scheduleWithFixedDelay(new Reaper(), keepAliveInterval);

        this.indicesWarmer.addListener(new SearchWarmer());
    }

}
'''(2)Elasticsearch code:在FetchPhase中註冊highlight的解析方法'''
/**
 * Excerpt of Elasticsearch's {@code FetchPhase}, shown to illustrate how the
 * parse elements of its sub-phases (including highlighting) are exposed.
 */
public class FetchPhase implements SearchPhase {

    /** Sub-phases of the fetch phase, in the order set by the constructor. */
    private final FetchSubPhase[] fetchSubPhases;

    /**
     * Stores the injected sub-phases in a fixed order: script fields, partial
     * fields, matched filters, explain, highlight, version.
     *
     * <p>The article annotation below reads: "HighlightPhase: highlighting".
     */
    @Inject
    '''HighlightPhase高亮顯示'''
    public FetchPhase(HighlightPhase highlightPhase, ScriptFieldsFetchSubPhase scriptFieldsPhase, PartialFieldsFetchSubPhase partialFieldsPhase,
                      MatchedFiltersFetchSubPhase matchFiltersPhase, ExplainFetchSubPhase explainPhase, VersionFetchSubPhase versionPhase) {
        this.fetchSubPhases = new FetchSubPhase[]{scriptFieldsPhase, partialFieldsPhase, matchFiltersPhase, explainPhase, highlightPhase, versionPhase};
    }

    /**
     * Returns the element parsers contributed by the fetch phase: the
     * {@code "fields"} parser plus every parser declared by each sub-phase.
     *
     * @return an immutable map from element name to its parser
     */
    @Override
    public Map<String, ? extends SearchParseElement> parseElements() {
        ImmutableMap.Builder<String, SearchParseElement> parseElements = ImmutableMap.builder();
        parseElements.put("fields", new FieldsParseElement());
        // Merge in whatever each sub-phase registers (e.g. "highlight" from HighlightPhase).
        // NOTE(review): Guava's ImmutableMap.Builder rejects duplicate keys at build(),
        // so sub-phases are expected to use distinct element names.
        for (FetchSubPhase fetchSubPhase : fetchSubPhases) {
            parseElements.putAll(fetchSubPhase.parseElements());
        }
        return parseElements.build();
    }
}
'''(3)Elasticsearch code:在HighlightPhase中註冊highlight的解析實例HighlighterParseElement'''
/**
 * Excerpt of Elasticsearch's {@code HighlightPhase}, the fetch sub-phase for
 * highlighting. Only the parser registration is shown here.
 */
public class HighlightPhase extends AbstractComponent implements FetchSubPhase {

    /**
     * Registers the {@code "highlight"} element name with a new
     * {@code HighlighterParseElement}.
     *
     * @return a single-entry immutable map: {@code "highlight"} -> parser instance
     */
    @Override
    public Map<String, ? extends SearchParseElement> parseElements() {
        return ImmutableMap.of("highlight", new HighlighterParseElement());
    }
} 

'''(4)Elasticsearch code:HighlighterParseElement解析highlight元素的具體實現'''
/**
 * <pre>
 * highlight : {
 *  tags_schema : "styled",
 *  pre_tags : ["tag1", "tag2"],
 *  post_tags : ["tag1", "tag2"],
 *  order : "score",
 *  highlight_filter : true,
 *  fields : {
 *      field1 : {  },
 *      field2 : { fragment_size : 100, number_of_fragments : 2 },
 *      field3 : { number_of_fragments : 5, order : "simple", tags_schema : "styled" },
 *      field4 : { number_of_fragments: 0, pre_tags : ["openingTagA", "openingTagB"], post_tags : ["closingTag"] }
 *  }
 * }
 * </pre>
 */
public class HighlighterParseElement implements SearchParseElement {

        '''默認高亮顯示的HTML標籤'''
    private static final String[] DEFAULT_PRE_TAGS = new String[]{"<em>"};
    private static final String[] DEFAULT_POST_TAGS = new String[]{"</em>"};

    private static final String[] STYLED_PRE_TAG = {
            "<em class=\"hlt1\">", "<em class=\"hlt2\">", "<em class=\"hlt3\">",
            "<em class=\"hlt4\">", "<em class=\"hlt5\">", "<em class=\"hlt6\">",
            "<em class=\"hlt7\">", "<em class=\"hlt8\">", "<em class=\"hlt9\">",
            "<em class=\"hlt10\">"
    };
    private static final String[] STYLED_POST_TAGS = {"</em>"};

    @Override
    public void parse(XContentParser parser, SearchContext context) throws Exception {
        XContentParser.Token token;
        String topLevelFieldName = null;
        List<SearchContextHighlight.Field> fields = newArrayList();

        String[] globalPreTags = DEFAULT_PRE_TAGS;
        String[] globalPostTags = DEFAULT_POST_TAGS;
        ......
        String globalHighlighterType = null;
        String globalFragmenter = null;
        Map<String, Object> globalOptions = null;

        '''此處的parser是JsonXContentParser實例'''
        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
                topLevelFieldName = parser.currentName();
            } else if (token == XContentParser.Token.START_ARRAY) {
                if ("pre_tags".equals(topLevelFieldName) || "preTags".equals(topLevelFieldName)) {
                    List<String> preTagsList = Lists.newArrayList();
                    while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
                        preTagsList.add(parser.text());
                    }
                    globalPreTags = preTagsList.toArray(new String[preTagsList.size()]);
                } else if ("post_tags".equals(topLevelFieldName) || "postTags".equals(topLevelFieldName)) {
                    List<String> postTagsList = Lists.newArrayList();
                    while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
                        postTagsList.add(parser.text());
                    }
                    globalPostTags = postTagsList.toArray(new String[postTagsList.size()]);
                }
            } else if (token.isValue()) {
                if ("order".equals(topLevelFieldName)) {
                    globalScoreOrdered = "score".equals(parser.text());
                } else if ("tags_schema".equals(topLevelFieldName) || "tagsSchema".equals(topLevelFieldName)) {
                    String schema = parser.text();
                    if ("styled".equals(schema)) {
                        globalPreTags = STYLED_PRE_TAG;
                        globalPostTags = STYLED_POST_TAGS;
                    }
          '''省略.....'''