Elasticsearch's relevance score is based on the term frequency / inverse document frequency (TF/IDF) family of algorithms. Note that the explain output below actually shows the BM25 similarity, which has been the default since Elasticsearch 5.0 and is itself a refinement of TF/IDF.
To see how the score is computed, add explain to a search request:
GET /test_index/test_type/_search?explain
{
  "query": {
    "match": {
      "test_field": "test hello"
    }
  }
}
{ "took": 1, "timed_out": false, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 3, "max_score": 0.843298, "hits": [ { "_shard": "[test_index][2]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "6", "_score": 0.843298, "_source": { "test_field": "test test" }, "_explanation": { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.843298, "description": "score(doc=0,freq=2.0 = termFreq=2.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 4, "description": "docCount", "details": [] } ] }, { "value": 1.2166219, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 2, "description": "termFreq=2.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 1.75, "description": "avgFieldLength", "details": [] }, { "value": 2.56, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][1]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "8", "_score": 0.43445712, "_source": { "test_field": "test client 2" }, "_explanation": { "value": 0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.43445715, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.47000363, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 3, "description": "docCount", "details": [] } ] }, { "value": 0.92436975, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3.3333333, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][3]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "7", "_score": 
0.25316024, "_source": { "test_field": "test client 1" }, "_explanation": { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.25316024, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 0.88, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "*:*, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } } ] } }
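Taking the numbers from the first hit's _explanation above, the score can be reproduced by hand; the formulas are the ones printed in the explain output itself:

idf    = log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5))
       = ln(1 + (4 - 2 + 0.5) / (2 + 0.5)) = ln(2) ≈ 0.6931472
tfNorm = (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength))
       = (2 * 2.2) / (2 + 1.2 * (1 - 0.75 + 0.75 * 2.56 / 1.75)) ≈ 1.2166219
score  = idf * tfNorm ≈ 0.6931472 * 1.2166219 ≈ 0.843298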
When we search, we rely on the inverted index; but when we sort, we need the forward index. After the inverted index has located the matching documents, we need each document's field values in order to sort them, and this forward index is exactly what doc values are.
When ES builds an index, it builds the inverted index (used for search) and, at the same time, the forward index, i.e. doc values (used for sorting, aggregations, filtering, and so on).
doc values are stored on disk. If there is enough memory, the operating system automatically caches them in the filesystem cache, so performance stays high; if memory is tight, they are simply read from disk.
Here is an example that illustrates the forward index versus the inverted index.
Suppose an index contains two docs:
doc1 : hello world you and me
doc2 : hi world, how are you
The inverted index built from them:
word    doc1    doc2
hello   *
world   *       *
you     *       *
and     *
me      *
hi              *
how             *
are             *
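A full-text query is answered from this inverted index: searching for "world" looks up that term and finds both doc1 and doc2. A minimal sketch of such a query (the index name my_index and the field name content are assumptions for illustration only):

GET /my_index/_search
{
  "query": {
    "match": {
      "content": "world"
    }
  }
}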
Now suppose an index contains these two docs:
doc1: {"name": "jack", "age": 27} doc2: {"name": "tom", "age": 30}
The forward index (doc values) built from them:
document    name    age
doc1        jack    27
doc2        tom     30
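Sorting and aggregating read column values like these directly from doc values rather than from the inverted index. A hedged sketch of such a request, assuming the same hypothetical my_index with the name/age fields above:

GET /my_index/_search
{
  "query": { "match_all": {} },
  "sort": [ { "age": "desc" } ],
  "aggs": {
    "avg_age": { "avg": { "field": "age" } }
  }
}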
Next, the basic principle of how a paged search is executed:
ES's query phase builds the search results using a priority queue.
For example, suppose there are 60,000 documents in total, spread over three primary shards with 20,000 documents each, and each page holds 10 documents. To reach page 1,000 you actually need documents 10,001–10,010, so a priority queue of size 10,010 has to be built.
Note: do not assume that each shard simply returns 10 documents — that is wrong!
A more detailed analysis:
The request may first land on a node that does not hold any shard of this index; that node then acts as the coordinating node and forwards the search request to the nodes holding the index's three shards. In the scenario above, to get page 1,000 out of 60,000 documents, each shard has to produce the top 10,010 documents from its own 20,000 — not just 10, but 10,010. Each of the 3 shards sends its 10,010 entries back to the coordinating node, which therefore receives 30,030 entries in total, merges and re-sorts them by _score in a priority queue, and finally picks out entries 10,001–10,010 — the 10 documents that make up page 1,000.
[Figure: the coordinating node fans the request out to each shard and merges the per-shard top results]
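In request terms, page 1,000 of that search corresponds to from = 10000 and size = 10. A minimal sketch, reusing the test_index from earlier with a match_all query as a stand-in:

GET /test_index/test_type/_search
{
  "query": { "match_all": {} },
  "from": 10000,
  "size": 10
}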
Imagine two documents with identical values in a timestamp field, and the search results being sorted by that timestamp field. Because search requests are round-robined across all available shard copies, the two documents may come back in one order when the primary shard handles the request and in a different order when a replica shard handles it.
This is the bouncing results problem: every time the user refreshes the page, the results show up in a different order. It can be avoided by always letting the same shards serve the same user, which is done by setting the preference parameter to an arbitrary value specific to that user, such as the user's session ID.
The preference parameter lets you control which shards or nodes handle a search request. It accepts values such as _primary, _primary_first, _local, _only_node:xyz, _prefer_node:xyz, and _shards:2,3, which are explained in detail on the search preference documentation page.
The most useful value, however, is some arbitrary string such as a session ID, which is exactly what avoids the bouncing results problem.
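A minimal sketch of what that looks like, assuming a made-up session identifier user_123 and reusing the test_index from earlier:

GET /test_index/test_type/_search?preference=user_123
{
  "query": {
    "match": { "test_field": "test" }
  }
}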
In practice, from + size inevitably runs into this deep-paging bottleneck, and scroll is a good way to work around it. For example, if we need to pull 100,000 documents in one go, from + size would perform very badly because the priority queues would become huge. With a scroll search we can instead fetch the data batch by batch until everything has been read.
How scroll works
A scroll search takes a snapshot of the view of the data at the time of the first request and serves every later batch from that same snapshot; any changes made to the data in the meantime are not visible to the user. Internally, ES sorts by _doc (index order) rather than by relevance, which keeps performance high.
Example:
# start a scroll search, keeping the search context alive for 1 minute
POST /test_index/_search?scroll=1m
{
  "query": { "match_all": {} },
  "sort": [ "_doc" ],
  "size": 3
}
The response contains a _scroll_id:
{ "_scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAI-sFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACPqxYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAj68WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAI-tFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACPrhYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3", "took": 3, "timed_out": false, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 12, "max_score": null, "hits": [ { "_index": "test_index", "_type": "test_type", "_id": "AWypxxLYFCl_S-ox4wvd", "_score": null, "_source": { "test_content": "my test" }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "6", "_score": null, "_source": { "test_field": "test test" }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "7", "_score": null, "_source": { "test_field": "test client 1" }, "sort": [ 0 ] } ] } }
Fetching the next batch
# fetch the next batch, passing the _scroll_id from the previous response
POST _search/scroll
{
  "scroll": "1m",
  "scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAJDMFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQzRYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAkM8WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAJDOFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQ0BYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3"
}
The result:
{ "_scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAJDMFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQzRYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAkM8WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAJDOFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQ0BYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3", "took": 1, "timed_out": false, "terminated_early": true, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 12, "max_score": null, "hits": [ { "_index": "test_index", "_type": "test_type", "_id": "11", "_score": null, "_source": { "num": 0, "tags": [] }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "8", "_score": null, "_source": { "test_field": "test client 2" }, "sort": [ 1 ] }, { "_index": "test_index", "_type": "test_type", "_id": "4", "_score": null, "_source": { "test_field": "test4" }, "sort": [ 1 ] } ] } }