elasticsearch學習筆記高級篇（六）——在案例中如何通過手動控制全文檢索結果的精準度

時間 2019-12-08

標籤 elasticsearch 學習筆記高級案例若是經過手動控制全文檢索結果精準欄目日誌分析简体版

原文鏈接

準備數據：

POST /forum/_bulk
{ "index": { "_id": 1 }}
{ "articleID" : "XHDK-A-1293-#fJ3", "userID" : 1, "hidden": false, "postDate": "2017-01-01" }
{ "index": { "_id": 2 }}
{ "articleID" : "KDKE-B-9947-#kL5", "userID" : 1, "hidden": false, "postDate": "2017-01-02" }
{ "index": { "_id": 3 }}
{ "articleID" : "JODL-X-1937-#pV7", "userID" : 2, "hidden": false, "postDate": "2017-01-01" }
{ "index": { "_id": 4 }}
{ "articleID" : "QQPX-R-3956-#aD8", "userID" : 2, "hidden": true, "postDate": "2017-01-02" }

一、爲帖子數據增加標題字段

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"title" : "this is java and elasticsearch blog"} }
{ "update": { "_id": "2"} }
{ "doc" : {"title" : "this is java blog"} }
{ "update": { "_id": "3"} }
{ "doc" : {"title" : "this is elasticsearch blog"} }
{ "update": { "_id": "4"} }
{ "doc" : {"title" : "this is java, elasticsearch, hadoop blog"} }
{ "update": { "_id": "5"} }
{ "doc" : {"title" : "this is spark blog"} }

二、搜索標題中包含java或elasticsearch的blog

這個就跟以前的那個term filter/query不同了。不是搜索exact value，而是進行full text全文搜索。
match query是負責進行全文檢索的。當然，如果要檢索的field是not_analyzed類型的，那麼match query也相當於term query。

GET /forum/_search
{
  "query": {
    "match": {
      "title": "java elasticsearch"
    }
  }
}

{
  "took" : 1139,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 4,
      "relation" : "eq"
    },
    "max_score" : 0.97797304,
    "hits" : [
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.97797304,
        "_source" : {
          "articleID" : "XHDK-A-1293-#fJ3",
          "userID" : 1,
          "hidden" : false,
          "postDate" : "2017-01-01",
          "tag" : [
            "java",
            "hadoop"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 30,
          "title" : "this is java and elasticsearch blog"
        }
      },
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 0.97797304,
        "_source" : {
          "articleID" : "QQPX-R-3956-#aD8",
          "userID" : 2,
          "hidden" : true,
          "postDate" : "2017-01-02",
          "tag" : [
            "java",
            "elasticsearch"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 80,
          "title" : "this is java, elasticsearch, hadoop blog"
        }
      },
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 0.57843524,
        "_source" : {
          "articleID" : "KDKE-B-9947-#kL5",
          "userID" : 1,
          "hidden" : false,
          "postDate" : "2017-01-02",
          "tag" : [
            "java"
          ],
          "tag_cnt" : 1,
          "view_cnt" : 50,
          "title" : "this is java blog"
        }
      },
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 0.57843524,
        "_source" : {
          "articleID" : "JODL-X-1937-#pV7",
          "userID" : 2,
          "hidden" : false,
          "postDate" : "2017-01-01",
          "tag" : [
            "hadoop"
          ],
          "tag_cnt" : 1,
          "view_cnt" : 100,
          "title" : "this is elasticsearch blog"
        }
      }
    ]
  }
}

三、搜索標題中包含java和elasticsearch的blog

搜索結果精確控制的第一步就是靈活使用and關鍵字，如果你希望所有的搜索關鍵字都要匹配，那麼就用and，可以實現單純match query無法實現的效果。

GET /forum/_search
{
  "query": {
    "match": {
      "title": {
        "query": "java elasticsearch",
        "operator": "and"
      }
    }
  }
}

{
  "took" : 6,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 0.97797304,
    "hits" : [
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.97797304,
        "_source" : {
          "articleID" : "XHDK-A-1293-#fJ3",
          "userID" : 1,
          "hidden" : false,
          "postDate" : "2017-01-01",
          "tag" : [
            "java",
            "hadoop"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 30,
          "title" : "this is java and elasticsearch blog"
        }
      },
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 0.97797304,
        "_source" : {
          "articleID" : "QQPX-R-3956-#aD8",
          "userID" : 2,
          "hidden" : true,
          "postDate" : "2017-01-02",
          "tag" : [
            "java",
            "elasticsearch"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 80,
          "title" : "this is java, elasticsearch, hadoop blog"
        }
      }
    ]
  }
}

四、搜索包含java、elasticsearch、spark、hadoop，4個關鍵字中至少3個的blog

控制搜索結果的精確度的第二步就是指定一些關鍵字中，必須至少匹配其中的多少個關鍵字，才能作爲結果返回。

GET /forum/_search
{
  "query": {
    "match": {
      "title": {
        "query": "java elasticsearch spark hadoop",
        "minimum_should_match": 3
      }
    }
  }
}

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 2.2356422,
    "hits" : [
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 2.2356422,
        "_source" : {
          "articleID" : "QQPX-R-3956-#aD8",
          "userID" : 2,
          "hidden" : true,
          "postDate" : "2017-01-02",
          "tag" : [
            "java",
            "elasticsearch"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 80,
          "title" : "this is java, elasticsearch, hadoop blog"
        }
      }
    ]
  }
}

五、用bool組合多個搜索條件，來搜索title

GET /forum/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "title": "java"
          }
        }
      ],
      "must_not": [
        {
          "match": {
            "title": "spark"
          }
        }
      ],
      "should": [
        {
          "match": {
            "title": "hadoop"
          }
        },
        {
          "match": {
            "title": "elasticsearch"
          }
        }
      ]
    }
  }
}

{
  "took" : 12,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : 2.2356422,
    "hits" : [
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 2.2356422,
        "_source" : {
          "articleID" : "QQPX-R-3956-#aD8",
          "userID" : 2,
          "hidden" : true,
          "postDate" : "2017-01-02",
          "tag" : [
            "java",
            "elasticsearch"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 80,
          "title" : "this is java, elasticsearch, hadoop blog"
        }
      },
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.97797304,
        "_source" : {
          "articleID" : "XHDK-A-1293-#fJ3",
          "userID" : 1,
          "hidden" : false,
          "postDate" : "2017-01-01",
          "tag" : [
            "java",
            "hadoop"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 30,
          "title" : "this is java and elasticsearch blog"
        }
      },
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 0.57843524,
        "_source" : {
          "articleID" : "KDKE-B-9947-#kL5",
          "userID" : 1,
          "hidden" : false,
          "postDate" : "2017-01-02",
          "tag" : [
            "java"
          ],
          "tag_cnt" : 1,
          "view_cnt" : 50,
          "title" : "this is java blog"
        }
      }
    ]
  }
}

六、bool組合多個搜索條件，如何計算relevance score

must和should搜索對應的分數，加起來，除以must和should的總分數
因此排在第一位的是：包含java、hadoop、elasticsearch
排在第二位的是：包含java、elasticsearch
排在第三位的是：包含java

should是可以影響相關度分數的。

must確保文檔必須有這個關鍵字，同時會根據這個must的條件去計算出document對這個搜索條件的relevance score。在滿足must的基礎上，should中的條件，不匹配也是可以的，但是如果匹配的更多，那麼document的relevance score就會更高。

七、should實現搜索四個關鍵字中至少包含三個關鍵字

默認情況下，should是可以不匹配任何一個的，但是有一個例外的情況，就是如果沒有must的情況下，那麼should中必須至少匹配一個才可以。

GET /forum/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "title": "java"
          }
        },
        {
          "match": {
            "title": "elasticsearch"
          }
        },
        {
          "match": {
            "title": "hadoop"
          }
        },
        {
          "match": {
            "title": "spark"
          }
        }
      ],
      "minimum_should_match": 3
    }
  }
}

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 2.2356422,
    "hits" : [
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 2.2356422,
        "_source" : {
          "articleID" : "QQPX-R-3956-#aD8",
          "userID" : 2,
          "hidden" : true,
          "postDate" : "2017-01-02",
          "tag" : [
            "java",
            "elasticsearch"
          ],
          "tag_cnt" : 2,
          "view_cnt" : 80,
          "title" : "this is java, elasticsearch, hadoop blog"
        }
      }
    ]
  }
}