Elasticsearch打造全文搜索引擎(二)

1、ES的文檔、索引的CRUD操作

1. elasticsearch概念

  • 集羣:一個或多個節點組織在一起
  • 節點:一個節點是集羣中的一個服務器,由一個名字來標識,默認是一個隨機的漫畫角色的名字
  • 分片:將索引劃分爲多份的能力,允許水平分割和擴展容量,多個分片同時響應請求,提高性能和吞吐量。
  • 副本:創建分片的一份或多份拷貝的能力,在一個節點失敗時其餘節點可以頂上。
elasticsearch      mysql
index(索引)       數據庫
type(類型)        表
document(文檔)    行
fields(字段)      列

2. 常用屬性和類型

 

3.內置類型

4. CRUD操作

  • 索引的初始化操做
  • 指定分片和副本的數量
  • shards一旦設置不能修改
# Index initialization: create the `lagou` index.
# number_of_shards   = primary shard count (cannot be changed after creation)
# number_of_replicas = replica copies per primary shard (can be changed later)
# NOTE: comments are kept outside the JSON bodies; `#` inside a request body is
# not valid JSON and is rejected by older Console/Elasticsearch versions.
PUT lagou
{
  "settings": {
    "index": {
      "number_of_shards": 5,
      "number_of_replicas": 1
    }
  }
}

# Read settings: one index, all indices, a list of indices, or everything.
GET lagou/_settings
GET _all/_settings
GET .kibana,lagou/_settings
GET _settings

# Update settings (only the replica count is changed here).
PUT lagou/_settings
{
  "number_of_replicas": 2
}

# Get index information.
GET _all
GET lagou

# Create / save a document.
# Option 1: PUT with an explicit document id.
PUT lagou/job/1
{
  "title": "python爬蟲分佈式開發",
  "salary_min": 15000,
  "city": "北京",
  "company": {
    "name": "百度",
    "company_addr": "北京市軟件園"
  },
  "publish_date": "2019-06-15",
  "comments": 15
}

# Option 2: POST without an id; Elasticsearch generates one.
POST lagou/job/
{
  "title": "python django 開發工程師",
  "salary_min": 30000,
  "city": "上海",
  "company": {
    "name": "美團科技",
    "company_addr": "北京市軟件園A區"
  },
  "publish_date": "2019-06-15",
  "comments": 120
}

# Fetch a document, optionally restricting the returned _source fields.
GET lagou/job/1
GET lagou/job/1?_source=title
GET lagou/job/1?_source=title,city
GET lagou/job/1?_source

# Modify a document.
# Option 1: PUT the full document again (replaces the whole document).
PUT lagou/job/1
{
  "title": "python爬蟲分佈式開發",
  "salary_min": 18000,
  "city": "廣州",
  "company": {
    "name": "百度",
    "company_addr": "北京市軟件園"
  },
  "publish_date": "2019-06-15",
  "comments": 15
}

# Option 2: partial update of selected fields via _update.
POST lagou/job/1/_update
{
  "doc": {
    "comments": 20
  }
}

# Delete a single document.
DELETE lagou/job/1

# Delete every document of type `job`.
# `DELETE lagou/job` (deleting a mapping type) is not supported since
# Elasticsearch 2.0, so delete-by-query is used instead.
POST lagou/job/_delete_by_query
{
  "query": {
    "match_all": {}
  }
}

# Delete the whole index.
DELETE lagou

2、mget和bulk操作

# 批量操作

數據準備
POST lagou/job1/1
{
  "title": "python django 開發工程師",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美團科技",
    "company_addr":"北京市軟件園A區"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job1/2
{
  "title": "python django 開發工程師",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美團科技",
    "company_addr":"北京市軟件園A區"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job2/1
{
  "title": "python django 開發工程師",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美團科技",
    "company_addr":"北京市軟件園A區"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job2/2
{
  "title": "python django 開發工程師",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美團科技",
    "company_addr":"北京市軟件園A區"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

mget批量獲取
GET _mget
{
  "docs":[
      {"_index":"lagou",
       "_type":"job1",
       "_id":1
      },
      {"_index":"lagou",
       "_type":"job2",
       "_id":2
      }
    ]
}

GET lagou/_mget
{
  "docs":[
      {
       "_type":"job1",
       "_id":1
      },
      {
       "_type":"job2",
       "_id":2
      }
    ]
}

GET lagou/job1/_mget
{
  "docs":[
      {
       "_id":1
      },
      {
       "_id":2
      }
    ]
}

GET lagou/job1/_mget
{
  "ids":[1,2]
}

bulk增刪改查

POST _bulk
{"index":{"_index":"lagou","_type":"job1","_id":"3"}}
{"title": "python django 開發工程師","salary_min":30000,"city":"上海","company":{"name":"美團科技","company_addr":"北京市軟件園A區"},"publish_date":"2019-06-15","comments":120}
{"index":{"_index":"lagou","_type":"job2","_id":"3"}}
{"title": "python django 開發工程師","salary_min":30000,"city":"上海","company":{"name":"美團科技","company_addr":"北京市軟件園A區"},"publish_date":"2019-06-15","comments":120}

POST _bulk
{"create":{"_index":"lagou","_type":"job1","_id":"3"}}
{"title": "python django 開發工程師","salary_min":30000,"city":"上海","company":{"name":"美團科技","company_addr":"北京市軟件園A區"},"publish_date":"2019-06-15","comments":120}

POST _bulk
{"delete":{"_index":"lagou","_type":"job1","_id":"3"}}

POST _bulk
{"update":{"_index":"lagou","_type":"job1","_id":"3"}}
{"doc":{"title": "python django 開發工程師","salary_min":30000,"city":"上海","company":{"name":"美團科技","company_addr":"北京市軟件園A區"},"publish_date":"2019-06-15","comments":120}}

3、mapping映射和查詢

1. mapping映射

2.倒排索引

3. 倒排索引待解決的問題

4. 查詢

5. 操作

# mapping操作

# Create index `lagou1` with an explicit mapping for type `job`.
# `company` is an object field with its own nested properties;
# `publish_date` only accepts dates formatted as yyyy-MM-dd.
PUT lagou1
{
  "mappings": {
    "job": {
      "properties": {
        "title":      { "type": "text" },
        "salary_min": { "type": "integer" },
        "city":       { "type": "keyword" },
        "company": {
          "properties": {
            "name":           { "type": "text" },
            "company_addr":   { "type": "text" },
            "employee_count": { "type": "integer" }
          }
        },
        "publish_date": {
          "type": "date",
          "format": "yyyy-MM-dd"
        },
        "comments": { "type": "integer" }
      }
    }
  }
}

PUT lagou1/job/1
{
  "title": "python爬蟲分佈式開發",
  "salary_min":15000,
  "city":"北京",
  "company":{
    "name":"百度",
    "company_addr":"北京市軟件園",
    "employee_count":50
  },
  "publish_date":"2019-06-15",
  "comments":15
}

# get index mapping

GET lagou1/_mapping
GET lagou1/_mapping/job
GET _all/_mapping/job

# 查詢

PUT lagou2
{
  "mappings": {
    "job":{
      "properties":{
        "title":{
          "type": "text",
          "store":true,
          "analyzer": "ik_max_word"
        },
        "company_name": {
          "type": "keyword",
          "store":true
        },
        "desc":{
          "type":"text"
        }, 
        "add_time":{
          "type":"date",
          "format":"yyyy-MM-dd"
        },
        "comments":{
          "type": "integer"
        }
      }
    }
  }
}


POST lagou2/job
{
  "title":"python django 開發工程師" ,
  "company_name":"美國科技有限公司",
  "desc":"對django的概念熟悉,熟悉python基礎知識", 
  "comments":20,
  "add_time":"2017-04-01"  
}

POST lagou2/job
{
  "title":"python scrapy redis 分佈式爬蟲基本" ,
  "company_name":"百度科技有限公司",
  "desc":"對scrapy的概念熟悉,熟悉redis的基本操做",
  "comments":5,
  "add_time":"2017-04-15"  
} 

POST lagou2/job
{
  "title":"Elasticsearch打造搜索引擎" ,
  "company_name":"阿里巴巴科技有限公司",
  "desc":"熟悉數據結構算法,熟悉python的基本開發",
  "comments":15,
  "add_time":"2017-06-20"  
} 

POST lagou2/job
{
  "title":"python打造推薦引擎系統" ,
  "company_name":"阿里巴巴科技有限公司",
  "desc":"熟悉推薦引擎的原理以及算法、掌握C語言",
  "comments":60,
  "add_time":"2016-10-20"  
} 

# 簡單查詢
#查看分析器解析的結果
GET _analyze
{
  "analyzer": "ik_smart",
  "text":"Python網絡開發師"
}
GET _analyze
{
  "analyzer": "ik_max_word",
  "text":"Python網絡開發師"
}

#match查詢(分詞查詢):查詢詞會被分詞爲 python 和 分佈式
#從第0條開始取2條結果,請求title、company_name和desc字段(desc字段的store屬性不是true,因此不會返回),並按comments降序排序
GET lagou2/_search
{
 "stored_fields":["title","company_name","desc"], 
  "query":{
    "match":{
      "title":"python分佈式"  
    }
  },
  "from": 0,
  "size": 2,
  "sort": [
    {
      "comments": {
        "order": "desc"
      }
    }
  ]
}

#查詢comments大於等於10、小於等於20的數據,權重(boost)爲2.0
GET lagou2/_search
{
  "query":{  
    "range": {
      "comments": {
        "gte": 10,
        "lte": 20,
        "boost":2.0
      }
    }
  }
}
GET lagou2/_search
{
  "query":{  
    "range": {
      "add_time": {
        "gte": "2017-04-01",
        "lte": "now"
      }
    }
  }
}

#term查詢(不會對查詢詞做分詞處理,直接精確匹配,類似於keyword類型)
GET lagou2/_search
{
  "query":{
    "term":{
      "title":"python"  
    }
  }
}
#terms 和用match查"django分佈工程"效果一樣
GET lagou2/_search
{
  "query":{
    "terms":{
      "title":["django"  ,"分佈"  ,"工程"  ]
    }
  }
}

#match_all
GET lagou2/_search
{
  "query":{
    "match_all":{}
  }
}
 
#match_phrase 
#短語查詢
#需滿足所有詞:既有python也有系統,兩個詞之間的間距最大爲6(slop)
GET lagou2/_search
{
  "query":{
    "match_phrase": {
      "title": {
        "query": "python系統",
        "slop":6
      }
    }
  }
}

#multi_match 多字段匹配,title的權重是desc的3倍
GET lagou2/_search
{
  "query":{
    "multi_match": { 
      "query": "python系統",
      "fields":["title^3","desc"]
    }
  }
}

# sort查詢
GET lagou2/_search
{
  "query": {
    "match_all": {}
  },
  "sort": [
    {
      "comments": {
        "order": "asc"
      }
    }
  ]
}

# range範圍查詢
GET lagou2/_search
{
  "query": { 
      "range": {
      "comments": {
        "gte": 20,
        "lte": 60,
        "boost":2.0
      }
    }
  }
}

# Range query on the date field `add_time`:
# from 2017-06-07 (inclusive) up to the current time ("now").
GET lagou2/_search
{
  "query": {
    "range": {
      "add_time": { "gte": "2017-06-07", "lte": "now" }
    }
  }
}

#wildcard 通配符查詢
GET lagou2/_search
{
  "query":{  
    "wildcard": {
      "title": {
        "value": "pyth*n",
        "boost": 2
      }
    }
  }
}

# 組合查詢
#bool 查詢
#用 bool 包括 must should must_not filter來完成
#格式如下
#bool:{
#  "filter":[], #不參與打分
#  "must":[],  #相當於        (salary=20 and title=Python)
#  "should":[], #相當於       (salary=20 or title=Python)
#  "must_not":[], #相當於not
#}

# Build the test data used by the bool-query examples on lagou/testjob.
POST lagou/testjob/_bulk
{"index":{"_id":1}}
{"salary":10,"title":"Python"}
{"index":{"_id":2}}
{"salary":20,"title":"Scrapy"}
{"index":{"_id":3}}
{"salary":30,"title":"Django"}
{"index":{"_id":4}}
{"salary":30,"title":"Elasticsearch"}

# Cleanup — run ONLY after you have finished the testjob queries; running it
# earlier removes the test data and every query returns zero hits.
# `DELETE lagou/testjob` (deleting a mapping type) is not supported since
# Elasticsearch 2.0, so delete-by-query is used. Uncomment to execute:
# POST lagou/testjob/_delete_by_query
# { "query": { "match_all": {} } }

#簡單的過濾查詢
#最簡單的filter查詢
#select * from testjob where salary=20 and title='Scrapy'
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "must": {
        "match":{
          "salary":20
        }
      }, 
      "filter":{ 
        "match":{
          "title":"Scrapy"
        }
      }
    }
  }
}
#select * from testjob
#where (salary=20 or title=Python) and salary!=30 and salary!=10
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "should":[
          {"term":{"salary":20}},
          {"term":{"title":"python"}}
        ],
      "must_not": [
        {"term": {"salary": "30"}},
        {"term": {"salary": "10"}}
      ] 
    }
  }
}

#where (salary=30 and title="django") or title="python"
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "should":[
          {"term":{"title":"python"}},
          {"bool": { 
            "must":[
                {"term":{"salary":30}},
                {"term":{"title":"django"}}
              ] 
          }}
        ] 
    }
  }
}

#測試數據
POST lagou/testjob2/_bulk
{"index":{"_id":1}}
{"tags":["search"]}
{"index":{"_id":2}}
{"tags":["search","python"]}
{"index":{"_id":3}}
{"other_filed":["some data"]}
{"index":{"_id":4}}
{"tags":null}
{"index":{"_id":5}}
{"tags":["search",null]}

#處理null空值的方法
#select tags from testjob2 where tags is not null
GET lagou/testjob2/_search
{
  "query": {
    "bool": {
      "filter": {
        "exists": {
          "field": "tags"
        }
      }
    }
  }
}
#select tags from testjob2 where tags is null
GET lagou/testjob2/_search
{
  "query": {
    "bool": {
      "must_not": {
        "exists": {
          "field": "tags"
        }
      }
    }
  }
} 

 

gitee地址https://gitee.com/zhangyafeii/ArticleSpider_LcvSearchpython

相關文章
相關標籤/搜索