Elasticsearch 內置的分詞器對中文不友好,會把中文分成單個字來進行全文檢索,不能達到想要的結果。
IK Analysis for Elasticsearch:https://github.com/medcl/elasticsearch-analysis-ik
ik 帶有兩個分詞器:ik_max_word(最細粒度切分)和 ik_smart(最粗粒度切分)。
建立一個名叫 iktest 的索引,設置它的分析器用 ik,分詞器用 ik_max_word,並建立一個 article 的類型,裏面有一個 subject 的字段,指定其使用 ik_max_word 分詞器。
[root@k8s-0001 bin]# curl -H "Content-Type: application/json" -XPUT 'http://114.116.97.49:9200/iktest?pretty' -d '{ > "settings" : { > "analysis" : { > "analyzer" : { > "ik" : { > "tokenizer" : "ik_max_word" > } > } > } > }, > "mappings" : { > "article" : { > "dynamic" : true, > "properties" : { > "subject" : { > "type" : "text", > "analyzer" : "ik_max_word" > } > } > } > } > }' { "acknowledged" : true, "shards_acknowledged" : true, "index" : "iktest" }
批量添加幾條數據,這裏我指定元數據 _id 方便查看,subject 內容爲我隨便找的幾條新聞的標題。
[root@k8s-0001 bin]# curl -H "Content-Type: application/json" -XPOST http://114.116.97.49:9200/iktest/article/_bulk?pretty -d ' > { "index" : { "_id" : "1" } } > {"subject" : ""閨蜜"崔順實被韓檢方傳喚 韓總統府促徹查真相" } > { "index" : { "_id" : "2" } } > {"subject" : "韓舉行"護國訓練" 青瓦臺:決不準國家安全出問題" } > { "index" : { "_id" : "3" } } > {"subject" : "媒體稱FBI已經取得搜查令 檢視希拉里電郵" } > { "index" : { "_id" : "4" } } > {"subject" : "村上春樹獲安徒生獎 演講中談及歐洲排外問題" } > { "index" : { "_id" : "5" } } > {"subject" : "希拉里團隊炮轟FBI 參院民主黨領袖批其「違法」" } > ' { "took" : 10, "errors" : false, "items" : [ { "index" : { "_index" : "iktest", "_type" : "article", "_id" : "1", "_version" : 1, "result" : "created", "_shards" : { "total" : 2, "successful" : 1, "failed" : 0 }, "_seq_no" : 0, "_primary_term" : 1, "status" : 201 } }, { "index" : { "_index" : "iktest", "_type" : "article", "_id" : "2", "_version" : 1, "result" : "created", "_shards" : { "total" : 2, "successful" : 1, "failed" : 0 }, "_seq_no" : 0, "_primary_term" : 1, "status" : 201 } }, { "index" : { "_index" : "iktest", "_type" : "article", "_id" : "3", "_version" : 1, "result" : "created", "_shards" : { "total" : 2, "successful" : 1, "failed" : 0 }, "_seq_no" : 0, "_primary_term" : 1, "status" : 201 } }, { "index" : { "_index" : "iktest", "_type" : "article", "_id" : "4", "_version" : 1, "result" : "created", "_shards" : { "total" : 2, "successful" : 1, "failed" : 0 }, "_seq_no" : 1, "_primary_term" : 1, "status" : 201 } }, { "index" : { "_index" : "iktest", "_type" : "article", "_id" : "5", "_version" : 1, "result" : "created", "_shards" : { "total" : 2, "successful" : 1, "failed" : 0 }, "_seq_no" : 0, "_primary_term" : 1, "status" : 201 } } ] }
查詢「希拉里和韓國」:
[root@k8s-0001 bin]# curl -H "Content-Type: application/json" -XPOST http://114.116.97.49:9200/iktest/article/_search?pretty -d' > { > "query" : { "match" : { "subject" : "希拉里和韓國" }}, > "highlight" : { > "pre_tags" : ["<font color='red'>"], > "post_tags" : ["</font>"], > "fields" : { > "subject" : {} > } > } > } > ' { "took" : 5, "timed_out" : false, "_shards" : { "total" : 5, "successful" : 5, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : 2, "max_score" : 0.2876821, "hits" : [ { "_index" : "iktest", "_type" : "article", "_id" : "5", "_score" : 0.2876821, "_source" : { "subject" : "希拉里團隊炮轟FBI 參院民主黨領袖批其「違法」" }, "highlight" : { "subject" : [ "<font color=red>希拉里</font>團隊炮轟FBI 參院民主黨領袖批其「違法」" ] } }, { "_index" : "iktest", "_type" : "article", "_id" : "3", "_score" : 0.2876821, "_source" : { "subject" : "媒體稱FBI已經取得搜查令 檢視希拉里電郵" }, "highlight" : { "subject" : [ "媒體稱FBI已經取得搜查令 檢視<font color=red>希拉里</font>電郵" ] } } ] } }
網絡詞語日新月異,如何讓新出的網絡熱詞(或特定的詞語)實時地更新到咱們的搜索當中呢?
先用 ik 測試一下:
Elasticsearch 中文分詞器 IK 配置和使用