基於elasticsearch 7.1 和python 3.6的簡易檢索系統實現

環境配置

elasticsearch 7.1的安裝

詳見官網:elasticsearch 下載及安裝教程

python 3.6及elasticsearch包的安裝

  1. python的安裝詳見:python 官網下載地址
  2. 確認python安裝無誤後,使用pip install elasticsearch安裝,注意版本的對應(詳見elasticsearch python手冊)
  3. ik分詞器的安裝:從elasticsearch-analysis-ik Github下載地址中下載對應版本的編譯後安裝包(自行編譯太麻煩),解壓後重命名爲analysis-ik,並將其複製到elasticsearch-7.1.1\plugins目錄下

索引建立

使用Elasticsearch.indices.create()建立名爲indexing_test的索引

from elasticsearch import Elasticsearch

es = Elasticsearch()
index = 'indexing_test'

# Field definition shared by every searchable text field: plain text analyzed
# with the IK plugin's "ik_max_word" analyzer.
# The legacy options `"index": "analyzed"` and `"include_in_all"` are NOT
# included: Elasticsearch 7.x rejects them (for `text` fields `index` must be
# a boolean, and `include_in_all` was removed together with the `_all` field),
# which made the whole create request fail with a 400 that was then ignored.
ik_text_field = {
    "type": "text",
    "analyzer": "ik_max_word",
}

# Custom index definition (settings + mappings). This matters: without it the
# fields are mapped dynamically and analyzed with the standard analyzer.
mappings = {
    "settings": {
        "index": {
            "number_of_shards": 5,
            "number_of_replicas": 0,
        },
        "analysis": {
            "analyzer": {
                # Named analyzer built on the IK "ik_max_word" tokenizer.
                "ik": {
                    "tokenizer": "ik_max_word",
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # Fields declared by the original mapping.
            "sub": dict(ik_text_field),
            "verb": dict(ik_text_field),
            "obj": dict(ik_text_field),
            # Fields that the upload and search snippets actually use; they
            # need the same analyzer for IK segmentation to apply to queries.
            "標題": dict(ik_text_field),
            "摘要": dict(ik_text_field),
            "關鍵詞": dict(ik_text_field),
            "標引詞": dict(ik_text_field),
        },
    },
}

# Create the index named indexing_test. 400/404 responses are still tolerated
# (e.g. the index already exists), but the reason is reported instead of being
# silently dropped so that an invalid mapping does not go unnoticed.
response = es.indices.create(index=index, ignore=[400, 404], body=mappings)
if "error" in response:
    print("Index '%s' was not created: %s" % (index, response["error"]))
複製代碼

數據上傳

本文用的數據爲csv格式。使用helpers.bulk()批量上傳數據

from elasticsearch import Elasticsearch
from elasticsearch import helpers
import csv

es = Elasticsearch()

# Target index. It must be defined in this script (the original referenced an
# undefined `index`) and must match the index that the search code queries.
index = 'indexing_test'

# Read the CSV file. The context manager guarantees the handle is closed, and
# newline='' is what the csv module expects so quoted line breaks survive.
with open('data/標引.csv', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)

    # Lazily build one bulk action per CSV row; columns are, in order:
    # title, abstract, keywords, indexing terms.
    action = ({
        "_index": index,
        "_source": {
            "標題": row[0], "摘要": row[1], "關鍵詞": row[2], "標引詞": row[3]
        }} for row in csv_reader)

    # Bulk-import the data. The generator is consumed here, so this call has
    # to stay inside the `with` block while the file is still open.
    helpers.bulk(es, action, index=index, raise_on_error=True)
複製代碼

檢索實現

在特定字段中匹配輸入的詞並返回檢索結果。

from elasticsearch import Elasticsearch
from elasticsearch import helpers
import sys
 

def get_results(word, size=30):
    """Run a full-text search and an indexing-term search for *word*.

    Two queries are sent to the ``indexing_test`` index:

    * a ``multi_match`` over the title, abstract and keyword fields, and
    * a ``match`` on the indexing-term field only.

    Both requests ask for highlighting on the fields they search.

    Args:
        word: The text to search for.
        size: Maximum number of hits returned by each query. Defaults to 30,
            the value that was previously sent as the request parameter.

    Returns:
        A ``(res_left, res_right)`` tuple holding the raw search responses of
        the full-text query and the indexing-term query, respectively.
    """
    es = Elasticsearch()
    index = 'indexing_test'

    # NOTE: the request bodies used to carry "size": 100 as well, while the
    # call passed size=30 as a request parameter. The request parameter is the
    # one that takes effect, so the conflicting body value was dropped.
    full_text_fields = ["標題", "摘要", "關鍵詞"]
    query = {
        "query": {
            "multi_match": {
                "query": word,
                "fields": full_text_fields,
            }
        },
        "highlight": {
            "fields": {field: {} for field in full_text_fields}
        },
    }
    query1 = {
        "query": {
            "match": {
                "標引詞": word
            }
        },
        "highlight": {
            "fields": {
                "標引詞": {}
            }
        },
    }

    res_left = es.search(index=index, body=query, size=size)
    res_right = es.search(index=index, body=query1, size=size)
    return res_left, res_right
    
    
if __name__ == "__main__":
    # This module defines no `main()`; calling it raised NameError when the
    # file was run directly. Offer a minimal command-line query instead:
    # the first argument is the search text, and the total hit counts of the
    # full-text and indexing-term queries are printed.
    if len(sys.argv) < 2:
        sys.exit("usage: python query_scroll_scan.py <keyword>")
    res_left, res_right = get_results(sys.argv[1])
    print(res_left['hits']['total']['value'], res_right['hits']['total']['value'])
複製代碼

檢索界面

界面如圖所示:

檢索界面
代碼:

from PyQt5 import QtCore,QtGui,QtWidgets
import sys
from query_scroll_scan import get_results

class MainUi(QtWidgets.QMainWindow):
    """Search window with one input row and two side-by-side result panes.

    The left pane shows the full-text search result and the right pane shows
    the indexing-term search result, both obtained from ``get_results``.
    """

    def __init__(self):
        super().__init__()
        self.init_ui()

    def init_ui(self):
        """Create all widgets, lay them out and connect the search signals."""
        self.resize(960, 700)
        self.setWindowTitle('檢索測試')
        self.main_widget = QtWidgets.QWidget()  # central widget of the window
        self.main_layout = QtWidgets.QGridLayout()  # grid layout of the central widget
        self.main_widget.setLayout(self.main_layout)

        self.right_bar_widget_search_input = QtWidgets.QLineEdit()
        self.right_bar_widget_search_input.setPlaceholderText("輸入關鍵詞,點擊按鈕/回車進行搜索")
        self.search_button = QtWidgets.QPushButton("搜索")
        # Both clicking the button and pressing Enter start a search.
        self.search_button.clicked.connect(self.get_words)
        self.right_bar_widget_search_input.returnPressed.connect(self.get_words)

        self.up_widget = QtWidgets.QWidget()  # container holding every visible control
        self.up_widget.setObjectName('up_widget')
        self.up_layout = QtWidgets.QGridLayout()  # grid layout of the container
        self.up_widget.setLayout(self.up_layout)

        # Row 0: search input and button.
        self.up_layout.addWidget(self.right_bar_widget_search_input, 0, 0)
        self.up_layout.addWidget(self.search_button, 0, 1)

        # Row 1: captions of the two result panes.
        self.left_label = QtWidgets.QLabel("全文檢索結果")
        self.right_label = QtWidgets.QLabel("主題標引後檢索結果")
        self.up_layout.addWidget(self.left_label, 1, 0)
        self.up_layout.addWidget(self.right_label, 1, 1)

        # Row 3: the result panes themselves.
        self.left_text = QtWidgets.QTextEdit()
        self.up_layout.addWidget(self.left_text, 3, 0)
        self.right_text = QtWidgets.QTextEdit()
        self.up_layout.addWidget(self.right_text, 3, 1)

        self.main_layout.addWidget(self.up_widget, 0, 0, 1, 1)

        self.setCentralWidget(self.main_widget)

    def get_words(self):
        """Search for the text in the input box and refresh both result panes."""
        words = self.right_bar_widget_search_input.text()
        res_left, res_right = get_results(words)
        # The original re-created `self.left_label` here on every search; the
        # new label was never added to a layout, so that statement was removed.
        self._show_results(self.left_text, res_left)
        self._show_results(self.right_text, res_right)

    def _show_results(self, text_edit, response):
        """Replace the content of *text_edit* with the hits of *response*.

        Args:
            text_edit: The ``QTextEdit`` pane to fill.
            response: A search response as returned by ``get_results``.
        """
        total = response['hits']['total']['value']
        text_edit.setText("")
        text_edit.append("<font size='3'>共檢索到<em> " + str(total) + "</em> 條結果<br/></font>")
        for hit in response['hits']['hits']:
            text_edit.append(self._format_hit(hit))

    @staticmethod
    def _format_hit(hit):
        """Return the rich-text snippet that displays a single search hit."""
        # Imported locally so this block does not depend on a file-level import.
        import html

        source = hit["_source"]

        def field_text(name):
            # Escape the stored text: it is inserted into rich text, so a raw
            # '<' or '&' in the data would otherwise be parsed as markup.
            return html.escape(str(source.get(name, "")))

        return (
            "<div>"
            "<font color='red' size='3'>標題:" + field_text("標題") + "<br/></font>"
            "<font size='3'>摘要:" + field_text("摘要") + "<br/></font>"
            "<font size='3'>關鍵詞:" + field_text("關鍵詞") + "<br/></font>"
            "<font size='3'>標引詞:" + field_text("標引詞") + "<br/></font>"
            "<font color='black' size='3'>相關性:" + str(hit["_score"]) + "<br/></font>"
            "</div>"
        )
       
def main():
    """Start the Qt application, show the search window and run the event loop.

    The process exits with the status returned by the event loop.
    """
    application = QtWidgets.QApplication(sys.argv)
    window = MainUi()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)


if __name__ == '__main__':
    main()
複製代碼
相關文章
相關標籤/搜索