將我本身的知識庫文件加載到Elasticsearch中。

本地有不少文本文件,都是一些知識點,現在將它們批量載入 ES 中。

代碼有點亂,尚未整理,但能用。

這裏要注意的是 str 的 decode 方法,需要加上參數 'ignore',因為在實際使用中會發生轉換異常,原因我也沒有想太明白。加上 ignore 會跳過這些錯誤,沒有太大影響(出問題的是一些符號字符,而不是英文、中文,也可能有其他情況我沒有碰到)。

#-*-coding:UTF-8-*-

__author__='zhaoxp'

import elasticsearch
import json
import os
import os.path

from datetime import datetime

'''create my knowledgebase(kb)

'''

# Local directory that load_os_knowledge_to_kb() scans for '.txt' files.
KB_OS_PATH='E:\\study\\my_knowledgebase\\Unix&Linux&AIX&Windows'

def create_kb_index_type(es, index, doc_type):
    '''Index an empty document under ``index``/``doc_type``.

    Presumably used so that Elasticsearch creates the index and the type
    implicitly -- confirm against the cluster's auto-create settings.

    The response of ``es.index`` is printed as indented JSON and then
    returned unchanged.
    '''
    response = es.index(body={}, doc_type=doc_type, index=index)
    print('create index&type result ')
    pretty = json.dumps(response, indent=2)
    print(pretty)
    return response


def create_index(es, index):
    '''Create the index named ``index`` and return the server response.

    The response of ``es.indices.create`` is printed as JSON and returned,
    in line with ``del_index`` and ``get_index`` which also hand their
    response back to the caller.

    es    -- client object exposing ``indices.create(index=...)``
    index -- name of the index to create

    Any exception raised by the client propagates to the caller.
    '''
    response = es.indices.create(index=index)
    print('create index result')
    print(json.dumps(response))
    return response


def del_index(es, index):
    '''Delete ``index`` and return the response of ``es.indices.delete``.

    When the client raises ``NotFoundError`` a message is printed and
    ``None`` is returned instead; other exceptions propagate.
    '''
    try:
        response = es.indices.delete(index=index)
    except elasticsearch.exceptions.NotFoundError:
        print('del_index error: index(%s) not found' % index)
        return None
    print('delete index result')
    print(json.dumps(response))
    return response


def get_index(es, index):
    '''Return the response of ``es.indices.get`` for ``index``.

    When the client raises ``NotFoundError`` a message is printed and
    ``None`` is returned; other exceptions propagate.
    '''
    try:
        return es.indices.get(index=index)
    except elasticsearch.exceptions.NotFoundError:
        print('get_index error: index(%s) not found' % index)
    return None


def load_os_knowledge_to_kb(es):
    '''Send every ``.txt`` file found directly in ``KB_OS_PATH`` to ES.

    Each matching file name is printed between two dashed rules and then
    handed to ``insert_file_content_to_es``. Entries whose name does not
    end in ``.txt`` are skipped; sub-directories are not descended into.
    '''
    print('load OS knowledge from local files into KB')
    print('KB_OS_PATH = %s' % KB_OS_PATH)
    rule = '-' * 20
    for entry in os.listdir(KB_OS_PATH):
        if not entry.endswith('.txt'):
            continue
        print(rule + entry + rule)
        insert_file_content_to_es(es, KB_OS_PATH, entry)


def insert_file_content_to_es(es, file_dir, file_name, index='kb', doc_type='OS',
                              encoding='gbk'):
    '''Index the content of one local text file as a document.

    The file is read as bytes and decoded with ``encoding`` using the
    ``'ignore'`` error handler, so undecodable bytes are dropped instead of
    raising. The document title is the file name without its extension.

    es        -- client object exposing ``index(index=, doc_type=, body=)``
    file_dir  -- directory containing the file
    file_name -- name of the file inside ``file_dir``
    index     -- target index (default ``'kb'``)
    doc_type  -- target document type (default ``'OS'``)
    encoding  -- encoding of the file content and of byte-string file names
                 (default ``'gbk'``)

    Returns the response of ``es.index``, or ``None`` when indexing failed.
    Errors while opening or reading the file propagate to the caller.
    '''
    print('insert content of file into ES %s/%s'%(file_dir,file_name))
    # Read everything first so the file is already closed while the
    # (potentially slow) request to the server is in flight.
    with open(os.path.join(file_dir, file_name), 'rb') as f:
        raw = f.read()

    title = os.path.splitext(file_name)[0]
    # Only byte-string names need decoding; text names (always the case on
    # Python 3, and on Python 2 when listing a unicode path) have no usable
    # ``decode`` and are taken as they are.
    if isinstance(title, bytes):
        title = title.decode(encoding, 'ignore')

    body = {
        'file_name': title,
        'author': 'zhaoxp',
        'source': 'manual write',
        'message': raw.decode(encoding, 'ignore'),
    }
    try:
        response = es.index(index=index, doc_type=doc_type, body=body)
    except Exception as e:
        # Deliberately best-effort: one failing document is reported and
        # must not abort a batch load driven by the caller.
        print('Exception : %s - %s' % (title, e))
        return None
    print('processing %s' % title)
    print(response)
    return response


def test_chinese(es):
    '''Manual experiment: index every file of a fixed directory.

    For each entry of ``D:\\temp\\t1`` the name, the raw bytes read from
    the file and their types are printed, then a document carrying a fixed
    Chinese message, the file name and the raw content is indexed into
    ``kb``/``os`` and the response is printed as indented JSON.
    '''
    dir_path = 'D:\\temp\\t1'
    for entry in os.listdir(dir_path):
        print(entry)
        print(type(entry))
        full_path = os.path.join(dir_path, entry)
        with open(full_path, 'rb') as handle:
            raw = handle.read()
            print(raw)
            print(type(raw))
            document = {
                'message': 'chinese words:你好,世界',
                'title': entry,
                'file_content': raw,
            }
            response = es.index(index='kb', doc_type='os', body=document)
            print('test chinese')
            print(json.dumps(response, indent=2))


if __name__=='__main__':
    # Manual driver: connect to the two configured nodes and, when the
    # cluster answers the ping, load the local OS knowledge files.
    print('test ES api')
    es = elasticsearch.Elasticsearch(hosts=
        [{'host':'10.120.20.206','port':9200},{'host':'10.120.20.205','port':9200}])
    if es.ping():
        print('ES connected.')
        # Debug output of the indices client. Written in call form so the
        # script parses on Python 3 as well as Python 2, like the other
        # print calls in this file.
        print(es.indices)
        print(dir(es.indices))
        # Optional one-off steps, enable by hand when needed:
        #   del_index(es, 'kb')
        #   create_index(es, 'kb')
        #   create_kb_index_type(es, 'kb', 'os')
        #   print(json.dumps(get_index(es, 'bank'), indent=2))
        #   test_chinese(es)
        print('start to load')
        load_os_knowledge_to_kb(es)
    else:
        print('ES not connected')
相關文章
相關標籤/搜索