本地有不少文本文件,內容都是一些知識點,現在用 Python 將它們批量載入 ES(Elasticsearch)中。
代碼有點亂,尚未整理,但可以使用。
這裏要注意的是 str 的 decode 方法,需要加上參數 'ignore',因為在實際使用中會發生編碼轉換異常,原因我也沒有完全想明白。加上 'ignore' 會跳過這些無法解碼的字節,沒有太大影響(出問題的是一些符號字符,而不是英文或中文;也可能存在其他我沒有碰到的情況)。
# -*- coding: UTF-8 -*-
"""Create my knowledgebase (kb).

Bulk-loads local ``.txt`` note files into an Elasticsearch index and provides
a few small helpers for creating, fetching and deleting indices.

Every ``print`` call in this module takes exactly one argument, so the output
is the same whether the file is run with Python 2 (where ``print`` is a
statement) or Python 3 (where it is a function).
"""

__author__ = 'zhaoxp'

import elasticsearch
import json
import os
import os.path
from datetime import datetime

# Directory scanned by load_os_knowledge_to_kb() for ``.txt`` files.
KB_OS_PATH = 'E:\\study\\my_knowledgebase\\Unix&Linux&AIX&Windows'


def _to_text(value, encoding='gbk'):
    """Return ``value`` as text, decoding it only if it is a byte string.

    Undecodable bytes are skipped (``errors='ignore'``) instead of raising,
    which is the behaviour the loader relies on for files containing stray
    symbol bytes. Values that are already text are returned unchanged.
    """
    if isinstance(value, bytes):
        return value.decode(encoding, 'ignore')
    return value


def create_kb_index_type(es, index, doc_type):
    """Index an empty document into ``index``/``doc_type`` and return the response.

    The response is also pretty-printed to stdout.
    """
    res = es.index(index=index, doc_type=doc_type, body={})
    print('create index&type result ')
    print(json.dumps(res, indent=2))
    return res


def create_index(es, index):
    """Create ``index`` via ``es.indices.create`` and return the response.

    The response is also printed as JSON. It is returned so that this helper
    is consistent with del_index() and get_index().
    """
    result = es.indices.create(index=index)
    print('create index result')
    print(json.dumps(result))
    return result


def del_index(es, index):
    """Delete ``index`` and return the response.

    Returns None (after printing a message) when the index does not exist.
    """
    try:
        result = es.indices.delete(index=index)
    except elasticsearch.exceptions.NotFoundError:
        print('del_index error: index(%s) not found' % index)
        return None
    print('delete index result')
    print(json.dumps(result))
    return result


def get_index(es, index):
    """Return the result of ``es.indices.get`` for ``index``.

    Returns None (after printing a message) when the index does not exist.
    """
    try:
        return es.indices.get(index=index)
    except elasticsearch.exceptions.NotFoundError:
        print('get_index error: index(%s) not found' % index)
        return None


def load_os_knowledge_to_kb(es):
    """Insert every ``.txt`` file found directly in KB_OS_PATH into ES."""
    print('load OS knowledge from local files into KB')
    print('KB_OS_PATH = %s' % KB_OS_PATH)
    for file_name in os.listdir(KB_OS_PATH):
        if file_name.endswith('.txt'):
            print('-' * 20 + file_name + '-' * 20)
            insert_file_content_to_es(es, KB_OS_PATH, file_name)


def insert_file_content_to_es(es, file_dir, file_name, index='kb', doc_type='OS'):
    """Index the content of one text file as a single document.

    The file is read as raw bytes and decoded as GBK, skipping undecodable
    bytes. The document's ``file_name`` field is the name with its last four
    characters (the ``.txt`` extension) removed.

    Args:
        es: Elasticsearch client (anything exposing ``index(...)``).
        file_dir: Directory that contains the file.
        file_name: Name of the file inside ``file_dir``.
        index: Target index name.
        doc_type: Target document type.

    Indexing errors are printed and swallowed on purpose, so that one bad
    file does not abort a whole batch load. Errors opening or reading the
    file are not caught.
    """
    print('insert content of file into ES %s/%s' % (file_dir, file_name))
    # Read first and close the file before the (possibly slow) network call.
    with open(os.path.join(file_dir, file_name), 'rb') as f:
        raw_content = f.read()

    # On Python 2 os.listdir() on a byte-string path yields byte-string names,
    # which must be decoded; text names (Python 3, or unicode on Python 2)
    # pass through unchanged.
    title = _to_text(file_name[:-4])
    body = {
        'file_name': title,
        'author': 'zhaoxp',
        'source': 'manual write',
        'message': _to_text(raw_content),
    }
    try:
        result = es.index(index=index, doc_type=doc_type, body=body)
        print('processing %s' % title)
        print(result)
    except Exception as e:  # deliberate best-effort: report and continue
        print('Exception : %s - %s' % (title, e))


def test_chinese(es):
    """Manual experiment: index the files under D:\\temp\\t1 with a Chinese message.

    Prints each file name, its raw content and their types, then indexes a
    document per file and pretty-prints the response.
    """
    dir_path = 'D:\\temp\\t1'
    for fname in os.listdir(dir_path):
        print(fname)
        print(type(fname))
        with open(os.path.join(dir_path, fname), 'rb') as f:
            content = f.read()
        print(content)
        print(type(content))
        # NOTE(review): ``content`` is sent as the raw bytes read from disk,
        # without decoding; whether the client can serialise it depends on the
        # file's bytes and the Python version - confirm before relying on this.
        body = {'message': 'chinese words:你好,世界',
                'title': fname,
                'file_content': content}
        # NOTE(review): doc_type is 'os' here but 'OS' in
        # insert_file_content_to_es() - confirm which one is intended.
        result = es.index(index='kb', doc_type='os', body=body)
        print('test chinese')
        print(json.dumps(result, indent=2))


def main():
    """Connect to the ES cluster and load the OS knowledge files."""
    print('test ES api')
    es = elasticsearch.Elasticsearch(hosts=[
        {'host': '10.120.20.206', 'port': 9200},
        {'host': '10.120.20.205', 'port': 9200},
    ])
    if not es.ping():
        print('ES not connected')
        return

    print('ES connected.')
    print(es.indices)
    print(dir(es.indices))
    # One-off maintenance steps that can be run here by hand when needed:
    # del_index(es, 'kb'), create_index(es, 'kb'),
    # create_kb_index_type(es, 'kb', 'os'), get_index(es, 'bank'),
    # test_chinese(es).
    print('start to load')
    load_os_knowledge_to_kb(es)


if __name__ == '__main__':
    main()