以前自己寫的一段檢索代碼

時間 2019-11-19
原文鏈接
  1 # coding: utf-8
  2 import codecs
  3 from bson.json_util import dumps
  4 from config import s_host, s_port, s_indexer
  5 from models.gatherinfos import opt_infos
  6 import sphinxapi
  7 
  8 def getSearchEngine():
  9     """
 10     獲取搜索引擎的查詢接口
 11     :return:
 12     """
 13     spc = sphinxapi.SphinxClient()
 14     spc.SetServer(s_host, s_port)
 15     spc.SetMatchMode(sphinxapi.SPH_MATCH_ANY)
 16     return spc
 17 def getQueryResult(keyword,skip=0,limit=100000):
 18     """
 19     獲取默認的查詢結果
 20     :param keyword:查詢關鍵字
 21     :param skip: 起始位置
 22     :param limit: 限制返回數據數量
 23     :return:返回一個字典，結構相似如下.同時返回搜索引擎的實例
 24         {'attrs': [['gathertime', 2]],
 25          'error': '',
 26          'fields': ['department',
 27                     'title',
 28                     'href',
 29                     'postdetail',
 30                     'cuttitle',
 31                     'cutdetail'],
 32          'matches': [{'attrs': {'gathertime': 2013}, 'id': 1794, 'weight': 4},
 33                      {'attrs': {'gathertime': 2013}, 'id': 1836, 'weight': 4},
 34                      {'attrs': {'gathertime': 2013}, 'id': 1845, 'weight': 4},
 35                      ],
 36          'status': 0,
 37          'time': '0.001',
 38          'total': 470,
 39          'total_found': 470,
 40          'warning': '',
 41          'words': [{'docs': 470, 'hits': 1655, 'word': '\xe5\x88\x9b\xe6\x96\xb0'}]}
 42     """
 43     spc=getSearchEngine()
 44     #設置偏移量，容許分頁
 45     spc.SetLimits(skip, limit)
 46     # 檢索關鍵字"矢量"
 47     spc.SetMatchMode(0)
 48     spc.SetSortMode(sphinxapi.SPH_SORT_TIME_SEGMENTS,"gathertime")
 49     result = spc.Query(keyword, s_indexer)
 50     return result,spc
 51 
 52 def getQueryResultEx(keyword,skip=0,limit=10000):
 53     r,spc=getQueryResult(keyword,skip=skip,limit=limit)
 54     listall=[]
 55     for m in r["matches"]:
 56         id= m['id']
 57         msg = opt_infos.GetRowByStrWhere(" id=%s"%id)
 58         all=dict(m,**msg)
 59         del(all['attrs'])
 60         del(all['author'])
 61         del(all['cutdetail'])
 62         del(all['cuttitle'])
 63         del(all['posttime'])
 64         listall.append(all)
 65     r["datas"]=listall
 66     return r,spc
 67 def getHighlightContent(r,spc,keywords):
 68     """
 69     返回用於搜索結果顯示的頁面使用的列表
 70     :param r: 搜索引擎檢索結果，通過數據庫的配對後的內容
 71     :param spc: 搜索引擎實例
 72     :param keywords: 最初的搜索關鍵字
 73     :return:
 74     返回值列表中每一個數據是一個字典。
 75     字典中包括三個屬性，結構以下
 76     [
 77         {"title":"","content":"","url":""},
 78         {"title":"","content":"","url":""},
 79         ......
 80     ]
 81     """
 82     opts={
 83          'before_match':"<code>",
 84         'after_match':'</code>',
 85         'chunk_separator':' ... ',
 86         'around':3
 87     }
 88     lista=[]    #存放標題
 89     listb=[]    #存放主體內容
 90     listc=[]    #存放連接地址
 91     listd=[]    #存放來源網站
 92     for d in r["datas"]:
 93         a=d.get('title',"")
 94         b=d.get('postdetail',"")
 95         c=d.get('href',"")
 96         d=d.get('department',"")
 97         lista.append(a)
 98         listb.append(b)
 99         listc.append(c)
100         listd.append(d)
101     #標題與內容分別提取關鍵詞定位點的內容
102     lista=spc.BuildExcerpts(lista,'mysql',keywords,opts)
103     listb=spc.BuildExcerpts(listb,'mysql',keywords,opts)
104     recs=[]
105     i=0
106     for a in lista:
107         a=a
108         b=listb[i]
109         c=listc[i]
110         d=listd[i]
111         r={
112             "title":a,
113             "content":b,
114             "url":c,
115             "from":d
116         }
117         i+=1
118         recs.append(r)
119     return recs
120 
def GingerSearch(keywords,skip=0,limit=100000):
    """
    One-call search entry point: query, merge database rows, highlight.

    :param keywords: search keyword(s)
    :param skip: offset of the first match to return
    :param limit: maximum number of records to return
    :return: ``(records, raw)`` -- the page-ready list produced by
        ``getHighlightContent`` plus the result dict produced by
        ``getQueryResultEx``.  Each entry of ``records`` is a dict::

        [
            {"title": "", "content": "", "url": "", "from": ""},
            {"title": "", "content": "", "url": "", "from": ""},
            ......
        ]
    """
    raw, client = getQueryResultEx(keywords, skip, limit)
    return getHighlightContent(raw, client, keywords), raw
if __name__=="__main__":
    # Smoke run: search for a sample phrase and dump what GingerSearch
    # returns (page records plus raw result) as indented JSON into 1.txt.
    result = GingerSearch("科技創新")
    stra = dumps(result, ensure_ascii=False, indent=2)
    # The context manager closes the file even if the write raises.
    with codecs.open("1.txt", "wb", "utf8") as f:
        f.write(stra)