第三百六十八節,Python分佈式爬蟲打造搜索引擎Scrapy精講—用Django實現搜索的自動補全功能
elasticsearch(搜索引擎)提供了自動補全接口
官方說明:https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-completion.html
一、建立搜索自動補全字段suggest
自動補全需要用到一個名稱爲suggest、類型爲Completion的字段
所以我們需要在前面用elasticsearch-dsl操作elasticsearch(搜索引擎)的代碼裏,增加一個類型爲Completion的suggest字段
注意:由於elasticsearch-dsl源碼問題,給Completion類型的字段指定分詞器時會報錯,所以我們需要重寫CustomAnalyzer類
只有Completion類型才需要這樣處理,其他類型直接指定分詞器即可
#!/usr/bin/env python
"""Elasticsearch mapping for crawled pages, including a completion field.

Running this file as a script creates the index, type and field mapping.

Usage from other modules:
    lagou = lagouType()          # instantiate the document class
    lagou.title = 'value'        # assign each field
    lagou.description = 'value'
    lagou.keywords = 'value'
    lagou.url = 'value'
    lagou.riqi = 'value'
    lagou.save()                 # write the document to Elasticsearch
"""
from datetime import datetime

from elasticsearch_dsl import (
    Boolean,
    Completion,
    Date,
    DocType,
    InnerObjectWrapper,
    Integer,
    Keyword,
    Nested,
    Text,
    analyzer,
)
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

# Register the connection to the Elasticsearch server.
connections.create_connection(hosts=['127.0.0.1'])


class CustomAnalyzer(_CustomAnalyzer):
    """CustomAnalyzer variant whose analysis definition is always empty.

    Needed only for Completion fields: specifying an analyzer on a
    Completion field with the stock class raises an error, so the
    definition is overridden to return an empty dict.
    """

    def get_analysis_definition(self):
        return {}


# ik_max_word tokenizer combined with a lowercase filter, for the Completion field.
ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class lagouType(DocType):
    """Document class describing one crawled page."""

    # Completion field backing the autocomplete suggestions.
    suggest = Completion(analyzer=ik_analyzer)
    # Text fields are tokenised (ik_max_word is the Chinese analyzer) and
    # get an inverted index.
    title = Text(analyzer="ik_max_word")
    description = Text(analyzer="ik_max_word")
    keywords = Text(analyzer="ik_max_word")
    # Keyword is a plain string that is not tokenised.
    url = Keyword()
    # Date field.
    riqi = Date()

    class Meta:
        # Index name (comparable to a database name).
        index = "lagou"
        # Document type name (comparable to a table name).
        doc_type = 'biao'


if __name__ == "__main__":
    # Only when executed directly: create the index, type and field mapping.
    lagouType.init()
二、搜索自動補全字段suggest寫入數據
搜索自動補全字段suggest接收的是要搜索的字段經過分詞後的數據,詳情見下面的自定義分詞函數gen_suggest
elasticsearch-dsl操做elasticsearch(搜索引擎)
#!/usr/bin/env python
# -*- coding:utf8 -*-
"""Elasticsearch mapping for crawled pages plus the suggestion builder.

Running this file as a script creates the index, type and field mapping.

Usage from other modules:
    lagou = lagouType()          # instantiate the document class
    lagou.title = 'value'        # assign each field
    lagou.description = 'value'
    lagou.keywords = 'value'
    lagou.url = 'value'
    lagou.riqi = 'value'
    lagou.save()                 # write the document to Elasticsearch
"""
from datetime import datetime

from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

# Register the connection to the Elasticsearch server.
connections.create_connection(hosts=['127.0.0.1'])


class CustomAnalyzer(_CustomAnalyzer):
    """CustomAnalyzer variant whose analysis definition is always empty.

    Needed only for Completion fields: specifying an analyzer on a
    Completion field with the stock class raises an error, so the
    definition is overridden to return an empty dict.
    """

    def get_analysis_definition(self):
        return {}


# ik_max_word tokenizer combined with a lowercase filter, for the Completion field.
ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class lagouType(DocType):
    """Document class describing one crawled page."""

    # Completion field backing the autocomplete suggestions.
    suggest = Completion(analyzer=ik_analyzer)
    # Text fields are tokenised (ik_max_word is the Chinese analyzer) and
    # get an inverted index.
    title = Text(analyzer="ik_max_word")
    description = Text(analyzer="ik_max_word")
    keywords = Text(analyzer="ik_max_word")
    # Keyword is a plain string that is not tokenised.
    url = Keyword()
    # Date field.
    riqi = Date()

    class Meta:
        # Index name (comparable to a database name).
        index = "lagou"
        # Document type name (comparable to a table name).
        doc_type = 'biao'


def gen_suggest(index, info_tuple):
    """Build the value for the ``suggest`` completion field.

    Every non-empty text is tokenised through the Elasticsearch analyze API
    with the ``ik_max_word`` analyzer. Tokens of length 1 are dropped. A
    token is kept only for the first pair it appears in, so list the
    higher-weight texts first.

    Args:
        index: Index whose analyzers are used for tokenising, normally
            ``lagouType._doc_type.index``.
        info_tuple: Iterable of ``(text, weight)`` pairs, e.g.
            ``(('some title', 10), ('some keywords', 8))``.

    Returns:
        A list of dicts such as
        ``[{'input': ['token_a', 'token_b'], 'weight': 10},
        {'input': ['token_c'], 'weight': 8}]``. Pairs that contribute no
        new token are omitted.
    """
    # Reuse the connection registered at import time instead of building
    # (and re-registering) a new client on every call.
    es = connections.get_connection(lagouType._doc_type.using)
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if not text:
            continue
        # Ask Elasticsearch to analyse the text.
        words = es.indices.analyze(index=index, analyzer="ik_max_word",
                                   params={'filter': ["lowercase"]}, body=text)
        analyzed_words = {r["token"] for r in words["tokens"] if len(r["token"]) > 1}
        new_words = analyzed_words - used_words
        if new_words:
            # Remember what has been emitted so later (lower-weight) texts
            # do not repeat the same tokens.
            used_words |= new_words
            suggests.append({"input": list(new_words), "weight": weight})
    return suggests


if __name__ == "__main__":
    # Only when executed directly: create the index, type and field mapping.
    lagouType.init()
suggest字段寫入數據
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
#
# items.py holds the container classes that receive the data scraped by
# the spiders.

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader

from adc.models.elasticsearch_orm import lagouType, gen_suggest


class LagouItemLoader(ItemLoader):
    """ItemLoader used by the spider to populate ``LagouItem``."""

    # Loaded values arrive as lists; TakeFirst() is used to get at the
    # content of the list.
    default_output_processor = TakeFirst()


def tianjia(value):
    """Pre-processing hook for scraped values; returns the value unchanged."""
    return value


class LagouItem(scrapy.Item):
    """Container for the data scraped from one page."""

    # Each title value is passed through ``tianjia`` by MapCompose.
    title = scrapy.Field(input_processor=MapCompose(tianjia))
    description = scrapy.Field()
    keywords = scrapy.Field()
    url = scrapy.Field()
    riqi = scrapy.Field()

    def save_to_es(self):
        """Copy this item into a ``lagouType`` document and save it.

        The ``suggest`` field is built by ``gen_suggest`` from the title
        (weight 10) and the keywords (weight 8).
        """
        doc = lagouType()
        for field_name in ('title', 'description', 'keywords', 'url', 'riqi'):
            setattr(doc, field_name, self[field_name])

        weighted_sources = ((doc.title, 10), (doc.keywords, 8))
        doc.suggest = gen_suggest(lagouType._doc_type.index, weighted_sources)

        doc.save()
寫入elasticsearch(搜索引擎)後的狀況
{
"_index": "lagou",
"_type": "biao",
"_id": "AV5MDu0NXJs9MkF5tFxW",
"_version": 1,
"_score": 1,
"_source": {
"title": "LED光催化滅蚊燈廣告錄音_廣告錄音網-火紅廣告錄音_叫賣錄音下載_語音廣告製做",
"keywords": "各種小商品,廣告錄音,叫賣錄音,火紅廣告錄音",
"url": "http://www.luyin.org/post/2486.html",
"suggest": [
{
"input": [
"廣告"
,
"火紅"
,
"製做"
,
"叫賣"
,
"滅蚊燈"
,
"語音"
,
"下載"
,
"led"
,
"錄音"
,
"滅蚊"
,
"光催化"
,
"催化"
],
"weight": 10
}
,
{
"input": [
"小商品"
,
"廣告"
,
"各種"
,
"火紅"
,
"叫賣"
,
"商品"
,
"小商"
,
"錄音"
],
"weight": 8
}
],
"riqi": "2017-09-04T16:43:20",
"description": "LED光催化滅蚊燈廣告錄音 是廣告錄音網-火紅廣告錄音中一篇關於 各種小商品 的文章,歡迎您閱讀和評論,專業叫賣錄音-廣告錄音-語音廣告製做"
}
}
用Django實現搜索的自動補全功能說明
1.將搜索框綁定一個事件,每輸入一個字觸發這個事件,獲取到輸入框裏的內容,用ajax將輸入的詞請求到Django的邏輯處理函數。
2.在邏輯處理函數裏,將請求詞用elasticsearch(搜索引擎)的fuzzy模糊查詢,查詢suggest字段裏存在請求詞的數據,將查詢到的數據以JSON返回給前端,添加到自動補全的下拉列表裏
html代碼:
<!DOCTYPE html >
<html xmlns="http://www.w3.org/1999/xhtml">
{# Load the static-files template tag library #}
{% load staticfiles %}
<head>
<meta http-equiv="X-UA-Compatible" content="IE=emulateIE7" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>lcv-search 搜索引擎</title>
<link href="{% static 'css/style.css'%}" rel="stylesheet" type="text/css" />
<link href="{% static 'css/index.css'%}" rel="stylesheet" type="text/css" />
</head>
<body>
<div id="container">
    <div id="bd">
        <div id="main">
            <h1 class="title">
                <div class="logo large"></div>
            </h1>
            <div class="nav ue-clear">
                <ul class="searchList">
                    <li class="searchItem current" data-type="article">文章</li>
                    <li class="searchItem" data-type="question">問答</li>
                    <li class="searchItem" data-type="job">職位</li>
                </ul>
            </div>
            <div class="inputArea">
                {% csrf_token %}
                <input type="text" class="searchInput" />
                <input type="button" class="searchButton" onclick="add_search()" />
                <ul class="dataList">
                    <li>如何學好設計</li>
                    <li>界面設計</li>
                    <li>UI設計培訓要多少錢</li>
                    <li>設計師學習</li>
                    <li>哪裏有好的網站</li>
                </ul>
            </div>

            <div class="historyArea">
                <p class="history">
                    <label>熱門搜索:</label>
                </p>
                <p class="history mysearch">
                    <label>個人搜索:</label>
                    <span class="all-search">
                        <a href="javascript:;">專一界面設計網站</a>
                        <a href="javascript:;">用戶體驗</a>
                        <a href="javascript:;">互聯網</a>
                        <a href="javascript:;">資費套餐</a>
                    </span>
                </p>
            </div>
        </div><!-- End of main -->
    </div><!--End of bd-->

    <div class="foot">
        <div class="wrap">
            <div class="copyright">Copyright ©uimaker.com 版權全部 E-mail:admin@uimaker.com</div>
        </div>
    </div>
</div>
</body>
<script type="text/javascript" src="{% static 'js/jquery.js'%}"></script>
<script type="text/javascript" src="{% static 'js/global.js'%}"></script>
<script type="text/javascript">
    var suggest_url = "/suggest/";
    var search_url = "/search/";

    // Highlight the search category that was clicked.
    $('.searchList').on('click', '.searchItem', function(){
        $('.searchList .searchItem').removeClass('current');
        $(this).addClass('current');
    });

    // Remove the first element of arr that equals val.
    function removeByValue(arr, val) {
        for (var i = 0; i < arr.length; i++) {
            if (arr[i] == val) {
                arr.splice(i, 1);
                break;
            }
        }
    }

    // Build an <a> pointing at the search page for `term`.
    // The term is URL-encoded in the href and inserted as text (never as
    // markup), so titles coming from crawled pages or from localStorage
    // cannot inject HTML or script into this page.
    function buildSearchLink(term) {
        return $('<a>')
            .attr('href', search_url + '?q=' + encodeURIComponent(term))
            .text(term);
    }

    // Search suggestions: query the backend on every change of the input.
    $(function(){
        $('.searchInput').on('input propertychange', function(){
            var searchText = $(this).val();
            $.ajax({
                cache: false,
                type: 'get',
                dataType: 'json',
                url: suggest_url,
                // Passing the parameters as an object lets jQuery URL-encode
                // them, so characters such as & # + survive the round trip.
                data: {
                    s: searchText,
                    s_type: $(".searchItem.current").attr('data-type')
                },
                async: true,
                success: function(data) {
                    var $list = $(".dataList").empty();
                    for (var i = 0; i < data.length; i++) {
                        $list.append($('<li>').append(buildSearchLink(data[i])));
                    }
                    if (data.length == 0) {
                        $list.hide();
                    } else {
                        $list.show();
                    }
                }
            });
        });
    });

    hideElement($('.dataList'), $('.searchInput'));
</script>
<script>
    // Search history kept in localStorage as a comma-separated string.
    var searchArr;
    if (localStorage.search) {
        // localStorage only stores strings, so turn it back into an array.
        searchArr = localStorage.search.split(",");
    } else {
        searchArr = [];
    }
    // Show the stored history.
    MapSearchArr();

    // Click handler of the search button: record the term and navigate.
    function add_search(){
        var val = $(".searchInput").val();
        if (val.length >= 2) {
            // Move the term to the front of the history without duplicating it.
            KillRepeat(val);
            // Persist the history.
            localStorage.search = searchArr;
            // Re-render the history.
            MapSearchArr();
        }
        window.location.href = search_url + '?q=' + encodeURIComponent(val) +
            "&s_type=" + encodeURIComponent($(".searchItem.current").attr('data-type'));
    }

    // Render at most the five most recent history entries.
    function MapSearchArr(){
        var arrLen = Math.min(searchArr.length, 5);
        var $history = $(".mysearch .all-search").empty();
        for (var i = 0; i < arrLen; i++) {
            $history.append(buildSearchLink(searchArr[i]));
        }
    }

    // Put val at the front of the history, removing an earlier occurrence.
    function KillRepeat(val){
        removeByValue(searchArr, val);
        searchArr.unshift(val);
    }
</script>
</html>
Django路由映射
"""pachong URL Configuration

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/1.10/topics/http/urls/
Examples:
Function views
    1. Add an import:  from my_app import views
    2. Add a URL to urlpatterns:  url(r'^$', views.home, name='home')
Class-based views
    1. Add an import:  from other_app.views import Home
    2. Add a URL to urlpatterns:  url(r'^$', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.conf.urls import url, include
    2. Add a URL to urlpatterns:  url(r'^blog/', include('blog.urls'))
"""
from django.conf.urls import url
from django.contrib import admin

from app1 import views

# NOTE(review): the search page template links to /search/, which has no
# route in this list - confirm it is added elsewhere.
urlpatterns = [
    url(r'^admin/', admin.site.urls),
    # Search home page, reachable at the site root and under index/.
    url(r'^$', views.indexluoji),
    url(r'^index/', views.indexluoji),
    # Autocomplete requests issued by the search box.
    url(r'^suggest/$', views.suggestluoji, name="suggest"),
]
Django靜態文件配置
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/

# URL prefix under which static files are served.
STATIC_URL = '/static/'

# Directories searched for static files.
STATICFILES_DIRS = [os.path.join(BASE_DIR, 'static')]
備註:搜索自動補全fuzzy查詢
# Autocomplete (completion suggester) query with fuzzy matching.
#   "suggest"     - the suggest section of the request
#   "my_suggest"  - user-chosen name for this suggestion
#   "text"        - the search term
#   "field"       - the field to search
#   "fuzziness"   - edit distance
POST lagou/biao/_search?pretty
{
  "suggest": {
    "my_suggest": {
      "text": "廣告",
      "completion": {
        "field": "suggest",
        "fuzzy": {
          "fuzziness": 1
        }
      }
    }
  },
  "_source": "title"
}
Django邏輯處理文件
import json

from django.shortcuts import render, HttpResponse
from django.views.generic.base import View

from app1.models import lagouType  # Elasticsearch document class

# Create your views here.


def indexluoji(request):
    """Render the search home page."""
    print(request.method)  # debug output: HTTP method of the request
    return render(request, 'index.html')


def suggestluoji(request):
    """Return autocomplete suggestions for the ``s`` query parameter.

    Runs a fuzzy completion-suggester query (fuzziness 2, at most 5
    options) against the ``suggest`` field and responds with a JSON list
    holding the ``title`` of each option. An empty or missing ``s``
    yields an empty list.
    """
    key_words = request.GET.get('s', '')
    titles = []
    if key_words:
        completion_options = {
            "field": "suggest",
            "fuzzy": {
                "fuzziness": 2
            },
            "size": 5
        }
        query = lagouType.search().suggest(
            'my_suggest', key_words, completion=completion_options)
        response = query.execute_suggest()
        titles = [
            option._source["title"]
            for option in response.my_suggest[0].options
        ]
    return HttpResponse(json.dumps(titles), content_type="application/json")
最終完成