Search Engine: Implementing a Search Engine Based on Django, Scrapy, and ElasticSearch

  • Host environment: Ubuntu 13.04
  • Python version: 2.7.4
  • Django version: 1.5.4
  • Scrapy version: 0.18.2
  • ElasticSearch version: 0.90.5

Original work. Please credit the source when reposting: http://blog.geekcome.com/archives/138

With some free time on my hands, I read up on the basics of search engines. After gathering material, I learned the basic subsystems a search engine needs: a crawling subsystem, an index service subsystem, and a web request/response subsystem. After working through the documentation of the underlying open-source frameworks, I integrated them into a project that has already been pushed to GitHub.

First, let's look at a crawler written with the open-source Scrapy framework. It crawls the campus network (mainly because that traffic doesn't count against my data plan):

#!/usr/bin/env python
#-*- coding:utf-8 -*-
#from urlparse import urljoin
from scrapy.utils.url import urljoin_rfc
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from scrapy.exceptions import DropItem

from mymodules.items import Website

import urllib
import re

class Xidian_Spider(BaseSpider):
    name = "xidian_spider"
    start_urls = [
        "http://www.xidian.edu.cn",
        #"http://rs.xidian.edu.cn/forum.php",
    ]

    def __init__(self):
        """Init the allowed_domains."""
        super(Xidian_Spider, self).__init__()
        self.allowed_domains = ['xidian.edu.cn']

    def parse(self, response):
        """This parse() uses two yields: one for the item, one for follow-up Requests."""
        hxs = HtmlXPathSelector(response)

        refer_websites = hxs.select('//@href').extract()

        #if not self.gethostname(response.url) in self.allowed_domains:
        #    self.allowed_domains.append(self.gethostname(response.url))

        item = Website()
        item['url'] = response.url
        item['title'] = hxs.select('/html/head/title/text()').extract()[0]

        # FIXME: this XPath selects all text nodes, including JavaScript code. BAD!!
        content = ''
        for s in hxs.select('/html/body//*/text()').extract():
            content += s.strip()
            content += ' '

        item['content'] = content

        yield item

        for weburl in refer_websites:

            utf8_url = weburl.encode('utf-8')

            # The following regexes match unwanted URL suffixes and prefixes
            postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
            prefix = re.compile(r'^((javascript:)|(openapi)).+')

            if postfix.match(utf8_url):
                continue
            if prefix.match(utf8_url):
                continue
            if not utf8_url.startswith('http://'):
                #weburl = urljoin_rfc(response.url, weburl, response.encoding)
                weburl = 'http://' + self.gethostname(response.url) + '/' + weburl

            # collapse relative path segments such as /../../
            weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
            weburl = re.sub(r'/\.\./', r'/', weburl)

            yield Request(weburl, callback=self.parse)

    def gethostname(self, res_url):
        """Get the host name of a URL."""
        proto, rest = urllib.splittype(res_url)
        host, rest = urllib.splithost(rest)
        return host
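
The spider fills a Website item with three fields: url, title, and content. The items module itself is not shown in the post; the following is only a minimal sketch of what mymodules/items.py might look like, assuming the item carries exactly those three fields:

# mymodules/items.py -- a minimal sketch (not from the original post);
# it assumes the item only holds the three fields the spider and the
# ElasticSearch mapping actually use.
from scrapy.item import Item, Field

class Website(Item):
    url = Field()
    title = Field()
    content = Field()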

Each item produced by the crawl is handed to the pipeline for processing.
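
For the pipeline to receive items it has to be enabled in the Scrapy project settings. A minimal sketch follows; the module path mymodules.pipelines is an assumption (adjust it to wherever DuplicatesPipeline actually lives), and note that Scrapy 0.18 expects a plain list here rather than the dict form used by later versions:

# settings.py -- a minimal sketch; module paths are assumptions.
BOT_NAME = 'xidian_spider'

SPIDER_MODULES = ['mymodules.spiders']

# Scrapy 0.18 takes a list of pipeline class paths (later versions use a dict).
ITEM_PIPELINES = [
    'mymodules.pipelines.DuplicatesPipeline',
]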

The pipeline here performs deduplication. The visited URLs cannot simply be kept in an ordinary in-memory collection, so a Bloom filter is used; I installed the open-source Python library pybloomfilter directly (worth studying in detail when time permits). A short standalone sketch of the filter's semantics follows the pipeline code below.

#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pybloomfilter import BloomFilter
from scrapy.exceptions import DropItem

from mymodules.search_index import SearchIndex  # assumed module path; adjust to where SearchIndex is defined

class DuplicatesPipeline(object):

    def __init__(self):
        # capacity of 100,000,000 URLs, 1% error rate, backed by the file 'filter.bloom'
        self.bf = BloomFilter(100000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' % len(self.bf)
        if self.bf.add(item['url']):  # True if the URL is already in the Bloom filter
            raise DropItem("Duplicate item found: %s" % item)
        else:
            #print '%d pages visited!' % len(self.url_seen)
            self.save_to_file(item['url'], item['title'])
            self.si.AddIndex(item)
            return item

    def save_to_file(self, url, utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """Close the visited-sites log and finish the index when the pipeline goes away."""
        self.f_write.close()
        self.si.IndexDone()
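
As noted above, the deduplication hinges on BloomFilter.add() returning True when the element is (probably) already present and False otherwise. A minimal standalone sketch of that behaviour, with an arbitrary capacity and file name:

# Sanity check of the BloomFilter.add() semantics the pipeline relies on.
from pybloomfilter import BloomFilter

bf = BloomFilter(1000, 0.01, 'demo.bloom')    # capacity 1000, 1% false-positive rate
print bf.add('http://www.xidian.edu.cn')      # False: first time this URL is seen
print bf.add('http://www.xidian.edu.cn')      # True: already present, so the item would be dropped
print len(bf)                                 # number of elements added so far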

The SearchIndex used in this class is responsible for building the ElasticSearch index. It is defined as follows:

#!/usr/bin/env python
#-*- coding:utf-8 -*-
import os
import sys
from pyes import *
from mymodules.items import Website

INDEX_NAME = 'xidian_spider'

class SearchIndex(object):

    def SearchInit(self):
        self.conn = ES('127.0.0.1:9200', timeout=3.5)  # connect to ES
        try:
            self.conn.delete_index(INDEX_NAME)
            #pass
        except:
            pass
        self.conn.create_index(INDEX_NAME)  # create a new index

        # Define the structure of the data format
        mapping = {u'content': {'boost': 1.0,
                          'index': 'analyzed',
                          'store': 'yes',
                          'type': u'string',
                          "indexAnalyzer": "ik",
                          "searchAnalyzer": "ik",
                          "term_vector": "with_positions_offsets"},
                  u'title': {'boost': 1.0,
                             'index': 'analyzed',
                             'store': 'yes',
                             'type': u'string',
                             "indexAnalyzer": "ik",
                             "searchAnalyzer": "ik",
                             "term_vector": "with_positions_offsets"},
                  u'url': {'boost': 1.0,
                             'index': 'analyzed',
                             'store': 'yes',
                             'type': u'string',
                             #"indexAnalyzer": "ik",
                             #"searchAnalyzer": "ik",
                             "term_vector": "with_positions_offsets"},
        }

        self.conn.put_mapping("searchEngine-type", {'properties': mapping}, [INDEX_NAME])  # define the type

    def AddIndex(self, item):

        print 'Adding Index item URL %s' % item['title'].encode('utf-8')
        self.conn.index({'title': item['title'].encode('utf-8'),
                'url': item['url'].encode('utf-8'),
                # The original listing is truncated at this point; presumably the
                # content field follows and the document is indexed into INDEX_NAME
                # under the "searchEngine-type" mapping, roughly like this:
                'content': item['content'].encode('utf-8')},
                INDEX_NAME, "searchEngine-type")
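
Once documents are in the index, the Django-facing side can query them through the same pyes connection. That part is not shown in the post, so the following is only a rough sketch, assuming the index name and document type defined in SearchIndex above; StringQuery issues a full-text query_string search against the ik-analyzed fields:

# A minimal query sketch (not from the original post), assuming the
# 'xidian_spider' index and the "searchEngine-type" mapping defined above.
from pyes import ES
from pyes.query import StringQuery

conn = ES('127.0.0.1:9200', timeout=3.5)
q = StringQuery('keyword', default_operator='AND')
results = conn.search(query=q, indices=['xidian_spider'], doc_types=['searchEngine-type'])
for r in results:
    print r['title'], r['url']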