Search Engine: Implementing a Search Engine Based on Django, Scrapy, and ElasticSearch

  • Host environment: Ubuntu 13.04
  • Python version: 2.7.4
  • Django version: 1.5.4
  • Scrapy version: 0.18.2
  • ElasticSearch version: 0.90.5

Original work. Please credit the source when reposting: http://blog.geekcome.com/archives/138

With some free time on my hands, I read up on the basics of search engines. After gathering material, I learned the basic subsystems a search engine needs: a crawling subsystem, an index service subsystem, and a web request/response subsystem. After working through the documentation of the underlying open-source frameworks, I integrated them into a project that has already been pushed to GitHub.

First, let's look at a crawler written with the open-source Scrapy framework. It crawls the campus network (mainly because that traffic doesn't count against my data plan):

#!/usr/bin/env python
#-*- coding:utf-8 -*-
#from urlparse import urljoin
from scrapy.utils.url import urljoin_rfc
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from scrapy.exceptions import DropItem

from mymodules.items import Website

import urllib
import re

class Xidian_Spider(BaseSpider):
    name = "xidian_spider"
    start_urls = [
        "http://www.xidian.edu.cn",
        #"http://rs.xidian.edu.cn/forum.php",
    ]

    def __init__(self):
        """Init the allowed_domains."""
        super(Xidian_Spider, self).__init__()
        self.allowed_domains = ['xidian.edu.cn']

    def parse(self, response):
        """This parse() uses two yields: one for the item, one for follow-up Requests."""
        hxs = HtmlXPathSelector(response)

        refer_websites = hxs.select('//@href').extract()

        #if not self.gethostname(response.url) in self.allowed_domains:
        #    self.allowed_domains.append(self.gethostname(response.url))

        item = Website()
        item['url'] = response.url
        item['title'] = hxs.select('/html/head/title/text()').extract()[0]

        # FIXME: this XPath selects all text nodes, including JavaScript code. BAD!!
        content = ''
        for s in hxs.select('/html/body//*/text()').extract():
            content += s.strip()
            content += ' '

        item['content'] = content

        yield item

        for weburl in refer_websites:

            utf8_url = weburl.encode('utf-8')

            # The following regexes match unwanted URL suffixes and prefixes
            postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
            prefix = re.compile(r'^((javascript:)|(openapi)).+')

            if postfix.match(utf8_url):
                continue
            if prefix.match(utf8_url):
                continue
            if not utf8_url.startswith('http://'):
                #weburl = urljoin_rfc(response.url, weburl, response.encoding)
                weburl = 'http://' + self.gethostname(response.url) + '/' + weburl

            # collapse relative path segments such as /../../
            weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
            weburl = re.sub(r'/\.\./', r'/', weburl)

            yield Request(weburl, callback=self.parse)

    def gethostname(self, res_url):
        """Get the host name of a URL."""
        proto, rest = urllib.splittype(res_url)
        host, rest = urllib.splithost(rest)
        return host
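
The spider fills a Website item with three fields: url, title, and content. The items module itself is not shown in the post; the following is only a minimal sketch of what mymodules/items.py might look like, assuming the item carries exactly those three fields:

# mymodules/items.py -- a minimal sketch (not from the original post);
# it assumes the item only holds the three fields the spider and the
# ElasticSearch mapping actually use.
from scrapy.item import Item, Field

class Website(Item):
    url = Field()
    title = Field()
    content = Field()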

Each item produced by the crawl is handed to the pipeline for processing.
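
For the pipeline to receive items it has to be enabled in the Scrapy project settings. A minimal sketch follows; the module path mymodules.pipelines is an assumption (adjust it to wherever DuplicatesPipeline actually lives), and note that Scrapy 0.18 expects a plain list here rather than the dict form used by later versions:

# settings.py -- a minimal sketch; module paths are assumptions.
BOT_NAME = 'xidian_spider'

SPIDER_MODULES = ['mymodules.spiders']

# Scrapy 0.18 takes a list of pipeline class paths (later versions use a dict).
ITEM_PIPELINES = [
    'mymodules.pipelines.DuplicatesPipeline',
]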

The pipeline here performs deduplication. The visited URLs cannot simply be kept in an ordinary in-memory collection, so a Bloom filter is used; I installed the open-source Python library pybloomfilter directly (worth studying in detail when time permits). A short standalone sketch of the filter's semantics follows the pipeline code below.

#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pybloomfilter import BloomFilter
from scrapy.exceptions import DropItem

from mymodules.search_index import SearchIndex  # assumed module path; adjust to where SearchIndex is defined

class DuplicatesPipeline(object):

    def __init__(self):
        # capacity of 100,000,000 URLs, 1% error rate, backed by the file 'filter.bloom'
        self.bf = BloomFilter(100000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' % len(self.bf)
        if self.bf.add(item['url']):  # True if the URL is already in the Bloom filter
            raise DropItem("Duplicate item found: %s" % item)
        else:
            #print '%d pages visited!' % len(self.url_seen)
            self.save_to_file(item['url'], item['title'])
            self.si.AddIndex(item)
            return item

    def save_to_file(self, url, utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """Close the visited-sites log and finish the index when the pipeline goes away."""
        self.f_write.close()
        self.si.IndexDone()
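
As noted above, the deduplication hinges on BloomFilter.add() returning True when the element is (probably) already present and False otherwise. A minimal standalone sketch of that behaviour, with an arbitrary capacity and file name:

# Sanity check of the BloomFilter.add() semantics the pipeline relies on.
from pybloomfilter import BloomFilter

bf = BloomFilter(1000, 0.01, 'demo.bloom')    # capacity 1000, 1% false-positive rate
print bf.add('http://www.xidian.edu.cn')      # False: first time this URL is seen
print bf.add('http://www.xidian.edu.cn')      # True: already present, so the item would be dropped
print len(bf)                                 # number of elements added so far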

The SearchIndex used in this class is responsible for building the ElasticSearch index. It is defined as follows:

#!/usr/bin/env python
#-*- coding:utf-8 -*-
import os
import sys
from pyes import *
from mymodules.items import Website

INDEX_NAME = 'xidian_spider'

class SearchIndex(object):

    def SearchInit(self):
        self.conn = ES('127.0.0.1:9200', timeout=3.5)  # connect to ES
        try:
            self.conn.delete_index(INDEX_NAME)
            #pass
        except:
            pass
        self.conn.create_index(INDEX_NAME)  # create a new index

        # Define the structure of the data format
        mapping = {u'content': {'boost': 1.0,
                          'index': 'analyzed',
                          'store': 'yes',
                          'type': u'string',
                          "indexAnalyzer": "ik",
                          "searchAnalyzer": "ik",
                          "term_vector": "with_positions_offsets"},
                  u'title': {'boost': 1.0,
                             'index': 'analyzed',
                             'store': 'yes',
                             'type': u'string',
                             "indexAnalyzer": "ik",
                             "searchAnalyzer": "ik",
                             "term_vector": "with_positions_offsets"},
                  u'url': {'boost': 1.0,
                             'index': 'analyzed',
                             'store': 'yes',
                             'type': u'string',
                             #"indexAnalyzer": "ik",
                             #"searchAnalyzer": "ik",
                             "term_vector": "with_positions_offsets"},
        }

        self.conn.put_mapping("searchEngine-type", {'properties': mapping}, [INDEX_NAME])  # define the type

    def AddIndex(self, item):

        print 'Adding Index item URL %s' % item['title'].encode('utf-8')
        self.conn.index({'title': item['title'].encode('utf-8'),
                'url': item['url'].encode('utf-8'),
                # The original listing is truncated at this point; presumably the
                # content field follows and the document is indexed into INDEX_NAME
                # under the "searchEngine-type" mapping, roughly like this:
                'content': item['content'].encode('utf-8')},
                INDEX_NAME, "searchEngine-type")
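
Once documents are in the index, the Django-facing side can query them through the same pyes connection. That part is not shown in the post, so the following is only a rough sketch, assuming the index name and document type defined in SearchIndex above; StringQuery issues a full-text query_string search against the ik-analyzed fields:

# A minimal query sketch (not from the original post), assuming the
# 'xidian_spider' index and the "searchEngine-type" mapping defined above.
from pyes import ES
from pyes.query import StringQuery

conn = ES('127.0.0.1:9200', timeout=3.5)
q = StringQuery('keyword', default_operator='AND')
results = conn.search(query=q, indices=['xidian_spider'], doc_types=['searchEngine-type'])
for r in results:
    print r['title'], r['url']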