With some free time on my hands, I read up on the basics of search engines. From the material I gathered, I learned which basic subsystems a search engine needs: a crawling subsystem, an indexing subsystem, and a Web request/response subsystem. After studying the documentation of a few basic open-source frameworks, I put together an integrated project, which has been pushed to GitHub.
#!/usr/bin/env python
#-*- coding:utf-8 -*-
#from urlparse import urljoin
from scrapy.utils.url import urljoin_rfc
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from scrapy.exceptions import DropItem

from mymodules.items import Website

import urllib
import re

class Xidian_Spider(BaseSpider):
    name = "xidian_spider"
    start_urls = [
        "http://www.xidian.edu.cn",
    ]

    def __init__(self):
        """Init the allowed_domains."""
        super(Xidian_Spider, self).__init__()
        self.allowed_domains = ['xidian.edu.cn']

    def parse(self, response):
        """In this parse we use a double yield to return either the item or a new Request."""
        hxs = HtmlXPathSelector(response)

        refer_websites = hxs.select('//@href').extract()

        #if not self.gethostname(response.url) in self.allowed_domains:
        #    self.allowed_domains.append(self.gethostname(response.url))

        item = Website()
        item['url'] = response.url
        item['title'] = hxs.select('/html/head/title/text()').extract()[0]

        #FIXME: this XPath selects all text nodes, including javascript code. BAD!!
        content = ''
        for s in hxs.select('/html/body//*/text()').extract():
            content += s.strip()
            content += ' '

        item['content'] = content

        yield item

        for weburl in refer_websites:
            utf8_url = weburl.encode('utf-8')

            # The following regexes match unwanted suffixes and prefixes of urls
            postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
            prefix = re.compile(r'^((javascript:)|(openapi)).+')

            if postfix.match(utf8_url):
                continue
            if prefix.match(utf8_url):
                continue
            if not utf8_url.startswith('http://'):
                #weburl = urljoin_rfc(response.url, weburl, response.encoding)
                weburl = 'http://' + self.gethostname(response.url) + '/' + weburl

            weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
            weburl = re.sub(r'/\.\./', r'/', weburl)

            yield Request(weburl, callback=self.parse)

    def gethostname(self, res_url):
        """Get the host name of a url."""
        proto, rest = urllib.splittype(res_url)
        host, rest = urllib.splithost(rest)
        return host
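The spider stores everything in a Website item imported from mymodules.items, but that file is not shown in the post. A minimal sketch, assuming nothing beyond the three fields the spider actually fills (url, title, content), would look like this:

#!/usr/bin/env python
#-*- coding:utf-8 -*-
# mymodules/items.py -- hypothetical sketch, only the fields used by Xidian_Spider
from scrapy.item import Item, Field

class Website(Item):
    url = Field()       # page address
    title = Field()     # text of the <title> element
    content = Field()   # concatenated visible text of the page

With the item defined and the spider in place, the crawl is started from the project directory with "scrapy crawl xidian_spider". Next comes the pipeline that deduplicates pages and hands them to the indexer: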
from scrapy.exceptions import DropItem
# The original post omits the imports for this file. BloomFilter is assumed to be
# pybloomfiltermmap's BloomFilter(capacity, error_rate, filename); SearchIndex is
# the class shown in the next listing.
from pybloomfilter import BloomFilter
from mymodules.search_index import SearchIndex  # assumed module path

class DuplicatesPipeline(object):

    def __init__(self):
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()

    def process_item(self, item, spider):
        print '************%d pages visited!*****************' % len(self.bf)
        if self.bf.add(item['url']):    # True if the url is already in the bloom filter
            raise DropItem("Duplicate item found: %s" % item)
        else:
            #print '%d pages visited!' % len(self.url_seen)
            self.save_to_file(item['url'], item['title'])
            self.si.AddIndex(item)
            return item

    def save_to_file(self, url, utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """docstring for __del__"""
        self.f_write.close()
        self.si.IndexDone()
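Scrapy only runs this pipeline if it is registered in the project's settings.py. A minimal sketch, assuming the pipeline lives in mymodules/pipelines.py and using the old list-style ITEM_PIPELINES setting that matches the Scrapy version which still ships BaseSpider and HtmlXPathSelector:

# settings.py -- hypothetical module path; adjust to where DuplicatesPipeline actually lives
ITEM_PIPELINES = [
    'mymodules.pipelines.DuplicatesPipeline',
]

Dropping duplicates here, before they ever reach the indexer, keeps the ElasticSearch side simple: the SearchIndex class below only ever sees unique URLs.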
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import os
import sys
from pyes import *
from mymodules.items import Website
INDEX_NAME = 'xidian_spider'

class SearchIndex(object):

    def SearchInit(self):
        self.conn = ES('127.0.0.1:9200', timeout=3.5)    # Connect to ES
        try:
            self.conn.delete_index(INDEX_NAME)
            #pass
        except:
            pass
        self.conn.create_index(INDEX_NAME)    # Create a new INDEX

        # Define the structure of the data format
        mapping = {u'content': {'boost': 1.0,
                                'index': 'analyzed',
                                'store': 'yes',
                                'type': u'string',
                                "indexAnalyzer": "ik",
                                "searchAnalyzer": "ik",
                                "term_vector": "with_positions_offsets"},
                   u'title': {'boost': 1.0,
                              'index': 'analyzed',
                              'store': 'yes',
                              'type': u'string',
                              "indexAnalyzer": "ik",
                              "searchAnalyzer": "ik",
                              "term_vector": "with_positions_offsets"},
                   u'url': {'boost': 1.0,
                            'index': 'analyzed',
                            'store': 'yes',
                            'type': u'string',
                            #"indexAnalyzer": "ik",
                            #"searchAnalyzer": "ik",
                            "term_vector": "with_positions_offsets"},
                   }

        self.conn.put_mapping("searchEngine-type", {'properties': mapping}, [INDEX_NAME])    # Define the type

    def AddIndex(self, item):
        print 'Adding Index item URL %s' % item['title'].encode('utf-8')
        self.conn.index({'title': item['title'].encode('utf-8'), \
                         'url': item['url'].encode('utf-8'), \