The crawler's job is to download a page and then follow the links it finds, level by level, down to the n-th layer of pages.

Main flow:
start the crawler
seed a queue with the starting URL
loop, exiting when the queue is empty
pop a URL off the queue
download the page and collect the links to the next level
add each new link to the queue
(a minimal sketch of this loop follows below)
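A minimal sketch of that queue-driven loop, assuming a hypothetical get_links helper that stands in for the download-and-parse step of the full program below:

from urlparse import urljoin          # Python 2 stdlib, same module as the full program

def get_links(url):
    # hypothetical stand-in: fetch `url` and return the links found on the page
    return []

def crawl(start_url):
    q = [start_url]                   # queue seeded with the starting URL
    seen = set()                      # URLs already processed
    while q:                          # exit when the queue is empty
        url = q.pop()                 # take a URL off the queue
        if url in seen:
            continue
        seen.add(url)
        for link in get_links(url):   # next-level links found on the page
            q.append(urljoin(url, link))   # make relative links absolute, re-queue

# example invocation (the URL is only an illustration):
# crawl('http://www.example.com/')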
from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class retri(object):
    """Download a single page and extract its links."""

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        # map the URL to a local file path, creating directories as needed
        parsedurl = urlparse(url, 'http:', 0)
        if parsedurl[2] == '':
            path = parsedurl[1] + '/' + deffile
        else:
            path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file extension, use the default file
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)            # local directory for the archive
        if sep != '/':                  # OS-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create the archive directory if needed
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        # fetch the page into the local file; return an error string on failure
        try:
            retval = urlretrieve(self.url, self.file)
            return retval
        except IOError:
            retval = '*** error: invalid url "%s"' % self.url
            return retval

    def parse_and_getlink(self):
        # parse the saved HTML and return every anchor (<a href=...>) found
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class crawler(object):
    """Manage the crawl: a queue of pending URLs and a list of seen ones."""

    count = 0                           # class-level counter of downloaded pages

    def __init__(self, url):
        self.q = [url]                  # queue of URLs still to fetch
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # stay inside the starting domain

    def get_page(self, url):
        r = retri(url)
        retval = r.download()
        if retval[0] == '*':            # download failed, skip parsing
            print retval, '... skipping parse'
            return
        crawler.count += 1
        print '\n(', crawler.count, ')'
        print 'url:', url
        print 'file:', retval[0]
        self.seen.append(url)

        links = r.parse_and_getlink()
        for eachlink in links:
            # turn relative links into absolute ones
            if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                eachlink = urljoin(url, eachlink)
            print '* ', eachlink,
            if find(lower(eachlink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachlink not in self.seen:
                if find(eachlink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '... new, added to q'
                    else:
                        print '... discarded, already in q'
            else:
                print '... discarded, already processed'

    def go(self):
        # keep fetching until the queue is empty
        while self.q:
            url = self.q.pop()
            self.get_page(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('enter starting url: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
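Assuming the listing is saved as crawl.py (the filename and URL here are only illustrations), it can be run from the command line or started interactively:

python crawl.py http://www.example.com/

Each downloaded page is saved under a directory named after the host, and every link found is printed along with the reason it was queued or discarded. Note that the listing targets Python 2 only: htmllib, urlparse, cStringIO and the print statement are gone in Python 3 (urlparse became urllib.parse, and urlretrieve moved to urllib.request).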