1. 獲取 http://m.sohu.com 頁面中全部的 <a href='...'> 標籤 2. 遞歸調用 getInlink 函數獲取指定 url 中的全部 <a> 標籤 3. 在 urllib.urlopen(url) 處,我們可以用 try ... except 來捕獲異常,找出異常頁面。
from bs4 import BeautifulSoup import urllib import requestsurl
# Shared crawl state; every function in this file reads/writes these sets.

# Site-relative hrefs (those starting with '/'), i.e. paths under the base url.
page0 = set()
# Absolute links (hrefs starting with 'http'), e.g. http://xxxsohuxx.
page1 = set()
# URLs under http://m.sohu.com/ recorded by checkInlink, plus the URLs that
# myurlopen failed to open.
page2 = set()
def basegetlink():
    """Seed the link sets from the http://m.sohu.com home page.

    Every anchor tag that carries an ``href`` is inspected: hrefs starting
    with 'http' are added to ``page1`` and hrefs starting with '/' are added
    to ``page0``.  Any other href is ignored.
    """
    html = urllib.urlopen('http://m.sohu.com')
    try:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed, so results can differ between machines.
        bsobj = BeautifulSoup(html, 'html.parser')
    finally:
        # The document is fully parsed at this point; release the socket.
        html.close()

    for link in bsobj.find_all('a'):
        newpage = link.get('href')
        if newpage is None:
            continue
        if newpage.startswith('http'):
            page1.add(newpage)
        elif newpage.startswith('/'):
            page0.add(newpage)
def checkExlink(url):
    """Record *url* in ``page1`` when it is an absolute link.

    Returns True if *url* starts with 'http' (and was added to ``page1``),
    False otherwise; nothing is recorded in the False case.
    """
    if not url.startswith('http'):
        return False
    page1.add(url)
    # Single-argument print: same output whether parsed as a Python 2 print
    # statement or a Python 3 print() call.
    print('add a http: url to page1 ' + url)
    return True
def checkInlink(url):
    """Record *url* in ``page2`` when it lives under http://m.sohu.com.

    Returns True if *url* starts with 'http://m.sohu.com' (and was added to
    ``page2``), False otherwise; nothing is recorded in the False case.
    """
    if not url.startswith('http://m.sohu.com'):
        return False
    page2.add(url)
    # The message names page2, the set that was actually updated.
    print('add a http: url to page2 ' + url)
    # Report success explicitly, mirroring checkExlink.
    return True
def myurlopen(url):
    """Open *url* and return the response object, or None on failure.

    A url that does not start with 'http' is treated as a path on
    http://m.sohu.com and is prefixed with that base before opening.
    When opening fails, the (prefixed) url is recorded in ``page2`` so the
    broken pages can be listed afterwards.
    """
    if not url.startswith('http'):
        url = 'http://m.sohu.com' + url
    try:
        return urllib.urlopen(url)
    except Exception as err:
        # Catch Exception rather than using a bare except, so that Ctrl-C and
        # SystemExit still stop the crawl; any fetch error marks the page bad.
        print('Error1..we want.......get err: %s (%r)' % (url, err))
        page2.add(url)
        return None
# Recursively collect the links reachable from a page, skipping duplicates.
def getInlink(url):
    """Crawl *url* and every not-yet-seen site-relative link found on it.

    For each anchor tag with a non-empty ``href``:
      * an href starting with '/' that is not in ``page0`` is added to
        ``page0`` and crawled recursively;
      * an href starting with 'http' that is not in ``page1`` is added to
        ``page1`` (it is recorded but not followed).
    Pages that cannot be opened are skipped (myurlopen returns None for them).

    NOTE: every newly discovered path nests one more call, so a long chain of
    new pages can exceed Python's recursion limit.
    """
    html = myurlopen(url)
    if html is None:
        return
    try:
        # Explicit parser so the result does not depend on what is installed.
        bsobj = BeautifulSoup(html, 'html.parser')
    finally:
        html.close()

    print('*' * 20)
    for link in bsobj.find_all('a'):
        newpage = link.get('href')
        if not newpage:
            # Missing or empty href: nothing to classify.
            continue
        # startswith() is a value comparison and is safe on any string,
        # unlike indexing [0] and testing identity against '/'.
        if newpage.startswith('/') and newpage not in page0:
            page0.add(newpage)
            print('get a new inpage: %s cur page0: %d' % (newpage, len(page0)))
            getInlink(newpage)
        elif newpage.startswith('http') and newpage not in page1:
            page1.add(newpage)
            print('get a new http: %s cur page1: %d' % (newpage, len(page1)))
    print('+' * 20)
def showpage(page, flag=True):
    """Print the size of *page* between two banner lines.

    page: a sized iterable of site-relative paths (strings).
    flag: when False, every entry is also printed, prefixed with
          'http://m.sohu.com'; when True (the default) only the two
          banner lines with the entry count are printed.
    """
    # Built once: the collection is not modified between the two banners.
    # Joining by hand keeps the output identical under Python 2 and 3.
    banner = ' '.join(['-' * 10, str(len(page)), '-' * 10])
    print(banner)
    if not flag:
        for path in page:
            print('http://m.sohu.com' + path)
    # Closing banner is printed regardless of flag.
    print(banner)
def show(*args):
    """Print the count banners for each collection given, in order."""
    for collection in args:
        # Summary only: entries themselves are not listed.
        showpage(collection, flag=True)
def getlinks():
    """Crawl the site starting from one path already collected in ``page0``.

    Prints the chosen path, shows the sizes of ``page0`` and ``page1``,
    crawls from that path with getInlink, then shows the sizes again.
    Does nothing (beyond a notice) when ``page0`` is empty.
    """
    if not page0:
        # set.pop() would raise KeyError here, e.g. when the home page
        # yielded no site-relative links.
        print('page0 is empty, nothing to crawl')
        return

    # Peek instead of pop: the seed stays in page0, so getInlink's
    # "not in page0" check will not crawl it a second time.
    tmplink = next(iter(page0))
    print(tmplink)

    show(page0, page1)
    getInlink('http://m.sohu.com' + tmplink)
    show(page0, page1)
def main():
    """Run the crawl: seed the link sets from the home page, then follow them."""
    for step in (basegetlink, getlinks):
        step()
# Start the crawl only when this file is executed as a script, not on import.
# (The dunder names are required: a bare `name` is undefined.)
if __name__ == '__main__':
    main()