python web m.sohu.com 掃描1

1. 獲取 http://m.sohu.com 頁面中全部的 <a href='...'> 標籤;2. 遞歸調用 getInlink 函數,獲取指定 url 中的全部 <a> 標籤;3. 在調用 urllib.urlopen(url) 時,我們可以用 try ... except 來抓取異常,找到異常頁面。

抓到的異常頁面

輸入圖片說明

遞歸獲取一個頁面的 url:

# coding:utf-8

import urllib

import requests
from bs4 import BeautifulSoup

語法:string.startswith(str, beg=0, end=len(string))

或string[beg:end].startswith(str)

參數說明:

string: 被檢測的字符串

str: 指定的字符或者子字符串。(可使用元組,會逐一匹配)

beg: 設置字符串檢測的起始位置(可選)

end: 設置字符串檢測的結束位置(可選)

如果存在參數 beg 和 end,則在指定範圍內檢查,否則在整個字符串中檢查。

返回值

如果檢測到字符串,則返回 True,否則返回 False。空字符串默認返回 True。

函數解析:若是字符串string是以str開始,則返回True,不然返回False

# Module-level URL registries shared by every function in this script.
# Each must be on its own line: when they are all collapsed behind the first
# '#', the whole line is a comment and none of the sets is ever created.

# Relative hrefs (those starting with '/') found while crawling.
page0 = set()
# Absolute hrefs (those starting with 'http').
page1 = set()
# URLs under http://m.sohu.com, plus URLs that failed to open.
page2 = set()

def basegetlink():
    """Seed page0/page1 with the links found on the m.sohu.com front page.

    Every <a> tag carrying an href is inspected once: hrefs starting with
    'http' are stored in page1, hrefs starting with '/' in page0. Any other
    href (empty, 'javascript:', '#...', ...) is ignored.
    """
    html = urllib.urlopen('http://m.sohu.com')
    bsobj = BeautifulSoup(html)

    for link in bsobj.findAll('a'):
        # Read the attribute once instead of re-indexing link.attrs per branch.
        newpage = link.attrs.get('href')
        if newpage is None:
            continue
        if newpage.startswith('http'):
            page1.add(newpage)
        elif newpage.startswith('/'):
            page0.add(newpage)

def checkExlink(url):
    """Record *url* in page1 if it is an absolute ('http...') link.

    Returns True when the url was recorded, False when it was rejected.
    """
    if not url.startswith('http'):
        return False
    page1.add(url)
    # Single-argument print() emits the same line on Python 2 and Python 3.
    print('add a http: url to page1 %s' % url)
    return True

def checkInlink(url):
    """Record *url* in page2 if it lives under http://m.sohu.com.

    Returns True when the url was recorded, False when it was rejected,
    mirroring checkExlink (the success path used to fall off the end and
    return None, which callers would read as a rejection).
    """
    if not url.startswith('http://m.sohu.com'):
        return False
    page2.add(url)
    # The message used to say "page1" although the url is stored in page2.
    print('add a http: url to page2 %s' % url)
    return True

def myurlopen(url):
    """Open *url* and return the response object, or None on failure.

    A url that does not start with 'http' is treated as a path relative to
    http://m.sohu.com. When opening fails, the full url is reported and
    stored in page2 so the broken pages can be listed afterwards.
    """
    if not url.startswith('http'):
        url = 'http://m.sohu.com' + url

    try:
        html = urllib.urlopen(url)
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long crawl.
        html = None
        print('Error1..we want.......get err: %s' % url)
        # set.add is a no-op for an existing member; no membership test needed.
        page2.add(url)

    return html

def getInlink(url):
    """Recursively collect the links reachable from *url*, without duplicates.

    The page is fetched through myurlopen; if that fails nothing happens.
    For every <a href=...> on the page:

    * a relative href ('/...') not yet in page0 is added to page0 and
      crawled recursively;
    * an absolute href ('http...') not yet in page1 is added to page1 but
      not followed.

    NOTE(review): the crawl depth is limited only by the interpreter's
    recursion limit, so a long chain of new relative links may raise a
    recursion error.
    """
    html = myurlopen(url)
    if not html:
        return

    bsobj = BeautifulSoup(html)

    print('*' * 20)
    for link in bsobj.findAll('a'):
        newpage = link.attrs.get('href')
        if not newpage:
            # Missing or empty href: nothing to classify. The old code indexed
            # newpage[0] and relied on catching IndexError for this case.
            continue
        # startswith() compares by value; "newpage[0] is '/'" compared object
        # identity, which only works by accident of string interning.
        if newpage.startswith('/') and newpage not in page0:
            page0.add(newpage)
            print('get a new inpage: %s cur page0: %d' % (newpage, len(page0)))
            getInlink(newpage)
        elif newpage.startswith('http') and newpage not in page1:
            page1.add(newpage)
            print('get a new http: %s cur page1: %d' % (newpage, len(page1)))
    print('+' * 20)

def showpage(page, flag=True):
    """Print a summary of the url collection *page*.

    A banner with the number of entries is printed before and after the
    listing. The entries themselves, each prefixed with http://m.sohu.com,
    are printed only when *flag* is false; with the default flag=True only
    the two banners appear.
    """
    banner = '%s %d %s' % ('-' * 10, len(page), '-' * 10)
    print(banner)
    if not flag:
        for i in page:
            print('http://m.sohu.com' + i)
    print(banner)

def show(*args):
    """Print the banner-only summary (showpage with flag=True) of each collection."""
    # A 'for' cannot follow 'def ...:' on the same line, so the loop needs
    # its own indented line.
    for page in args:
        showpage(page, True)

def getlinks():
    """Crawl starting from one relative link taken out of page0.

    An arbitrary entry is popped from page0 and printed, then the page0/page1
    summaries are shown before and after crawling that entry with getInlink.
    """
    if not page0:
        # set.pop() raises KeyError on an empty set, e.g. when the front page
        # produced no relative links.
        print('page0 is empty, nothing to crawl')
        return

    tmplink = page0.pop()
    print(tmplink)

    show(page0, page1)
    getInlink('http://m.sohu.com' + tmplink)
    show(page0, page1)

def main():
    """Seed the link sets from the front page, then crawl from one of them."""
    basegetlink()
    getlinks()

# Run the crawler only when executed as a script. The double underscores were
# stripped from the original text, which left an undefined name `name`.
if __name__ == '__main__':
    main()

相關文章
相關標籤/搜索