封裝代碼,代碼重用python
# -*- coding: UTF-8 -*-
"""Extract and print the link targets found in a web page.

The helpers are written to run unchanged on both Python 2 and Python 3.
"""

from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

LINK_MARKER = '<a href='


# Procedures
def get_next_target(page, start=0):
    """Find the next double-quoted link target in *page*.

    Args:
        page: HTML source text to scan.
        start: Index at which to begin scanning (defaults to the beginning).

    Returns:
        A ``(url, end_quote)`` tuple, where ``end_quote`` is the index in
        *page* of the closing quote of the target.  ``(None, 0)`` is
        returned when there is no further ``<a href=`` marker, or when the
        marker is not followed by a complete pair of double quotes.
    """
    start_link = page.find(LINK_MARKER, start)
    if start_link == -1:  # no more links
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:  # marker without an opening quote
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:  # opening quote is never closed
        return None, 0
    return page[start_quote + 1:end_quote], end_quote


# Loop
def print_all_links(page):
    """Print every non-empty link target in *page*, one per line.

    Example:
        >>> print_all_links('this <a href="test1">link 1</a> is <a href="test2"link 2</a> a <a href="test3">link3</a>')
        test1
        test2
        test3
    """
    position = 0
    while True:
        url, end_quote = get_next_target(page, position)
        # Compare against None explicitly: an empty href="" must not be
        # mistaken for "no more links" and end the scan early.
        if url is None:
            break
        if url:
            print(url)
        # Advance an offset instead of re-slicing the page, so the rest of
        # the document is not copied once per link.
        position = end_quote + 1


# Fetch the source of a web page
def get_page(url):
    """Return the text of the page at *url*, or ``''`` if it cannot be fetched.

    Fetching is deliberately best-effort: any ordinary error (bad URL,
    network failure, HTTP error) yields an empty string.  KeyboardInterrupt
    and SystemExit are not swallowed.
    """
    try:
        # closing() is used because the Python 2 response object is not a
        # context manager on its own.
        with closing(urlopen(url)) as response:
            data = response.read()
    except Exception:
        return ''
    if not isinstance(data, str):
        # Python 3 returns bytes; the link scanner works on text.
        data = data.decode('utf-8', 'replace')
    return data


def main():
    """Print all links found on the sample page."""
    # Another page to try: 'https://www.baidu.com/'
    print_all_links(get_page('http://xkcd.com/353/'))


if __name__ == '__main__':
    main()