#!/usr/bin/env python # -*- coding:utf-8 -*- ''' Created on 2013-3-21 @author: markGao ''' from sgmllib import SGMLParser import os import urllib import urllib2 import urlparse from Queue import Queue from threading import Thread save_path = 'D:\workspace\LearnPython\src\sample\image' passUrls = set() qimg = Queue() class URLLister(SGMLParser): def reset(self): SGMLParser.reset(self) self.urls = [] self.imgs = [] def start_a(self, attrs): href = [ v for k, v in attrs if k == "href" and v.startswith("http")] if href: self.urls.extend(href) def start_img(self, attrs): src = [ v for k, v in attrs if k == "src" and v.startswith("http") ] if src: self.imgs.extend(src) def get_url_of_page(url, if_img=False): ''' 獲取一個頁面上的全部連接。 if_img:若是爲true,則獲取的是頁面上的全部圖片的連接 ''' urls = [] try: f = urllib2.urlopen(url, timeout=3).read() url_listen = URLLister() url_listen.feed(f) if if_img: urls.extend(url_listen.imgs) else: urls.extend(url_listen.urls) except urllib2.URLError, e: print e return urls def get_page_html(begin_url, depth, main_site_domain): ''' 遞歸處理頁面 ''' if depth <= 0: return print 'handle ' + begin_url passUrls.add(begin_url) #=========================================================================== # 讀取本頁面上的圖片 #=========================================================================== urls = get_url_of_page(begin_url, True) #=========================================================================== # 添加圖片到queue #=========================================================================== for murl in urls: firstindex = murl.find('?') if firstindex != -1: print firstindex murl = murl[:firstindex] print 'add img url:' + murl qimg.put(murl) #=============================================================================== # 讀取本頁面上的連接 #=============================================================================== urls = get_url_of_page(begin_url) if urls: for murl in urls: if not murl in passUrls: get_page_html(murl, depth - 1, main_site_domain) class DPThread(Thread): ''' 
下載線程 ''' def run3(self): while True: filename = qimg.get() filename = filename.split("/")[-1] #dist = os.path.join(save_path, filename) dist = save_path + '/' + filename print dist print 'try connecting ' + filename if filename.endswith('jpg') or filename.endswith('png') or filename.endswith('gif') or filename.endswith('bmp') or filename.endswith('jpeg'): print ' downloading ' + filename dist = dist.replace('\\', '/') urllib.urlretrieve(filename, dist, None) print " Done: ", filename qimg.task_done() def run(self): while True: murl = qimg.get() print 'downloading ' + murl filename = murl.split("/")[-2] + "_" + murl.split("/")[-1] urlopen = urllib.URLopener() try: fp = urlopen.open(murl) data = fp.read() fp.close() f = open(save_path + "/" + filename, 'w+b') f.write(data) f.close() except IOError: print "download error!" + url qimg.task_done() if __name__ == "__main__": #=========================================================================== # 抓取圖片首個頁面 #=========================================================================== url = "http://www.yupoo.com/explore/interesting/7days/" #url='http://bringgooglereaderback.com/' #=========================================================================== # 圖片保存路徑 #=========================================================================== if not os.path.exists(save_path): os.mkdir(save_path) #=========================================================================== # 遍歷深度 #=========================================================================== max_depth = 1 main_site_domain = urlparse.urlsplit(url).netloc get_page_html(url, max_depth, main_site_domain) for i in range(1): t = DPThread() t.setDaemon(True) t.start() qimg.join() print 'end'
上面代碼中有一個run方法和一個run3方法。
run3方法中使用urllib的urlretrieve來讀取圖片。發現這樣速度很慢。因此直接在run方法中用了urllib的URLopener來打開圖片地址並讀取數據直接寫到本地磁盤的文件中。