福利,三俗,喜聞樂見
第三天,無阻塞版本的實現,在使用線程池的基礎上,添加了一些控制代碼,爲未來的控制檯作準備,另外,禁掉了控制檯的進度報告,速度飛快就上去了。
接下來要轉到文章的另外一部分,websocket上面去。
# -*- coding: utf8 -*-
"""Non-blocking picture crawler.

Scrapes a QQ games gallery index page, follows each album's
``.hdBigPic.js`` metadata file to collect picture URLs, and downloads the
pictures through a thread pool while keeping at most 4 downloads in
flight. Console progress reporting is deliberately disabled for speed.
"""
import concurrent.futures
import urllib.request
import re
import json
import ast
import os

# url -> assigned ID; shared de-duplication table for pages and pictures
visitedURL = {}

maxpageID = [0]   # single-cell mutable counter for page IDs
pageDict = {}     # pageID -> {'url': ..., 'links': ...}


def createPageID(url):
    """Allocate the next 'page_N' ID and record *url* as visited."""
    pID = maxpageID.pop() + 1
    maxpageID.append(pID)
    pID = 'page_' + str(pID)
    visitedURL[url] = pID
    return pID


maxpicID = [0]    # single-cell mutable counter for picture IDs
picDict = {}      # picID -> per-download bookkeeping dict


def createPicID(url):
    """Allocate the next 'pic_N' ID and record *url* as visited."""
    pID = maxpicID.pop() + 1
    maxpicID.append(pID)
    pID = 'pic_' + str(pID)
    visitedURL[url] = pID
    return pID


stoppedQueue = []      # picIDs paused by a (future) console
waitingQueue = []      # picIDs queued for download
downloadingQueue = []  # picIDs currently in flight
errorQueue = []        # picIDs whose download failed
savedDict = dict()     # picID -> True once fully saved

# for page downloading
pageTpe = concurrent.futures.ThreadPoolExecutor(max_workers=8)
# for picture downloading
picTpe = concurrent.futures.ThreadPoolExecutor(max_workers=4)


def runMachine():
    """Drain waitingQueue through the picture pool and report failures.

    Keeps at most 4 downloads in flight at a time; on any download error
    the partial output file is closed and removed and the picID is moved
    to errorQueue.
    """
    totalpics = len(waitingQueue)
    futures = {}
    with picTpe as executor:
        while waitingQueue:
            # top the pool up to 4 concurrent downloads
            while len(downloadingQueue) < 4:
                if waitingQueue:
                    picID = waitingQueue.pop(0)
                    futures[executor.submit(processload, picID)] = picID
                else:
                    break
            if futures:
                for future in concurrent.futures.as_completed(futures):
                    picID = futures[future]
                    # Drop the finished future; the original never removed
                    # completed entries, so every as_completed() pass
                    # re-yielded already-done futures.
                    del futures[future]
                    try:
                        future.result()
                        break
                    except Exception as e:
                        # the worker may or may not have removed itself
                        if picID in downloadingQueue:
                            downloadingQueue.remove(picID)
                        print('error happens', e)
                        errorQueue.append(picID)
                        # close and delete the partial file, if any
                        try:
                            picInfo = picDict[picID]
                            destructPicInfo(picID)
                            os.remove(picInfo['filepath'])
                        except (OSError, KeyError):
                            pass
    print('total pic(s):{} loaded pic(s):{}'.format(totalpics, len(savedDict.keys())))
    if errorQueue:
        print('files below are not downloaded properly:')
        for picID in errorQueue:
            print(picDict[picID]['url'])


def destructPicInfo(picID):
    """Close and forget the open output file handle for *picID*, if any."""
    if picID in picDict:
        picInfo = picDict[picID]
        if 'outputfile' in picInfo:
            outputfile = picInfo['outputfile']
            outputfile.close()
            del picInfo['outputfile']


def processload(picID):
    """Download one picture; runs on a worker thread of picTpe.

    Streams the response in 4 KiB chunks so a download can be paused: if
    a console moves *picID* out of downloadingQueue mid-transfer, the open
    connection is stashed in picInfo['conn'] for a later resume.

    Raises on network errors or non-image responses; runMachine() is
    responsible for cleanup in that case.
    """
    downloadingQueue.append(picID)
    picInfo = picDict[picID]
    url = picInfo['url']
    filename = url.split('/')[-1]
    directory = 'pics/'
    filepath = directory + filename
    picInfo['filepath'] = filepath
    if not os.path.exists(directory):
        os.makedirs(directory)
    outputfile = open(filepath, 'wb')
    picInfo['outputfile'] = outputfile
    picInfo['progress'] = 0
    conn = urllib.request.urlopen(url, timeout=10)
    # Content-Length may be missing; fall back to 0 (unknown size)
    picInfo['total'] = int((conn.info().get('Content-Length') or '0').strip())
    _type = conn.info().get('Content-Type')
    if _type is None or _type.find('image') < 0:
        # The original fell through here and kept writing to a closed
        # file while leaving picID stuck in downloadingQueue; fail fast
        # instead and let runMachine() clean up.
        conn.close()
        raise ValueError('not an image: {}'.format(_type))
    updateStatus(picInfo)
    while True:
        chunk = conn.read(4096)
        picInfo['progress'] += len(chunk)
        updateStatus(picInfo)
        if not chunk:
            # finished: state 2 == downloaded
            picInfo['state'] = 2
            downloadingQueue.remove(picID)
            savedDict[picID] = True
            updateStatus(picInfo)
            destructPicInfo(picID)
            conn.close()
            break
        outputfile.write(chunk)
        if picID not in downloadingQueue:
            # paused/stopped externally: keep the connection for resume
            if picID in stoppedQueue or picID in waitingQueue:
                picInfo['conn'] = conn
            break


def updateStatus(picInfo):
    """Progress-reporting hook; disabled (early return) for speed."""
    return
    url = picInfo['url']
    if picInfo['state'] == 2:
        print(url, 'finished!')
    elif picInfo['total'] and picInfo['progress']:
        print('{} progress: {:.2%}'.format(url, (picInfo['progress'] / picInfo['total'])))


def log(*args):
    """Append *args* as one comma-joined UTF-8 line to t.txt."""
    # 'with' guarantees the handle is closed even if the write fails
    with open('t.txt', 'ba') as f:
        f.write((','.join(map(str, args)) + '\n').encode('utf-8'))


def load_pic(url, pageID):
    """Queue a picture URL for download (deduplicated via visitedURL)."""
    if url in visitedURL:
        return
    picID = createPicID(url)
    # state: 0 = not started, 1 = queued, 2 = downloaded
    picDict[picID] = {'url': url, 'pageID': pageID, 'total': 0, 'progress': 0, 'state': 1}
    waitingQueue.append(picID)


def load_page(url):
    """Scrape the gallery index page and download every picture found.

    Extracts album links from the news-list section, fetches each album's
    .hdBigPic.js metadata, queues the picture URLs via load_pic(), then
    runs the download machine.
    """
    if url in visitedURL:
        return
    pID = createPageID(url)
    pageDict[pID] = {'url': url, 'links': None}
    conn = urllib.request.urlopen(url)
    # the site serves GBK; decode once (the original round-tripped the
    # text through utf-8 for no effect)
    text = conn.read().decode('GBK')
    conn.close()
    try:
        startIndex = text.index('<div class="mod newslist clear">')
        endIndex = text.index('<div class="mod curPosition clear">', startIndex)
        text = text[startIndex:endIndex]
        # dot escaped: the original '.htm' matched ANY character before 'htm'
        patt = re.compile(r'href="([^"]+?)\.htm"><img', re.DOTALL | re.IGNORECASE)
        jsurls = [x + '.hdBigPic.js' for x in patt.findall(text)]
        pageurllist = []
        for jsurl in jsurls:
            if jsurl in visitedURL:
                continue
            jsID = createPageID(jsurl)
            pageDict[jsID] = {'url': jsurl, 'links': None}
            jslinks = []
            try:
                conn = urllib.request.urlopen(jsurl)
            except Exception:
                print('failed')
                continue
            try:
                text = conn.read().decode('GBK')
                # the js payload ends with a version-marker comment
                text = text[:text.index('/* |xGv00|')]
                obj = ast.literal_eval(text)
                picsobj = obj['Children'][0]['Children'][1]['Children']
                for x in picsobj:
                    picurl = x['Children'][2]['Children'][0]['Content']
                    jslinks.append(picurl)
                if jslinks:
                    pageDict[jsID]['links'] = jslinks
                    print(jsurl, '{} pics'.format(len(jslinks)))
                    try:
                        title = obj['Children'][0]['Children'][8]['Children'][0]['Content']
                    except Exception:
                        title = 'unknown'
                    pageurllist.append(jsurl)
                    for picurl in jslinks:
                        load_pic(picurl, jsID)
            except Exception as e:
                print(jsurl, 'failed')
                raise e
            finally:
                # the original leaked the js connection on every path
                conn.close()
        pageDict[pID]['links'] = pageurllist
    except ValueError as e:
        print('error', e)
        # can't find proper place
        pass
    runMachine()


urls = ['http://games.qq.com/l/photo/gmcos/yxcos.htm']
load_page(urls[0])