Programs today usually run into two kinds of I/O: disk I/O and network I/O. Here I look at the network I/O case and compare the efficiency of processes, threads, and coroutines under Python 3. Processes use a multiprocessing.Pool process pool, threads use a thread pool I wrote myself, and coroutines use the gevent library. Python 3's built-in urllib.request is compared against the open-source requests library. The code is as follows:
```python
import urllib.request
import requests
import time
import multiprocessing
import threading
import queue

def startTimer():
    return time.time()

def ticT(startTime):
    useTime = time.time() - startTime
    return round(useTime, 3)


def download_urllib(url):
    req = urllib.request.Request(url,
                                 headers={'user-agent': 'Mozilla/5.0'})
    res = urllib.request.urlopen(req)
    data = res.read()
    try:
        data = data.decode('gbk')
    except UnicodeDecodeError:
        data = data.decode('utf8', 'ignore')
    return res.status, data

def download_requests(url):
    req = requests.get(url,
                       headers={'user-agent': 'Mozilla/5.0'})
    return req.status_code, req.text

class threadPoolManager:
    def __init__(self, urls, workNum=10000, threadNum=20):
        self.workQueue = queue.Queue()
        self.threadPool = []
        self.__initWorkQueue(urls)
        self.__initThreadPool(threadNum)

    def __initWorkQueue(self, urls):
        for i in urls:
            self.workQueue.put((download_requests, i))

    def __initThreadPool(self, threadNum):
        for i in range(threadNum):
            self.threadPool.append(work(self.workQueue))

    def waitAllComplete(self):
        for i in self.threadPool:
            if i.is_alive():
                i.join()

class work(threading.Thread):
    def __init__(self, workQueue):
        threading.Thread.__init__(self)
        self.workQueue = workQueue
        self.start()

    def run(self):
        while True:
            try:
                # Grab the next (function, argument) pair without blocking;
                # an empty queue means this worker is done.
                do, args = self.workQueue.get(block=False)
            except queue.Empty:
                break
            do(args)
            self.workQueue.task_done()

urls = ['http://www.ustchacker.com'] * 10
urllibL = []
requestsL = []
multiPool = []
threadPool = []
N = 20
PoolNum = 100

for i in range(N):
    print('start %d try' % i)

    # 1. plain urllib.request, sequential
    urllibT = startTimer()
    jobs = [download_urllib(url) for url in urls]
    urllibL.append(ticT(urllibT))
    print('1')

    # 2. plain requests, sequential
    requestsT = startTimer()
    jobs = [download_requests(url) for url in urls]
    requestsL.append(ticT(requestsT))
    print('2')

    # 3. requests driven by a multiprocessing pool
    #    (pool creation is deliberately included in the timing)
    requestsT = startTimer()
    pool = multiprocessing.Pool(PoolNum)
    data = pool.map(download_requests, urls)
    pool.close()
    pool.join()
    multiPool.append(ticT(requestsT))
    print('3')

    # 4. requests driven by the hand-rolled thread pool
    requestsT = startTimer()
    pool = threadPoolManager(urls, threadNum=PoolNum)
    pool.waitAllComplete()
    threadPool.append(ticT(requestsT))
    print('4')

import matplotlib.pyplot as plt
x = list(range(1, N + 1))
plt.plot(x, urllibL, label='urllib')
plt.plot(x, requestsL, label='requests')
plt.plot(x, multiPool, label='requests MultiPool')
plt.plot(x, threadPool, label='requests threadPool')
plt.xlabel('test number')
plt.ylabel('time(s)')
plt.legend()
plt.show()
```
The results are as follows:
As the figure above shows, Python 3's built-in urllib.request is still less efficient than the open-source requests library. The multiprocessing pool gives a clear speed-up, but it remains slower than my hand-rolled thread pool; part of the reason is that creating and scheduling processes costs more than creating threads (the test program deliberately includes that creation cost in the timing).
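To see how much of the gap really comes from pool construction, the timing can be split into a creation phase and a work phase. A minimal sketch, reusing `download_requests` and `urls` from the listing above; the split itself and the `pool_size` default are just for illustration:

```python
import time
import multiprocessing

def time_pool_phases(urls, pool_size=100):
    """Time process-pool creation separately from the actual downloads."""
    t0 = time.time()
    pool = multiprocessing.Pool(pool_size)   # worker processes start here
    created = time.time() - t0

    t1 = time.time()
    pool.map(download_requests, urls)        # the actual network work
    pool.close()
    pool.join()
    worked = time.time() - t1
    return round(created, 3), round(worked, 3)
```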
Below is the gevent test code:
```python
import urllib.request
import requests
import time
import gevent.pool
import gevent.monkey

# Patch the standard library so blocking sockets become cooperative.
gevent.monkey.patch_all()

def startTimer():
    return time.time()

def ticT(startTime):
    useTime = time.time() - startTime
    return round(useTime, 3)


def download_urllib(url):
    req = urllib.request.Request(url,
                                 headers={'user-agent': 'Mozilla/5.0'})
    res = urllib.request.urlopen(req)
    data = res.read()
    try:
        data = data.decode('gbk')
    except UnicodeDecodeError:
        data = data.decode('utf8', 'ignore')
    return res.status, data

def download_requests(url):
    req = requests.get(url,
                       headers={'user-agent': 'Mozilla/5.0'})
    return req.status_code, req.text

urls = ['http://www.ustchacker.com'] * 10
urllibL = []
requestsL = []
reqPool = []
reqSpawn = []
N = 20
PoolNum = 100

for i in range(N):
    print('start %d try' % i)

    # 1. plain urllib.request, sequential
    urllibT = startTimer()
    jobs = [download_urllib(url) for url in urls]
    urllibL.append(ticT(urllibT))
    print('1')

    # 2. plain requests, sequential
    requestsT = startTimer()
    jobs = [download_requests(url) for url in urls]
    requestsL.append(ticT(requestsT))
    print('2')

    # 3. requests driven by a gevent pool
    requestsT = startTimer()
    pool = gevent.pool.Pool(PoolNum)
    data = pool.map(download_requests, urls)
    reqPool.append(ticT(requestsT))
    print('3')

    # 4. requests driven by bare gevent.spawn
    requestsT = startTimer()
    jobs = [gevent.spawn(download_requests, url) for url in urls]
    gevent.joinall(jobs)
    reqSpawn.append(ticT(requestsT))
    print('4')

import matplotlib.pyplot as plt
x = list(range(1, N + 1))
plt.plot(x, urllibL, label='urllib')
plt.plot(x, requestsL, label='requests')
plt.plot(x, reqPool, label='requests geventPool')
plt.plot(x, reqSpawn, label='requests Spawn')
plt.xlabel('test number')
plt.ylabel('time(s)')
plt.legend()
plt.show()
```
The results are as follows:
As the figure above shows, gevent gives a large performance boost for I/O-bound tasks. Because creating and scheduling a coroutine is far cheaper than a thread, there is little difference between gevent's Spawn mode and its Pool mode.
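To get a feel for just how cheap greenlets are, you can time the creation and scheduling of a batch of no-op greenlets. A minimal sketch; the count of 10000 is an arbitrary choice:

```python
import time
import gevent

def greenlet_overhead(n=10000):
    """Rough timing of creating, scheduling, and joining n no-op greenlets."""
    t0 = time.time()
    jobs = [gevent.spawn(lambda: None) for _ in range(n)]
    gevent.joinall(jobs)
    return round(time.time() - t0, 3)

print(greenlet_overhead())   # typically a small fraction of a second
```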
Note that gevent relies on monkey patching, which is what gives it its performance here, but full patching interferes with how multiprocessing runs. If you want to use both in the same program, you need the following:
```python
gevent.monkey.patch_all(thread=False, socket=False, select=False)
```
But this keeps gevent from playing to its full strength, so the multiprocessing Pool, the thread pool, and the gevent Pool cannot be compared within a single program. Comparing the two figures, however, the conclusion is that the thread pool and gevent perform best, followed by the process pool. As a bonus conclusion, the requests library performs somewhat better than urllib.request :-)
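If you do want all three pool variants measured in one go, one possible workaround is to run each benchmark in its own interpreter, so the monkey patch never reaches the process that drives multiprocessing. A minimal sketch; the script names below are hypothetical placeholders:

```python
import subprocess
import sys

# Run each benchmark in a separate interpreter so gevent's monkey patching
# stays isolated from the multiprocessing-based run.
for script in ('bench_multiprocessing.py', 'bench_threadpool.py', 'bench_gevent.py'):
    subprocess.run([sys.executable, script], check=True)
```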
When reposting, please credit the source: http://blog.csdn.net/littlethunder/article/details/40983031