"""Code template for a pooled crawler.

Skeleton with four pieces to fill in: a task producer (``url_list``), a
task worker (``get_page``), a factory that switches between a process
pool and a thread pool (``get_pool``), and an entry point that starts the
pool (``open_pool``).
"""
from multiprocessing import Pool as processPoll
from multiprocessing.dummy import Pool as ThreadPool


def get_page():
    """Execute one task (stub, to be filled in)."""
    pass


def url_list():
    """Build the collection of tasks (stub, to be filled in)."""
    pass


def get_pool():
    """Create the process pool or thread pool (stub, to be filled in)."""
    pass


def open_pool():
    """Start the pool and run the tasks (stub, to be filled in)."""
    pass


if __name__ == '__main__':
    open_pool()
使用 16 線程爬取騰訊招聘的 100 頁分頁信息,用時 6 秒左右(3M 網速)。
"""Crawl paginated listing pages with a process pool or a thread pool.

Contains a task producer (``url_list``), a task worker (``get_page``), a
factory that switches between processes and threads (``get_pool``), and an
entry point that runs and times the whole crawl (``open_pool``).
"""
import ssl
from datetime import datetime
from multiprocessing import Pool as ProcessPoll  # alias spelling kept as-is
from multiprocessing.dummy import Pool as ThreadPool
from urllib import request

try:
    # Not used by the urllib-based worker below. The name is kept available,
    # but a missing installation no longer prevents the script from running.
    import requests
except ImportError:
    requests = None

# SECURITY: this swaps the default HTTPS context factory for the whole
# process, so urllib stops verifying TLS certificates on every HTTPS request
# made after import. Kept because the script relied on it; remove it once the
# target site's certificate chain validates normally.
ssl._create_default_https_context = ssl._create_unverified_context

BASE_URL = 'http://hr.tencent.com/position.php?start={}'
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}


def get_page(task_q, timeout=30):
    """Fetch one page and print the URL the response came from.

    Args:
        task_q: URL of the page to request.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block a worker indefinitely.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urlopen``).
    """
    req = request.Request(task_q, headers=DEFAULT_HEADERS)
    # The context manager closes the connection even if printing fails.
    with request.urlopen(req, timeout=timeout) as response:
        print(response.url)


def url_list(pages=100, page_size=10, base_url=BASE_URL):
    """Build the list of page URLs to crawl.

    Args:
        pages: Number of listing pages to generate.
        page_size: Amount the ``start`` offset advances per page.
        base_url: Format string with one ``{}`` placeholder for the offset.

    Returns:
        A list of ``pages`` URLs with offsets ``0, page_size, 2 * page_size``
        and so on; empty when ``pages`` is zero or negative.
    """
    return [base_url.format(page * page_size) for page in range(pages)]


def get_pool(way=True, count=4):
    """Create a worker pool.

    Args:
        way: Truthy for a process pool, falsy for a thread pool.
        count: Number of workers in the pool.

    Returns:
        The newly created pool; the caller is responsible for closing it.
    """
    pool_cls = ProcessPoll if way else ThreadPool
    return pool_cls(count)


def open_pool(way=False, count=16):
    """Crawl every URL from ``url_list`` through a pool and print the duration.

    Args:
        way: Truthy for a process pool, falsy for a thread pool.
        count: Number of workers in the pool.
    """
    start = datetime.now()
    pool = get_pool(way=way, count=count)
    try:
        pool.map(get_page, url_list())
    finally:
        # Release the workers even when one of the requests raised.
        pool.close()
        pool.join()
    end = datetime.now()
    print('程序結束,用時', end - start)


if __name__ == '__main__':
    open_pool()