Comparing single-threaded, multi-threaded, multi-process, and coroutine crawling, using Sina military-history news as the example

A demonstration of single-threaded, multi-threaded, multi-process, and coroutine crawling in Python.

import requests, json, random
import re, threading, time
from lxml import etree

lock = threading.Lock()
semaphore = threading.Semaphore(100)   # allow at most 100 threads to crawl at once

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
count = 0

def sina(page_url):    # list page: fetch and parse one page of the news-list API
    if semaphore.acquire():
        header = {}
        header['User-Agent'] = random.choice(user_agent_list)
        header.update({
            "Host": "platform.sina.com.cn",
            })
        while True:
            content = ''
            try:
                content = requests.get(page_url, headers=header, timeout=5).content
            except Exception as e:
                print e
            if content != '':
                break

        # strip the JSONP callback wrapper to get the bare JSON payload
        jsona = re.findall('jQuery191012358189839869738_1495880348059\(([\s\S]*?"}]}})', content)[0]
        data = json.loads(jsona)   # renamed from `dict` to avoid shadowing the built-in
        for l in data['result']['data']:
            title = l['title']
            url = l['url']
            biaoqian = get_biaoqian(url)

            lock.acquire()
            global count
            count += 1
            print time.strftime('%H:%M:%S', time.localtime(time.time())), '    ', count
            print 'list page:'
            print ' title: %s\n url: %s' % (title, url)
            print 'detail page:'
            print ' biaoqian: %s \n' % biaoqian
            print '**************************************************************'
            lock.release()

        semaphore.release()

def get_biaoqian(url):    # news detail page: scrape the article's keyword tags
    header = {'User-Agent': random.choice(user_agent_list)}
    header.update({"Host": "mil.news.sina.com.cn"})

    while True:
        content = ''
        try:
            content = requests.get(url, headers=header, timeout=10).content
        except Exception:
            pass   # ignore the error and retry
        if content != '':
            break

    se = etree.HTML(content)
    biaoqian = se.xpath('//p[@class="art_keywords"]/a/text()')
    return ' '.join(biaoqian)

def single_req():    # single-threaded: fetch the 300 list pages one by one
    for i in range(1, 301):
        page_url = 'http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069' % i
        sina(page_url)
    print 'over'

def threading_req():    # multi-threaded: one thread per page, throttled by the semaphore
    threads = []
    for i in range(1, 301):
        t = threading.Thread(target=sina, args=('http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069' % i,))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    print 'over'

def multiprocessing_req():    # multi-process: a pool of 100 worker processes
    import multiprocessing
    pool = multiprocessing.Pool(100)
    #pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(sina, ['http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069' % i for i in range(1, 301)])
    pool.close()
    pool.join()
    print 'over'

def gevent_req():    # coroutines: a gevent pool of 100 greenlets
    from gevent import monkey
    from gevent.pool import Pool

    # NOTE: for reliable patching, monkey.patch_all() should really run before
    # requests/threading are imported at the top of the module
    monkey.patch_all()
    pool = Pool(100)
    pool.map(sina, ['http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069' % i for i in range(1, 301)])
    print 'over'

if __name__ == '__main__':
    single_req()                # single-threaded
    #threading_req()            # multi-threaded
    #multiprocessing_req()      # multi-process
    #gevent_req()               # coroutines
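
The regex in sina() works by stripping the JSONP wrapper, callback_name({...}), so json.loads can parse the bare payload. A slightly more general way to unwrap JSONP, sketched here as an alternative (the strip_jsonp helper is my name, not part of the original code):

    def strip_jsonp(text):
        # keep everything between the first '(' and the last ')'
        start = text.index('(') + 1
        end = text.rindex(')')
        return text[start:end]

    # usage inside sina(): jsona = strip_jsonp(content)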


This post implements the same crawler in four ways. Whether with 100 threads, 100 processes, or 100 coroutines, the network link is saturated and the crawl is fast; the single-threaded version makes poor use of the bandwidth, so it is naturally slow.
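
To put numbers on the comparison, each of the four entry functions can be timed with a small wrapper (a minimal sketch; single_req and the others are the functions defined above):

    import time

    def timed(fn):
        # run one crawl function and report its wall-clock time
        start = time.time()
        fn()
        print '%s took %.1f s' % (fn.__name__, time.time() - start)

    # e.g. timed(single_req) vs. timed(threading_req)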


This came up when I interviewed at Fangjike (房極客): the supervisor told me he had read online that Python multithreading is "fake", so he had never used it and only ever used multiprocessing, convinced that threads cannot speed up a crawler.

On this point I am quite sure that Python multithreading does speed up crawling, because I have used it for a long time. That supervisor had only read half the story: Python threads gain little on CPU-bound work, but on IO-bound work the speedup is immediate, especially on sites with long timeouts, where the advantage of multithreaded crawling is obvious. A crawl works like this: the crawler opens a page and sends a request to the server backend; the backend queries the database; the database returns data to the backend, which returns it to the frontend. Almost all of that time is spent waiting on the network, which makes it classic IO-bound work, so multithreading is fine for both crawling and performance testing. Multiprocessing, by contrast, is far too expensive: start 100 processes and Task Manager shows 100 python.exe instances, each taking about 20 MB of memory, and CPU usage spikes while they launch. Crawling is very well suited to multithreading, and coroutines work just as well.
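
A minimal sketch of why threads help IO-bound work, using time.sleep as a stand-in for network latency (not from the original post):

    import threading, time

    def fake_request():
        time.sleep(1)   # stands in for a 1-second network round trip

    start = time.time()
    threads = [threading.Thread(target=fake_request) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # ten 1-second "requests" complete in about 1 second, not 10,
    # because a thread releases the GIL while it sleeps or waits on IO
    print 'elapsed: %.1f s' % (time.time() - start)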


Here are the run results:
