A few days ago, while learning Python multithreading, I wrote a crawler that can scrape articles from Huxiu. The original plan was to build a classifier afterwards to categorize and display the articles, but work has been busy and there is new material to study, so the classifier will have to wait. This post is a note for later.
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import Queue
import threading
import time
import re
from BeautifulSoup import BeautifulSoup
from Dbpool import DBTools
import urllib2
import logging
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

sBaseUrl = 'http://www.huxiu.com/focus/'
sMainUrl = 'http://www.huxiu.com'
pool = DBTools('localhost', 'root', '1q2w3e4r', 'selfblog')
logger = logging.getLogger(__name__)


class WorkManager(object):
    def __init__(self, work_num=1000, thread_num=6):
        self.work_queue = Queue.Queue()
        self.threads = []
        self.__init_work_queue(work_num)
        self.__init_thread_pool(thread_num)

    # Start the worker threads
    def __init_thread_pool(self, thread_num):
        for i in range(thread_num):
            self.threads.append(Spider(self.work_queue))

    # Fill the work queue with one list-page URL per job
    def __init_work_queue(self, jobs_num):
        for i in range(jobs_num):
            page_url = '%s%s%s' % (sBaseUrl, i, '.html')
            self.add_job(page_url)

    # Put one job on the queue
    def add_job(self, args):
        self.work_queue.put(args)  # Queue handles the locking internally

    # Check how many jobs are left in the queue
    def check_queue(self):
        return self.work_queue.qsize()

    # Wait until every thread has finished
    def wait_allcomplete(self):
        for item in self.threads:
            if item.isAlive():
                item.join()


class Spider(threading.Thread):
    def __init__(self, work_queue):
        threading.Thread.__init__(self)
        self.work_queue = work_queue
        self.start()

    def run(self):
        # Loop until the queue is drained, then let the thread exit
        while True:
            try:
                page_url = self.work_queue.get(block=False)  # non-blocking get; Queue is thread-safe
                print 'thread:[%s], url:%s' % (self.name, page_url)
                logger.info(u'thread:[%s], url:%s' % (self.name, page_url))
                self.doAnalize(page_url)
                self.work_queue.task_done()  # tell the queue this job is done
            except Queue.Empty:
                break  # no work left, exit the thread
            except Exception, e:
                print str(e)
                break

    def doAnalize(self, page_url):
        try:
            content = urllib2.urlopen(page_url).read()
        except IOError, e:
            print e
            return

        # Match the article links on the list page
        re_url = re.compile(r'(/article/[0-9]{2,5}/1.html)')
        urls = re_url.findall(content)
        for aurl in set(urls):
            news_url = '%s%s' % (sMainUrl, aurl)
            try:
                # Fetch the article HTML; replace ' with " so it can be embedded in the SQL string
                news_content = urllib2.urlopen(news_url).read().replace("\'", "\"")
                # Parse the HTML with BeautifulSoup to get the article title
                soup = BeautifulSoup(news_content)
                title = soup.findAll("h1")[1].text.strip()

                sql = 'INSERT INTO web_news(title, content_html, ref_url) VALUES (\'%s\', \'%s\', \'%s\')' % (title, news_content, news_url)
                pool.operateDB('insert', sql)
            except IOError:
                continue


if __name__ == '__main__':
    start = time.time()
    work_manager = WorkManager(10, 5)
    work_manager.wait_allcomplete()
    end = time.time()
    print "cost all time: %s" % (end - start)
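The script imports a small database helper, Dbpool.DBTools, that is not shown above. For reference, here is a minimal sketch of what that helper might look like, assuming it simply wraps MySQLdb with the constructor arguments (host, user, password, database) and the operateDB(operation, sql) method the crawler calls; the actual module may differ.

# Hypothetical sketch of the Dbpool module assumed by the crawler (not the original implementation).
# It exposes the two calls the spider uses: DBTools(host, user, passwd, db) and operateDB(operation, sql).
# -*- coding:utf-8 -*-
import MySQLdb

class DBTools(object):
    def __init__(self, host, user, passwd, db):
        # One shared connection; a real pool would manage several
        self.conn = MySQLdb.connect(host=host, user=user, passwd=passwd,
                                    db=db, charset='utf8')

    def operateDB(self, operation, sql):
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql)
            if operation == 'select':
                return cursor.fetchall()
            self.conn.commit()  # insert/update/delete need a commit
        except MySQLdb.Error, e:
            self.conn.rollback()
            print str(e)
        finally:
            cursor.close()

Note that building the INSERT statement with string formatting, as the crawler does, breaks whenever a title or article body contains a quote character; passing the values as parameters, e.g. cursor.execute(sql, (title, news_content, news_url)), is the safer route.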