Python crawler

A few days ago, while learning Python multithreading, I wrote a crawler that can fetch some articles from Huxiu. My original plan was to crawl the articles first and then build a classifier to categorize and display them, but work has been busy and there are new things to learn, so the classifier will have to wait. Noting the code down here as a reminder.

# !/usr/bin/env python
# -*- coding:utf-8 -*-

import Queue
import threading
import time
import re
from BeautifulSoup import BeautifulSoup
from Dbpool import DBTools
import urllib2
import logging
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

sBaseUrl = 'http://www.huxiu.com/focus/'
sMainUrl = 'http://www.huxiu.com'
pool = DBTools('localhost', 'root', '1q2w3e4r', 'selfblog')
logger = logging.getLogger(__name__)


class WorkManager(object):
    def __init__(self, work_num=1000, thread_num=6):
        self.work_queue = Queue.Queue()
        self.threads = []
        self.__init_work_queue(work_num)
        self.__init_thread_pool(thread_num)

    # Initialize the thread pool
    def __init_thread_pool(self, thread_num):
        for i in range(thread_num):
            self.threads.append(Spider(self.work_queue))

    # Initialize the work queue with one job per list page
    def __init_work_queue(self, jobs_num):
        for i in range(jobs_num):
            page_url = '%s%s%s' % (sBaseUrl, i, '.html')
            self.add_job(page_url)

    # Enqueue a single job
    def add_job(self, args):
        self.work_queue.put(args)  # Queue handles the locking internally

    # Check how many tasks remain in the queue
    def check_queue(self):
        return self.work_queue.qsize()

    # Wait until all threads have finished
    def wait_allcomplete(self):
        for item in self.threads:
            if item.isAlive():
                item.join()


class Spider(threading.Thread):
    def __init__(self, work_queue):
        threading.Thread.__init__(self)
        self.work_queue = work_queue
        self.start()

    def run(self):
        # Loop forever; the thread exits once the queue is drained
        while True:
            try:
                page_url = self.work_queue.get(block=False)  # non-blocking dequeue; Queue handles the locking internally
                print 'thread:[%s], url:%s' % (self.name, page_url)
                logger.info(u'thread:[%s], url:%s' % (self.name, page_url))
                self.doAnalize(page_url)
                self.work_queue.task_done()  # tell the queue this task is finished
            except Exception, e:
                print str(e)
                break

    def doAnalize(self, page_url):
        try:
            content = urllib2.urlopen(page_url).read()
        except IOError, e:
            print(e)
            return

        # Match the article links on each list page
        re_url = re.compile(r'(/article/[0-9]{2,5}/1.html)')
        urls = re_url.findall(content)
        for aurl in set(urls):
            news_url = '%s%s' % (sMainUrl, aurl)
            #print '--', news_url
            try:
                # Fetch the article HTML and replace ' with " so the SQL string stays intact
                news_content = urllib2.urlopen(news_url).read().replace("\'", "\"")
                # Parse the HTML with BeautifulSoup to extract the article title
                soup = BeautifulSoup(news_content)
                title = soup.findAll("h1")[1].text.strip()

                sql = 'INSERT INTO web_news(title, content_html, ref_url) VALUES (\'%s\', \'%s\', \'%s\')' % (title, news_content, news_url)
                pool.operateDB('insert', sql)
            except IOError:
                continue


if __name__ == '__main__':
    start = time.time()
    work_manager = WorkManager(10, 5)
    work_manager.wait_allcomplete()
    end = time.time()
    print "cost all time: %s" % (end - start)
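The Dbpool module imported above is not included in this post. As a rough idea of the interface the crawler relies on, below is a minimal sketch of a DBTools class backed by MySQLdb. The class name, constructor arguments, and operateDB method only mirror how the crawler calls them; the real module is not shown here and, as its name suggests, presumably pools connections rather than holding a single one.

# Sketch only: the interface the crawler assumes from Dbpool.DBTools, not the original module.
import MySQLdb

class DBTools(object):
    def __init__(self, host, user, passwd, db):
        # One plain connection for illustration; the real Dbpool likely manages a pool.
        self.conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8')

    def operateDB(self, op, sql):
        # Execute one statement and commit writes such as the crawler's INSERTs.
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql)
            if op != 'select':
                self.conn.commit()
            return cursor.fetchall()
        finally:
            cursor.close()

Note that the crawler builds its INSERT statement by string formatting, which is why it has to rewrite quotes in the page HTML; passing the values as parameters via cursor.execute(sql, params) would avoid that and be safer against malformed content.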