貼吧的老歷史想存下來,強行用python爬取一波
隊友太強,躺好別動,偶爾作點副業
僞裝會python正則表達式
基本流程:
1.爬取頁面,獲取頁面的html源碼
2.分析源碼,經過正則表達式匹配到想要的內容
3.去掉不須要的內容(如多餘的html標籤)
python中 re模塊 提供對正則表達式的支持
1 # -*- coding:utf-8 -*- 2 import urllib2 3 import urllib 4 import re 5 class Tool: 6 #去除img標籤,7位長空格 7 removeImg = re.compile('<img.*?>| {7}|') 8 #刪除超連接標籤 9 removeAddr = re.compile('<a.*?>|</a>') 10 #把換行的標籤換爲\n 11 replaceLine = re.compile('<tr>|<div>|</div>|</p>') 12 #將表格製表<td>替換爲\t 13 replaceTD= re.compile('<td>') 14 #把段落開頭換爲\n加空兩格 15 replacePara = re.compile('<p.*?>') 16 #將換行符或雙換行符替換爲\n 17 replaceBR = re.compile('<br><br>|<br>') 18 #將其他標籤剔除 19 removeExtraTag = re.compile('<.*?>') 20 def replace(self,x): 21 x = re.sub(self.removeImg,"",x) 22 x = re.sub(self.removeAddr,"",x) 23 x = re.sub(self.replaceLine,"\n",x) 24 x = re.sub(self.replaceTD,"\t",x) 25 x = re.sub(self.replacePara,"\n ",x) 26 x = re.sub(self.replaceBR,"\n",x) 27 x = re.sub(self.removeExtraTag,"",x) 28 #strip()將先後多餘內容刪除 29 return x.strip() 30 class BDTB: 31 32 def __init__(self,baseUrl,seeLZ): 33 self.baseURL =baseUrl 34 self.seeLZ ='?see_lz'+str(seeLZ) 35 self.tool=Tool() 36 def getPage(self,pageNum):#抓取網頁 第幾頁 37 try: 38 url=self.baseURL+self.seeLZ+'&pn='+str(pageNum) 39 request = urllib2.Request(url) 40 response =urllib2.urlopen(request) 41 #print response.read() 42 return response.read().decode('utf-8') 43 except urllib2.URLError,e: 44 if hasattr(e,"reason"): 45 print u"fail",e.reason 46 return None 47 def getTitle(self):#獲取標題 48 page =self.getPage(1) 49 pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>',re.S) 50 result =re.search(pattern,page) 51 #print result 52 if result: 53 print result.group(1) 54 return result.group(1).strip() 55 else : 56 return None 57 def getContent(self):#獲取正文 58 page=self.getPage(1) 59 pattern =re.compile('<div id="post_content_.*?>(.*?)</div>',re.S) 60 items =re.findall(pattern,page) 61 for i in range(1,20):#前19個 62 print self.tool.replace(items[i]) 63 64 baseURL = 'http://tieba.baidu.com/p/5285925491'#任意一個網頁的url 65 bdtb=BDTB(baseURL,1) 66 #bdtb.getTitle() 67 bdtb.getContent()