Python 第二例:爬取百度貼吧的帖子,獲取帖子的標題、內容、所在樓層與發佈時間。
其中存在一個問題:當帖子是從手機端發佈時,頁面中會多出一個用於標識的 span 標籤,而它與樓層、發佈時間所用的標籤相同(都是 class="tail-info" 的 span),因此按順序取前兩個 span 時,樓層和發佈時間會被錯位抓取。
解決方法:目前想到的解決方法是通過判斷爬到的值來進行選擇,但該方案效率肯定較低,所以未使用,等知識體系豐富後再進行改進。
附爬取的代碼:
# -*- coding: utf-8 -*-
# Baidu Tieba thread scraper.
#
# Downloads every page of a thread and writes each post's text (optionally
# preceded by its floor label and publication time) to "<thread title>.txt".
#
# The code is written in the subset of Python that runs unchanged on
# Python 2.6+ and Python 3: print is always called with a single argument,
# exceptions are bound with "as", and the Python 2 only names are aliased
# below.
import io
import re
import urllib  # not used by the code below; kept from the original import list

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request exposes Request/urlopen/URLError
    import urllib.request as urllib2

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input


def _to_text(value):
    """Return *value* as text, decoding UTF-8 byte strings."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


class Tool:
    """Convert the HTML of a post body into plain text."""

    # Remove <img> tags and every run of spaces.  The original pattern
    # ('<img.*?>| {0,100}|') produced exactly the same output; its
    # empty-matching alternatives were no-ops and have been dropped.
    removeImg = re.compile(r'<img.*?>| +')
    # Remove hyperlink tags, keeping the link text.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that become a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # One to three consecutive <br> become a single paragraph break.  The
    # original alternation listed '<br><br><br>' last, so that alternative
    # could never match.
    replacePara = re.compile(r'(?:<br>){1,3}')
    # Any tag that is still left is removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with HTML markup stripped or converted to whitespace.

        The substitutions run in a fixed order: images and spaces, links,
        line-breaking tags, table cells, <br> runs, then every remaining tag.
        Leading and trailing whitespace is stripped from the result.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        # NOTE(review): the original comment says "newline plus two spaces",
        # but the literal visible in the source is a newline plus one space;
        # the literal is kept as-is.
        x = self.replacePara.sub("\n ", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()


class BDTB:
    """Scraper for a single Baidu Tieba thread."""

    # Thread title.  Whitespace before the closing quote of the class
    # attribute is matched loosely so that differences in spacing do not
    # prevent a match.
    _titlePattern = re.compile(
        r'<h3 class="core_title_txt pull-left text-overflow\s*".*?>(.*?)</h3>',
        re.S)
    # Total number of pages in the thread.
    _pageNumPattern = re.compile(
        r'<ul class="l_posts_num">.*?<span class="red">(.*?)</span>', re.S)
    # A post body up to the start of its "reply tail" block.
    _postPattern = re.compile(
        r'<div id="post_content_.*?>(.*?)</div>.*?'
        r'<div class="core_reply_tail clearfix">', re.S)
    # Every piece of tail information (device marker, floor, time, ...).
    _tailInfoPattern = re.compile(r'<span class="tail-info">(.*?)</span>', re.S)
    # Text that starts like a "YYYY-MM-DD HH:MM" timestamp.
    _timePattern = re.compile(r'\s*\d{4}-\d{1,2}-\d{1,2}\s+\d{1,2}:\d{2}')
    # Characters that cannot be used in a file name on common file systems.
    _unsafeFileChars = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, baseUrl, onlyLz, floorTag):
        """Store the crawl settings.

        baseUrl  -- thread URL without query string.
        onlyLz   -- value for the "see_lz" query parameter (1 = only posts by
                    the thread starter, 0 = all posts).
        floorTag -- the string '1' to write floor/time headers, anything else
                    to write post bodies only.
        """
        self.baseUrl = baseUrl
        self.onlyLz = '?see_lz=' + str(onlyLz)
        self.tool = Tool()
        # Output file object; opened by setFileTIltle().
        self.file = None
        # File name used when the thread title cannot be extracted.
        self.defaultTitle = u"百度貼吧帖子"
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the thread and return it as text.

        Returns None (after printing the cause) when the request fails with
        a URLError.
        """
        url = self.baseUrl + self.onlyLz + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except urllib2.URLError as e:
            # e.code is an int and e.reason may be an exception object, so
            # both are converted explicitly before concatenation.
            if hasattr(e, 'code'):
                print(u"鏈接百度貼吧失敗,錯誤編碼:" + str(e.code))
            elif hasattr(e, 'reason'):
                print(u"鏈接百度貼吧失敗,錯誤緣由:" + str(e.reason))
            return None

    def getTitle(self, page):
        """Return the stripped thread title found in *page*, or None."""
        if not page:
            return None
        found = self._titlePattern.search(page)
        return found.group(1).strip() if found else None

    def getPageNum(self, page):
        """Return the thread's page count (as text) found in *page*, or None."""
        if not page:
            return None
        found = self._pageNumPattern.search(page)
        return found.group(1).strip() if found else None

    def _pickFloorAndTime(self, spans):
        """Choose the floor label and the timestamp from the tail-info spans.

        Posts sent from a mobile client carry an extra leading span, so the
        timestamp is located by its format and the span right before it is
        taken as the floor.  When no span looks like a timestamp the first
        two spans are used; missing values become empty strings.
        """
        for index in range(1, len(spans)):
            if self._timePattern.match(spans[index]):
                return spans[index - 1], spans[index]
        padded = list(spans[:2]) + [u"", u""]
        return padded[0], padded[1]

    def getContent(self, page):
        """Extract every post on *page*.

        Returns a list with one [body, floor, time] entry per post; each
        element is a UTF-8 encoded byte string and the body is wrapped in a
        leading and a trailing newline.
        """
        if not page:
            return []
        posts = list(self._postPattern.finditer(page))
        contents = []
        for index, post in enumerate(posts):
            # Only look at tail-info spans that sit between this post and
            # the next one, so values never leak in from another post.
            if index + 1 < len(posts):
                regionEnd = posts[index + 1].start()
            else:
                regionEnd = len(page)
            spans = self._tailInfoPattern.findall(page, post.end(), regionEnd)
            floor, writeTime = self._pickFloorAndTime(spans)
            body = u"\n" + self.tool.replace(post.group(1)) + u"\n"
            contents.append([
                body.encode("utf-8"),
                floor.encode("utf-8"),
                writeTime.encode("utf-8"),
            ])
        return contents

    def setFileTIltle(self, title):
        """Open "<title>.txt" for writing and keep it in self.file.

        Falls back to self.defaultTitle when *title* is None or becomes empty
        after characters that are invalid in file names are replaced by "_".
        A previously opened file is closed first.
        """
        name = self.defaultTitle
        if title is not None:
            cleaned = self._unsafeFileChars.sub(u"_", _to_text(title)).strip()
            if cleaned:
                name = cleaned
        if self.file is not None:
            self.file.close()
        # Text mode with an explicit encoding behaves the same on Python 2
        # and Python 3.
        self.file = io.open(name + u".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write the posts returned by getContent() to self.file.

        When self.floorTag is '1' each body is preceded by a header made of
        the floor label, a dashed rule and the publication time.
        """
        for item in contents:
            if self.floorTag == '1':
                header = (u"\n" + _to_text(item[1])
                          + u"-----------------------------\n"
                          + _to_text(item[2]))
                self.file.write(header)
            self.file.write(_to_text(item[0]))

    def start(self):
        """Download the whole thread and write it to the output file."""
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        if pageNum is None:
            # Either the request failed or the page is not a thread; stop
            # before an output file is created.
            print("URL已經失效")
            return
        self.setFileTIltle(self.getTitle(indexPage))
        try:
            print("該帖子一共有" + str(pageNum) + "頁")
            for i in range(1, int(pageNum) + 1):
                print("正在寫入第" + str(i) + "頁數據")
                # The first page has already been downloaded above.
                page = indexPage if i == 1 else self.getPage(i)
                if page is None:
                    # getPage() has already reported the failure.
                    continue
                self.writeData(self.getContent(page))
        except IOError as e:
            print("寫入異常,緣由:" + str(e))
        else:
            print("寫入完成")
        finally:
            self.file.close()
            self.file = None


def main():
    """Prompt for the thread id and options, then run the scraper."""
    print(u"請輸入帖子代號")
    baseUrl = 'http://tieba.baidu.com/p/' + str(_read_line(u"http://tieba.baidu.com/p/")).strip()
    seeLZ = _read_line("是否只獲取樓主的帖子是輸入1,否輸入0\n").strip()
    floorTag = _read_line("是否寫入樓層與時間信息是輸入1,否輸入0\n").strip()
    bdtb = BDTB(baseUrl, seeLZ, floorTag)
    bdtb.start()


if __name__ == "__main__":
    main()