知乎日報爬蟲:去掉了圖片,處理了 HTML 轉義符,並過濾了大多數有用無用的超連接。另外,代碼有問題請提出,玩 Python 沒多久,但願獲得更多的建議。
# -*- coding:utf-8 -*- import urllib2 import re import HTMLParser import sys reload(sys) sys.setdefaultencoding('utf8') #經過請求獲取HTML def getHtml(url): header={'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1','Referer' : '******'} request=urllib2.Request(url,None,header) response=urllib2.urlopen(request) text=response.read() return text #經過HTML解析出每條日報的連接 def getUrls(html): pattern = re.compile('http://daily.zhihu.com/story/(.*?)" >',re.S) items = re.findall(pattern,html) urls = [] for item in items: urls.append('http://daily.zhihu.com/story/' + item) return urls #解析日報內容 def getContent(url): html = getHtml(url) #先取出標題打印出來 pattern = re.compile('<h1 class="headline-title">(.*?)</h1>') items = re.findall(pattern,html) print '********************************************************************************************************************************************' print '****************************************************'+items[0]+'****************************************************' print '********************************************************************************************************************************************' #開始取文章內容 pattern = re.compile('<div.*?content">\n(.*?)</div>',re.S) items_withtag = re.findall(pattern,html) # print items_withtag[0] for item in items_withtag: for content in characterProcessing(item): print content #去掉文章內容中的標籤 def characterProcessing(html): htmlParser = HTMLParser.HTMLParser() #先去掉<p>和<li> pattern = re.compile('<p>(.*?)</p>|<li>(.*?)</li>.*?',re.S) items = re.findall(pattern,html) result = [] for index in items: if index != '': for content in index: tag = re.search('<.*?>',content) http = re.search('<.*?http.*?',content) html_tag = re.search('&',content) #處理html轉義符 if html_tag: content = htmlParser.unescape(content) #有連接直接跳過不作收集 if http: continue elif tag: #去掉<p>或<li>包裹的其餘的標籤,好比常見的<strong> pattern = re.compile('(.*?)<.*?>(.*?)</.*?>(.*)') items = re.findall(pattern,content) 
content_tags = '' if len(items)>0: for item in items: if len(item)>0: for item_s in item: content_tags = content_tags + item_s else: content_tags = content_tags + item_s content_tags = re.sub('<.*?>','',content_tags) result.append(content_tags) else: continue else: result.append(content) return result def main(): url = "http://zhihudaily.ahorn.me" html = getHtml(url) urls = getUrls(html) for url in urls: getContent(url) if __name__ == "__main__": main()