Python 正則與網頁操作練習三:

下載某個 51cto blog 的全部文章的功能已實現。
不足:沒有任何異常檢測,想到哪就寫到哪,第一次用類,用得也是亂七八糟的。
import re
import urllib.request
import os


# Downloader for a 51cto blog (the site serves pages encoded as gb18030).
class down51web:
    """Scrape a 51cto blog: fetch the index, enumerate listing pages,
    then save each article's body and its embedded images locally."""

    # Class-level defaults; instances overwrite these with their own state.
    s_url = ''        # URL to fetch next
    s_blogid = ''     # blog id extracted from the index page
    s_blogpages = ''  # total listing-page count (kept as a string)
    s_html = ''       # last fetched & decoded HTML
    s_code = ''       # character encoding used to decode responses

    def __init__(self, url, code):
        """Remember the blog root *url* and the page encoding *code*."""
        self.s_url = url
        self.s_code = code

    def get_html(self):
        """Fetch self.s_url, decode with self.s_code, cache and return it."""
        # `with` closes the HTTP response; the original leaked every socket.
        with urllib.request.urlopen(self.s_url) as resp:
            self.s_html = resp.read().decode(self.s_code)
        return self.s_html

    def get_page(self, r_page):
        """Extract the total page count from cached HTML using *r_page*."""
        if self.s_html:
            m_pages = r_page.search(self.s_html)
            if m_pages:
                self.s_blogpages = m_pages.group(1)
        return self.s_blogpages

    def get_blogid(self, r_blogid):
        """Extract the blog id from cached HTML using *r_blogid*."""
        if self.s_html:
            m_blogid = r_blogid.search(self.s_html)
            if m_blogid:
                # group(1) looks like '/<blogid>/p-1'; keep the id segment.
                self.s_blogid = m_blogid.group(1).split('/')[1]
        return self.s_blogid

    def get_blogpagelist(self):
        """Build the listing-page URLs from the blog id and page count."""
        if self.s_blogid and self.s_blogpages:
            return [self.s_url + '/' + self.s_blogid + '/p-' + str(i)
                    for i in range(1, int(self.s_blogpages) + 1)]
        return []

    def get_pagelist(self, r_list, url):
        """Fetch listing page *url*; return all (href, title) tuples found."""
        self.s_url = url
        s_tmphtml = self.get_html()
        if s_tmphtml:
            return r_list.findall(s_tmphtml)
        return []

    def get_htmltofile(self, r_getcont, r_getimg, url, path, imgpath, filename):
        """Fetch the article at *url*, write its body to path+filename, and
        download every embedded image into *imgpath*."""
        self.s_url = url
        s_tmp = self.get_html()
        m = r_getcont.search(s_tmp)
        if not m:
            return
        s_tmphtml = m.group(1)
        l_img = r_getimg.findall(s_tmphtml)  # absolute image URLs in the body
        # Point the src attributes at the local img// directory instead.
        s_tmphtml = re.sub('(?<=src=")http://([^/]+/){1,5}', 'img//', s_tmphtml)
        with open(path + filename, 'w') as fhtml:
            fhtml.write(s_tmphtml)
        # De-duplicate by URL, then download each image exactly once.
        dic = {link: link.split('/')[-1] for link in l_img}
        for img_url, img_name in dic.items():
            with urllib.request.urlopen(img_url) as resp:
                img_data = resp.read()
            with open(imgpath + img_name, 'wb') as jpg:
                jpg.write(img_data)
# --- Patterns (raw strings; the compiled bytes are unchanged) ---------------
r_page = re.compile(r'頁數 \( [0-9]+/([0-9]+) \)(?=</div>)')      # total page count
r_blogid = re.compile(r'(?<=<div class="pages">)<a href=([^>]*)>')  # blog id href
r_list = re.compile(r'<h3 class="artTitle"><a href="([^"]*)">([^<]*)</a>')  # (href, title)
r_getcont = re.compile(r'<!--正文 begin-->(.*)<!--正文 end-->', re.S)  # article body
r_getimg = re.compile(r'<img onload="if\(this.width>650\) this.width=650;" src="([^"]+)"')  # image urls

# --- Main flow --------------------------------------------------------------
blogurl = 'http://hxw168.blog.51cto.com'
hxw = down51web(blogurl, 'gb18030')
hxw.get_html()
pages = hxw.get_page(r_page)          # total number of listing pages
blogid = hxw.get_blogid(r_blogid)     # blog id, also used as the output dir
page_urls = hxw.get_blogpagelist()    # renamed: the original shadowed builtin `list`

# Windows-style path separators, as in the original script.
path = os.getcwd() + '\\' + blogid
if not os.path.exists(path):
    os.mkdir(path)
path = path + '\\'
imgpath = path + 'img'
if not os.path.exists(imgpath):
    os.mkdir(imgpath)
imgpath = imgpath + '\\'

# Save every article; chk prefixes filenames to keep them unique and ordered.
chk = 1
for page_url in page_urls:
    articles = hxw.get_pagelist(r_list, page_url)
    for art in articles:
        # art is (relative href, title) from r_list.
        hxw.get_htmltofile(r_getcont, r_getimg, blogurl + art[0],
                           path, imgpath, str(chk) + art[1] + '.html')
        chk += 1
效果如下: