BeautifulSoup模塊能夠代替re模塊(正則表達式)來解析和匹配HTML內容。
小例子1:用BeautifulSoup(配合html5lib解析器)爬取淘寶首頁中全部連接(a標籤)的文字
import urllib.request

from bs4 import BeautifulSoup


def tecent(url):
    """Print the text of every ``<a>`` tag found on the page at *url*.

    The page is fetched, decoded as UTF-8 and parsed with the ``html5lib``
    parser. For each anchor whose ``.string`` is not ``None``, the type of
    that string and the string itself are printed to stdout.

    Args:
        url: Address of the page to fetch.

    Returns:
        None. Output goes to the console only.
    """
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being leaked.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8")

    soup = BeautifulSoup(data, "html5lib")
    for anchor in soup.find_all("a"):
        text = anchor.string
        # Skip anchors for which bs4 reports no string.
        if text is None:
            continue
        print(type(text))
        print(text)
        # NOTE: the earlier attempt to save this text failed because the file
        # was opened in binary mode ("ab"), which only accepts bytes. To
        # persist the text, open the file in text mode with an explicit
        # encoding and write str(text).


if __name__ == "__main__":
    url = "https://www.taobao.com/"
    tecent(url)
小例子2:用BeautifulSoup編寫一個抓取妹子圖頁面圖片的代碼(無需使用正則表達式)
import os
import urllib.request
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup


def taonvlang(url, path="G:/taonvlang/"):
    """Download every ``<img>`` referenced by the page at *url* into *path*.

    The page is fetched, decoded with the default codec and parsed with the
    ``html5lib`` parser. Each ``<img>`` tag and its attribute dict are
    printed. Every tag with a non-empty ``src`` is downloaded and saved as a
    zero-padded sequence number (``00001``, ``00002``, ...) followed by the
    file extension taken from the image URL.

    Args:
        url: Address of the page to scan for images.
        path: Directory the images are written to; created if missing.
            Defaults to the directory the script originally hard-coded.

    Returns:
        None. Images are written to disk and progress is printed to stdout.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode()

    soup = BeautifulSoup(data, "html5lib")  # parse the HTML with bs4

    # Create the output directory if it does not exist yet.
    os.makedirs(path, exist_ok=True)

    count = 1  # sequence number used to name the saved images
    for img_tag in soup.find_all("img"):
        print(img_tag)  # the whole <img> tag
        attrs = img_tag.attrs  # the tag's attributes as a dict
        print(attrs)

        src = attrs.get("src")
        if not src:
            # No usable image address on this tag.
            continue

        # Resolve relative and protocol-relative addresses against the page
        # URL; urlopen() rejects anything that is not an absolute URL.
        image_url = urljoin(url, src)

        # Take the extension from the URL *path* only, so query strings,
        # dots in the host name and extension-less URLs cannot leak into
        # the file name.
        extension = os.path.splitext(urlparse(image_url).path)[1]
        filepath = os.path.join(path, str(count).zfill(5) + extension)

        # Download first, then open the file, so a failed request does not
        # leave an empty file behind.
        with urllib.request.urlopen(image_url) as image_response:
            image_data = image_response.read()
        with open(filepath, "wb") as f:
            f.write(image_data)
        count += 1


if __name__ == "__main__":
    url = "http://www.mzitu.com/all"
    taonvlang(url)