Python 3.5 爬蟲基礎:urllib 結合 BeautifulSoup 實例

BeautifulSoup 模塊能夠替代 re 模塊:不必編寫正則表達式,就能解析並匹配 HTML 內容。

小例子1:用 BeautifulSoup(html5lib 解析器)爬取淘寶首頁所有連結(<a> 標籤)中的文字(漢字)

 1 from bs4 import BeautifulSoup
 2 def tecent(url):
 3     response=urllib.request.urlopen(url)
 4     html=response.read()
 5     data=html.decode("utf-8")  #轉換編碼,默認轉換爲utf-8
 6     soup=BeautifulSoup(data,"html5lib")
 7     for list in soup.find_all("a"):
 8         if list.string==None:
 9             continue
10         else:
11             print(type(list.string))
12             print(list.string)   #暫時沒法將NavigableString類型進行轉換,此例子暫時在控制檯輸出
13             # with open("taobao1.txt","ab") as f:
14             #     f.write(list.string)
15 
if __name__ == "__main__":
    # Script entry point: crawl the Taobao home page.
    tecent("https://www.taobao.com/")

小例子2:用 BeautifulSoup 編寫一個抓取妹子圖頁面圖片的代碼(不使用正則表達式)

 1 from bs4 import BeautifulSoup
 2 def taonvlang(url):
 3     res=urllib.request.urlopen(url).read()
 4     data=res.decode()
 5     soup=BeautifulSoup(data,"html5lib")   #將html代碼用Bs進行處理
 6     path="G:/taonvlang/"
 7     if not os.path.isdir(path):    #若是不存在該路徑,則建立路徑
 8         os.makedirs(path)
 9     count=1   #用於給圖片編號
10     for list in soup.find_all("img"):      #獲取img的全部內容
11         print(list)   #img標籤的全部內容
12         dict=list.attrs    #將該字段轉換爲字典
13         print(dict)
14         if "src" in dict:
15             image=dict["src"]    #取圖片地址
16             # print(image)
17             img=image[image.rfind(".")::]    #取出文件擴展名
18             # print(img)
19             image_path=str(count).zfill(5)+img
20             filepath=os.path.join(path,image_path)
21             with open(filepath,"wb") as f:
22                 image_data=urllib.request.urlopen(dict["src"]).read()
23                 f.write(image_data)
24             count+=1
25 
if __name__ == "__main__":
    # Script entry point: download all images listed on the index page.
    taonvlang("http://www.mzitu.com/all")
相關文章
相關標籤/搜索