Test page data (index.html)
<html lang="en">
<head>
    <meta charset="UTF-8" />
    <title>測試bs4</title>
</head>
<body>
    <div>
        <p>百里守約</p>
    </div>
    <div class="song">
        <p>李清照</p>
        <p>王安石</p>
        <p>蘇軾</p>
        <p>柳宗元</p>
        <a href="http://www.song.com/" title="趙匡胤" target="_self">
            <span>this is span</span>
            宋朝是最強大的王朝,不是軍隊的強大,而是經濟很強大,國民都頗有錢</a>
        <a href="" class="du">總爲浮雲能蔽日,長安不見令人愁</a>
        <img src="http://www.baidu.com/meinv.jpg" alt="" />
    </div>
    <div class="tang">
        <ul>
            <li><a href="http://www.baidu.com" title="qing">清明時節雨紛紛,路上行人慾斷魂,借問酒家何處有,牧童遙指杏花村</a></li>
            <li><a href="http://www.163.com" title="qin">秦時明月漢時關,萬里長征人未還,但使龍城飛將在,不教胡馬度陰山</a></li>
            <li><a href="http://www.126.com" alt="qi">岐王宅裏尋常見,崔九堂前幾度聞,正是江南好風景,落花時節又逢君</a></li>
            <li><a href="http://www.sina.com" class="du">杜甫</a></li>
            <li><a href="http://www.dudu.com" class="du">杜牧</a></li>
            <li><b>杜小月</b></li>
            <li><i>度蜜月</i></li>
            <li><a href="http://www.haha.com" id="feng">鳳凰臺上鳳凰遊,鳳去臺空江自流,吳宮花草埋幽徑,晉代衣冠成古丘</a></li>
        </ul>
    </div>
</body>
</html>
Commonly used XPath expressions
# XPath functionality lives in the etree package, which ships as part of the lxml module
from lxml import etree
# Instantiate a local etree object and load the page source into it
tree = etree.parse('./index.html')  # the returned object is of type ElementTree
Parsing operations

Attribute-based targeting:
Find the div tag whose class attribute equals "song"
tree.xpath('//div[@class="song"]')
Hierarchy & index targeting:
Find the a tag that is a direct child of the second li under the ul that is a direct child of the div whose class attribute equals "tang"
tree.xpath('//div[@class="tang"]/ul/li[2]/a')
Logical operators:
Find the a tag whose href attribute is empty and whose class attribute equals "du"
tree.xpath('//a[@href="" and @class="du"]')
Getting text:
/ retrieves the text directly inside a tag
// retrieves the text inside a tag plus the text inside all of its descendant tags
tree.xpath('//div[@class="song"]/p[1]/text()')[0]
tree.xpath('//div[@class="tang"]//text()')
tree.xpath('//div[@class="tang"]/ul/li[2]//text()')
Note:
Never index //text() directly with [0], because the resulting list contains more than one element; join the fragments instead, as the sketch below shows.
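A minimal sketch of why, reusing the tree object parsed from index.html earlier:

# //text() yields one list element per text node in the subtree,
# so indexing with [0] would keep only the first fragment
texts = tree.xpath('//div[@class="tang"]//text()')
print(len(texts))              # well over one element
print(''.join(texts).strip())  # join the fragments instead of indexing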
Getting attributes:
tree.xpath('//div[@class="tang"]/ul/li[2]/a/@href')[0]
Using XPath expressions for data parsing
1. Install: pip install lxml
2. Import: from lxml import etree
3. Turn the HTML or XML document into an etree object, then call the object's methods to find the target nodes (a minimal sketch of both entry points follows this list):
   3.1 Local file: tree = etree.parse(file_name), then tree.xpath("xpath expression")
   3.2 Network data: tree = etree.HTML(page source string), then tree.xpath("xpath expression")
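A minimal sketch of the two entry points; www.example.com is a placeholder URL, and index.html is the test page at the top of this post:

import requests
from lxml import etree

# 3.1 local file: etree.parse() reads the document from disk
# (parse() uses an XML parser by default, so the file must be well-formed; the test page above is)
local_tree = etree.parse('./index.html')
print(local_tree.xpath('//title/text()'))

# 3.2 network data: etree.HTML() builds a tree from a page-source string
page_text = requests.get('http://www.example.com').text  # placeholder url
remote_tree = etree.HTML(page_text)
print(remote_tree.xpath('//title/text()'))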
Project requirement: scrape the address, price, and description of second-hand houses in Beijing's Changping district from 58.com
https://bj.58.com/changping/ershoufang/?PGTID=0d30000c-0047-e853-04ee-93a15ab7eede&ClickID=1
import requests
from lxml import etree

# Fetch the page source
url = 'https://bj.58.com/changping/ershoufang/?PGTID=0d30000c-0047-eddf-b503-3e2a5e562019&ClickID=1'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
# print(page_text)

all_dict_list = []

# Instantiate an etree object and load the page source into it
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
for li in li_list:  # li is one li tag object from the page
    title = li.xpath('.//div[@class="list-info"]/h2/a/text()')[0]
    detail_url = li.xpath('.//div[@class="list-info"]/h2/a/@href')[0]
    if "https:" not in detail_url:
        detail_url = "https:" + detail_url

    price = li.xpath('.//div[@class="price"]/p//text()')
    price = ''.join(price)  # join the price fragments into one string
    # print(title, detail_url, price)

    # Request the detail page and fetch its source
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    detail_tree = etree.HTML(detail_page_text)
    desc = detail_tree.xpath('//div[@class="general-item-wrap"]//text()')  # all text under this div
    desc = ''.join(desc).strip(" \n \b \t")
    # print(desc)

    dic = {
        "title": title,
        "price": price,
        "desc": desc
    }
    all_dict_list.append(dic)

print(all_dict_list)
Scraping encrypted data
Project requirement: scrape the image data from jandan.net http://jandan.net/ooxx
# Viewing the page source shows that every image's src value is identical.
# A quick look shows each image is loaded via the js function jandan_load_img(this).
# Right after that function sits a tag with class img-hash; it holds a hash string, which is the encrypted img address.
# The encryption is done by a js function, so analyze that function to learn the scheme, then decrypt.
# Capture the initial url's response with a packet-capture tool, search it globally for the function name (jandan_load_img), and study how it encrypts.
# Inside that js function there is a method call that does the actual encryption; search for that method.
# The method found mentions base64 and md5; md5 is irreversible, so try base64 decoding first.
import requests
from lxml import etree
import base64
import os
from urllib import request

# Fetch the page source
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = 'http://jandan.net/ooxx'
page_text = requests.get(url=url, headers=headers).text
# print(page_text)

# Create a folder for the images
if not os.path.exists("jiandan"):
    os.mkdir("jiandan")

# Instantiate an etree object and parse out the encrypted src values
tree = etree.HTML(page_text)
li_list = tree.xpath('//ol[@class="commentlist"]/li')
for li in li_list:
    src_code = li.xpath('.//span[@class="img-hash"]/text()')
    if len(src_code) != 0:
        src_code = src_code[0]
        src = base64.b64decode(src_code).decode()
        src = "https:" + src
        # print(src)
        # Save the downloaded image into the folder
        imgPath = "jiandan/" + src.split("/")[-1]
        request.urlretrieve(url=src, filename=imgPath)
        print(imgPath + " downloaded!")
# Simplified version: select all img-hash spans directly instead of iterating li tags
import requests
from lxml import etree
import base64
import os
from urllib import request

# Fetch the page source
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = 'http://jandan.net/ooxx'
page_text = requests.get(url=url, headers=headers).text
# print(page_text)

# Create a folder for the images
if not os.path.exists("jiandan1"):
    os.mkdir("jiandan1")

# Instantiate an etree object and parse out the encrypted src values
tree = etree.HTML(page_text)
src_code_list = tree.xpath('//span[@class="img-hash"]/text()')
for src_code in src_code_list:  # src_code is one encrypted src address
    # print(src_code)
    src = base64.b64decode(src_code).decode()  # src is the decrypted address
    src = "https:" + src
    imgPath = "jiandan1/" + src.split("/")[-1]
    request.urlretrieve(url=src, filename=imgPath)
    print(imgPath + " downloaded")
print("All downloads complete!!!")
File scraping
Project requirement: scrape the free resume templates from chinaz.com http://sc.chinaz.com/jianli/free.html
import requests
from lxml import etree
import random
import os

# Create a folder for the resumes
if not os.path.exists("jianli"):
    os.mkdir("jianli")

# Fetch the page source
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = 'http://sc.chinaz.com/jianli/free.html'
# response = requests.get(url=url, headers=headers)
# response.encoding = "utf-8"
# page_text = response.text
page_text = requests.get(url=url, headers=headers).content.decode()
# print(page_text)

# Instantiate an etree object
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@id="container"]/div')
for div in div_list:
    detail_url = div.xpath('./a/@href')[0]
    name = div.xpath('./a/img/@alt')[0]
    # print(detail_url, name)
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    detail_tree = etree.HTML(detail_page_text)
    download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
    # print(download_url_list)  # all download urls for this template
    download_url = random.choice(download_url_list)  # pick a random mirror

    jianli_data = requests.get(url=download_url, headers=headers).content  # the resume's binary data
    file_path = 'jianli/' + name + '.rar'
    with open(file_path, 'wb') as fb:
        fb.write(jianli_data)
    print(file_path + " downloaded!!")
# Handle multiple pages
import requests
from lxml import etree
import random
import os

# Create a folder for the resumes
if not os.path.exists("jianli"):
    os.mkdir("jianli")

# Fetch the page source
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

url = 'http://sc.chinaz.com/jianli/free_%d.html'
start_page = input("start_page:")
end_page = input("end_page:")
for page in range(int(start_page), int(end_page) + 1):
    if page == 1:  # the first page has no _1 suffix
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % page

    page_text = requests.get(url=new_url, headers=headers).content.decode()
    # print(page_text)

    # Instantiate an etree object
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        detail_url = div.xpath('./a/@href')[0]
        name = div.xpath('./a/img/@alt')[0]
        # print(detail_url, name)
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_page_text)
        download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        download_url = random.choice(download_url_list)  # pick a random mirror

        jianli_data = requests.get(url=download_url, headers=headers).content  # the resume's binary data
        file_path = 'jianli/' + name + '.rar'
        with open(file_path, 'wb') as fb:
            fb.write(jianli_data)
        print(file_path + " downloaded!!")
Video scraping
Project requirement: scrape the videos from pearvideo's sports page https://www.pearvideo.com/category_9
import requests
from lxml import etree
import re
import os

if not os.path.exists("video"):
    os.mkdir("video")

# Fetch the page source
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_9'
page_text = requests.get(url=url, headers=headers).text
# print(page_text)

# Instantiate an etree object
tree = etree.HTML(page_text)
a_href_list = tree.xpath('//div[@class="vervideo-bd"]/a/@href')
for a_href in a_href_list:
    # Build the url of the video's detail page
    new_url = url.split("/")[0] + "//" + url.split("/")[2] + "/" + a_href
    # Fetch the detail page
    detail_page_text = requests.get(url=new_url, headers=headers).text

    # Get the js snippet that carries the video link
    detail_tree = etree.HTML(detail_page_text)
    video_script = detail_tree.xpath('//*[@id="detailsbd"]/div[1]/script[1]/text()')[0]
    # Extract the real video url with a regex
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_true_path = re.findall(ex, video_script, re.S)[0]
    # Get the current video's name
    video_name = detail_tree.xpath('//div[@id="poster"]/img/@alt')[0]

    # Build the file path for this video
    file_path = "video/" + video_name + ".mp4"
    # Fetch the video's binary content and save it
    video_data = requests.get(url=video_true_path, headers=headers).content
    with open(file_path, "wb") as fp:
        fp.write(video_data)
    print(file_path + " downloaded")
print("All downloads complete")
How to handle a common error [important]
Symptom (the problem):
When sending large numbers of requests, the following error is often raised:
HTTPConnectionPool(host:XX): Max retries exceeded with url.
Causes:
1. Before each data transfer the client has to establish a TCP connection with the server. To save transfer overhead, the connection defaults to keep-alive, i.e. connect once and transfer many times. But if connections are never released, the connection pool eventually fills up, no new connection object can be produced, and requests can no longer be sent.
2. The IP has been banned.
3. Requests are sent too frequently.
Solutions (see the sketch below; if none of these takes effect, try simply re-running the program):
1. Set the Connection header to close, so the connection is dropped after each successful request.
2. Switch to a different request IP (e.g. a proxy).
3. sleep before each request to space them out.
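A minimal sketch combining the three mitigations with requests; the proxy address and urls below are placeholders, not working servers:

import time
import random
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Connection': 'close'  # 1. drop the connection after every request
}
# 2. hypothetical proxy server; swap in a real address before use
proxies = {'https': 'https://127.0.0.1:8888'}

urls = ['https://www.example.com/page_%d' % i for i in range(1, 4)]  # placeholder urls
for url in urls:
    time.sleep(random.uniform(1, 3))  # 3. wait between requests
    response = requests.get(url=url, headers=headers, proxies=proxies)
    print(url, response.status_code)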
Another form of xpath expression
| means "or" inside an xpath expression
Example: parse out the names of all cities https://www.aqistudy.cn/historydata/
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = 'https://www.aqistudy.cn/historydata/'
page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/div/li')
print(li_list)