今天說一下爬取數據的解析方式——XPath。XPath 是各種解析方式中最重要的一種。
1. 安裝:pip install lxml
2. 原理
1. 獲取頁面源碼數據
2. 實例化一個 etree 對象,並且將頁面源碼數據加載到該對象中
3. 調用該對象的 xpath 方法進行指定標籤的定位
4. 注意:xpath 函數必須結合 XPath 表達式進行標籤定位和內容捕獲
光說也不容易明白,直接上例子!
1. 解析 58 二手房的相關數據
# Example 1: print the title of every listing on a 58.com second-hand
# housing page.
import requests
from lxml import etree

# Listing page to scrape.
url = 'https://bj.58.com/ershoufang/sub/l16/s2242/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.bdpcpz_bt&PGTID=0d30000c-0000-1139-b00c-643d0d315a04&ClickID=1'

# Browser-like User-Agent so the request looks like it comes from a browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

# Fetch the whole page. The timeout stops the script from hanging forever on
# a stalled connection, and raise_for_status() fails loudly instead of
# silently parsing an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
page_text = response.text

# Parse the HTML into an element tree that can be queried with XPath.
tree = etree.HTML(page_text)

# "//" matches at any depth; this selects each <li> directly under the
# listing <ul>. Every item in the result is an lxml Element.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

for li in li_list:
    # "./" makes the expression relative to the current <li>.
    titles = li.xpath('./div[2]/h2[1]/a/text()')
    if not titles:
        # Some <li> entries may not have the expected title link; skip them
        # rather than crash with IndexError.
        continue
    title = titles[0]
    print(title)
2. 福利福利!下載彼岸圖網中的圖片數據
# Example 2: download the thumbnail images from a pic.netbian.com gallery page
# into ./imgs, naming each file after the caption shown under the image.
import os
import urllib.request  # Python 3 location of urlretrieve

import requests
from lxml import etree

url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

# Send the request. The timeout avoids hanging on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# Decode the page as GBK up front. Without this the captions come back as
# mojibake and have to be repaired per item with
# encode('ISO-8859-1').decode('gbk'), which raises as soon as requests picks
# any fallback encoding other than ISO-8859-1.
response.encoding = 'gbk'

# Create the output folder if it does not exist yet (no error if it does).
os.makedirs('./imgs', exist_ok=True)

page_text = response.text
tree = etree.HTML(page_text)

# One <li> per image in the gallery list.
li_list = tree.xpath('//div[@class="slist"]/ul/li')

for li in li_list:
    names = li.xpath('./a/b/text()')
    srcs = li.xpath('./a/img/@src')
    if not names or not srcs:
        # Skip entries that lack a caption or an image instead of raising
        # IndexError.
        continue

    img_name = names[0]
    # The src attribute is site-relative; prepend the host to get a full URL.
    img_url = 'http://pic.netbian.com' + srcs[0]
    img_path = './imgs/' + img_name + '.jpg'

    # urlretrieve downloads straight to disk, so no explicit open()/write().
    urllib.request.urlretrieve(url=img_url, filename=img_path)
3.解析全部城市名稱(https://www.aqistudy.cn/historydata/)
# Example 3: collect every city name listed on the aqistudy.cn history page.
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

page = requests.get(url=url, headers=headers)
res = page.text
tree = etree.HTML(res)

# The city links sit in two places: <li> elements directly under the <ul>,
# and <li> elements inside the second <div> under that <ul>.
direct_items = '//div[@class="bottom"]/ul/li/a/text()'
nested_items = '//div[@class="bottom"]/ul/div[2]/li/a/text()'

# XPath "|" is the union operator: the result holds the nodes matched by
# either expression.
city_list = tree.xpath(direct_items + ' | ' + nested_items)

# Concatenate all names into one string (no separator).
city = ''.join(city_list)
4. 爬取煎蛋網的圖片
# Example 4: download the images from jandan.net/ooxx into ./jiandan.
# The page carries each image URL base64-encoded inside
# <span class="img-hash">.
import base64  # decodes the encoded image URLs
import os

import requests
from lxml import etree

# Create the output folder if it does not exist yet (no error if it does).
os.makedirs('./jiandan', exist_ok=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
url = 'http://jandan.net/ooxx'

# The timeout avoids hanging on a stalled connection; raise_for_status()
# fails loudly instead of parsing an error page.
page = requests.get(url, headers=headers, timeout=10)
page.raise_for_status()
res1 = page.text

tree = etree.HTML(res1)
span_list = tree.xpath('//span[@class="img-hash"]/text()')

for span_hash in span_list:
    # Decode the base64 text as UTF-8 and prepend the scheme to get the
    # complete image URL.
    img_url = 'http:' + base64.b64decode(span_hash).decode('utf8')

    # Fetch the image once, with the same headers as the page request, and
    # write those bytes to disk. Previously the bytes were fetched here,
    # discarded, and the image was downloaded a second time by urlretrieve.
    img_response = requests.get(url=img_url, headers=headers, timeout=10)
    img_response.raise_for_status()
    img_data = img_response.content

    # Name the file after the last path segment of the URL.
    filepath = './jiandan/' + img_url.split('/')[-1]
    with open(filepath, 'wb') as f:
        f.write(img_data)
    print(filepath, '下載完成!')

print('over')
5. 爬取簡歷模板
# Example 5: download the free resume templates from sc.chinaz.com into
# ./jianli, one .rar archive per template.
import os
import random

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

# Create the output folder if it does not exist yet (no error if it does).
os.makedirs('./jianli', exist_ok=True)

# Crawl the first 4 listing pages. range() excludes its upper bound, so the
# bound must be 5; the earlier range(1, 4) stopped after page 3.
for i in range(1, 5):
    if i == 1:
        # The first page has no page number in its URL.
        url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        # Later pages embed the page number.
        url = 'http://sc.chinaz.com/jianli/free_%s.html' % (i)

    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    # Force UTF-8; otherwise the template names come out as mojibake.
    response.encoding = 'utf8'
    res = response.text

    tree = etree.HTML(res)
    a_list = tree.xpath('//a[@class="title_wl"]')

    for a in a_list:
        names = a.xpath('./text()')
        hrefs = a.xpath('./@href')
        if not names or not hrefs:
            # Link without text or target: nothing to download.
            continue
        name = names[0]
        jl_url = hrefs[0]

        # Fetch the template's detail page. Separate variable names keep the
        # listing page's response/tree from being overwritten mid-loop.
        detail_response = requests.get(url=jl_url, headers=headers, timeout=10)
        detail_response.raise_for_status()
        detail_response.encoding = 'utf8'
        res1 = detail_response.text
        detail_tree = etree.HTML(res1)

        download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        if not download_url_list:
            # random.choice raises IndexError on an empty list; skip instead.
            continue
        # Pick one of the listed download links at random.
        download_url = random.choice(download_url_list)

        download_response = requests.get(url=download_url, headers=headers, timeout=10)
        # Do not save an HTTP error page as if it were the archive.
        download_response.raise_for_status()
        res3 = download_response.content

        filepath = './jianli/' + name + '.rar'
        # .content is bytes, so the file must be opened in binary mode.
        with open(filepath, 'wb') as f:
            f.write(res3)
        print(name, '下載完成!')

print('over')
6. 站長素材圖片下載(圖片懶加載)
# Example 6: download the lazily-loaded images from sc.chinaz.com/tupian
# into ./tupian.
import os
import urllib
import urllib.request

import requests
from lxml import etree

# Create the output folder if it does not exist yet (no error if it does).
os.makedirs('./tupian', exist_ok=True)

url = 'http://sc.chinaz.com/tupian/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

# The timeout avoids hanging on a stalled connection; raise_for_status()
# fails loudly instead of parsing an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
response.encoding = 'utf8'
res = response.text

tree = etree.HTML(res)

# Lazy loading: in the raw HTML the image address is stored in the
# placeholder attribute "src2"; it only becomes "src" once the image scrolls
# into view in a browser. The downloaded source is never scrolled, so read
# "src2".
url_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')

# A dedicated loop variable keeps the page URL in `url` from being shadowed.
for img_url in url_list:
    # Name the file after the last path segment of the image URL.
    filepath = './tupian/' + img_url.rsplit('/', 1)[-1]
    urllib.request.urlretrieve(img_url, filepath)
    print(filepath, '下載完成!')

print('over')