爬蟲 xpath 獲取方式

回顧 bs4

  • 實例化bs對象,將頁面源碼數據加載到該對象中
  • 定位標籤:find('name',class_='xxx') find_all() select()
  • 將標籤中的文本內容獲取 string text get_text() a['href']

 

xpath

 

環境安裝: pip install lxml

原理解析:

獲取頁面的源碼數據

實例化etree對象,並將頁面源碼數據加載到該對象中

調用該對象的xpath方法進行指定標籤的定位

注意:xpath方法必須結合xpath表達式進行標籤定位和內容捕獲

/html/head/title
//head/title
//title

 

通過xpath獲取數據

#項目需求:解析58二手房的相關數據
import requests
from lxml import etree

# Fetch one 58.com second-hand-housing list page and save each listing as a
# "title:price" line in 58.csv.
url = 'https://bj.58.com/shahe/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d30000c-0047-e4e6-f587-683307ca570e&ClickID=1'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
page_text = requests.get(url=url,headers=headers).text

tree = etree.HTML(page_text)
# xpath() always returns a list; here, one element per <li> of the listing <ul>.
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
# The with-statement closes the file even if parsing one of the items raises.
with open('58.csv','w',encoding='utf-8') as fp:
    for li in li_list:
        titles = li.xpath('./div[2]/h2/a/text()')
        if not titles:
            # This <li> has no title text (presumably an ad or layout item);
            # skip it instead of failing with IndexError on [0].
            continue
        # The price is spread over several text nodes, so join them all.
        price = ''.join(li.xpath('./div[3]//text()'))
        fp.write(titles[0]+":"+price+'\n')
print('over')

#調用xpath 返回的是一個列表結構,使用索引

 

 

利用xpath處理中文亂碼

# ctrl+shift+x
# - 解析圖片數據:http://pic.netbian.com/4kmeinv/
import requests
from lxml import etree
import os
import urllib

# Download every image on the pic.netbian.com 4K listing page into ./imgs,
# naming each file after the (Chinese) caption shown under the thumbnail.

# A bare "import urllib" is not guaranteed to load the "request" submodule;
# import it explicitly so urlretrieve below does not depend on another
# package having imported it first.
import urllib.request

url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
response = requests.get(url=url,headers=headers)
# Create the output directory; exist_ok avoids the check-then-create race.
os.makedirs('./imgs', exist_ok=True)
page_text = response.text

tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    img_name = li.xpath('./a/b/text()')[0]
    # Fix mojibake: this assumes response.text was decoded as ISO-8859-1
    # while the page bytes are really GBK, so undo that decode and redo it.
    img_name = img_name.encode('iso-8859-1').decode('gbk')
    # The src attribute is site-relative, so prefix the host.
    img_url = 'http://pic.netbian.com'+li.xpath('./a/img/@src')[0]
    img_path = './imgs/'+img_name+'.jpg'
    urllib.request.urlretrieve(url=img_url,filename=img_path)
    print(img_path,'下載成功!')
print('over!!!')



#通過encode('iso-8859-1').decode('gbk')對亂碼的字符串重新轉碼
#或在讀取response.text以前,將response.encoding設置爲頁面的實際編碼(此頁面爲gbk)

 

 

xpath在遇到base64加密數據時的解決方式

#【重點】下載煎蛋網中的圖片數據:http://jandan.net/ooxx
#數據加密  (反爬機制)
import requests
from lxml import etree
import base64
import urllib

# Download the images on jandan.net/ooxx. The page does not expose image URLs
# directly: each one is stored base64-encoded in a <span class="img-hash">.

# A bare "import urllib" is not guaranteed to load the "request" submodule;
# import it explicitly so urlretrieve below always resolves.
import urllib.request

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
url = 'http://jandan.net/ooxx'
page_text = requests.get(url=url,headers=headers).text

tree = etree.HTML(page_text)
img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')
for img_hash in img_hash_list:
    # Decode the hash back to the address; the decoded value presumably lacks
    # the scheme (starts with "//"), which is why "http:" is prepended.
    img_url = 'http:'+base64.b64decode(img_hash).decode()
    # Use the last path segment of the URL as the local file name.
    img_name = img_url.split('/')[-1]
    urllib.request.urlretrieve(url=img_url,filename=img_name)

 

xpath獲取兩次a標籤進行獲取及分頁判斷

#爬取站長素材中的簡歷模板
import requests
import random
from lxml import etree
# Crawl the first three list pages of free resume templates on sc.chinaz.com.
# For each template, open its detail page, pick one of the download links at
# random and save the archive as "<template name>.rar".
headers = {
    'Connection':'close', # Close the connection as soon as the request completes, releasing pooled resources promptly.
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
url = 'http://sc.chinaz.com/jianli/free_%d.html'
for page in range(1,4):
    # The first page has no numeric suffix; later pages use free_<n>.html.
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % page

    response = requests.get(url=new_url,headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text

    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # First <a>: link to the template's detail page, plus its display name.
        detail_url = div.xpath('./a/@href')[0]
        name = div.xpath('./a/img/@alt')[0]

        detail_page = requests.get(url=detail_url,headers=headers).text
        # Separate name so the list-page tree is not shadowed inside the loop.
        detail_tree = etree.HTML(detail_page)
        # Second <a>: the download links on the detail page.
        download_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        if not download_list:
            # random.choice raises IndexError on an empty list; skip this item.
            continue
        download_url = random.choice(download_list)
        data = requests.get(url=download_url,headers=headers).content
        fileName = name+'.rar'
        with open(fileName,'wb') as fp:
            fp.write(data)
        print(fileName,'下載成功')

# Reference XPath for a single download link (presumably copied from the
# browser developer tools): //*[@id="down"]/div[2]/ul/li[6]/a

 

xpath 利用 |  實現並集獲取數據

#解析全部的城市名稱
import requests
from lxml import etree
# Print the text of the first <a> in every <li> matched on the aqistudy.cn
# history-data page.
url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'Connection':'close', # Close the connection as soon as the request completes, releasing pooled resources promptly.
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
response = requests.get(url, headers=headers)
page_text = response.text

# "|" is the XPath union operator: a single query returns the <li> nodes
# matched by either of the two location paths.
city_li_xpath = '//div[@class="bottom"]/ul/li |  //div[@class="bottom"]/ul/div[2]/li'
tree = etree.HTML(page_text)
li_list = tree.xpath(city_li_xpath)
# The generator is lazy, so each name is extracted right before it is printed.
for city_name in (li.xpath('./a/text()')[0] for li in li_list):
    print(city_name)

 

proxies 代理設置

# Send the request through a proxy IP (proxy sources: www.goubanjia.com,
# Kuaidaili, Xici proxy).
# The type of the proxy must match the scheme of the requested URL.
url = 'https://www.baidu.com/s?wd=ip'
proxies = {'https':'61.7.170.240:8080'}

response = requests.get(url=url,headers=headers,proxies=proxies)
page_text = response.text

# Save the returned page as ./ip.html so it can be inspected afterwards.
with open('./ip.html','w',encoding='utf-8') as fp:
    fp.write(page_text)

 

防衛機制:
robots

UA

數據加密

懶加載

代理ip

相關文章
相關標籤/搜索