BeautifulSoup 是 Python 的 HTML 解析庫,處理 HTML 非常方便。
pip install beautifulsoup4
# Illustrative constructor calls, one per parser backend. `html` stands for
# the markup string to be parsed; the second argument selects the parser.

# Python standard-library parser
BeautifulSoup(html, 'html.parser')
# lxml HTML parser
BeautifulSoup(html, 'lxml')
# html5lib parser
BeautifulSoup(html, 'html5lib')
Python 標準庫解析器(html.parser)不需要安裝第三方庫,處理效率一般;lxml 比較快,但需要 C 語言庫支持;html5lib 是純 Python 實現,不依賴 C 語言庫(但需要另外用 pip 安裝),效率比較低,不過容錯性最好。
from bs4 import BeautifulSoup

# Sample markup: a deliberately incomplete fragment (the <ul> is never closed)
# so that prettify() can show how the parser repairs it.
# NOTE(review): the literal starts with `div id=` without a leading `<`; this
# looks like a truncated paste, but it is left untouched because changing it
# would change the printed output.
html = '''div id="sslct_menu" class="cl p_pop" style="display: none;"> <span class="sslct_btn" onClick="extstyle('')" title="默認"><i></i></span></div> <ul id="myitem_menu" class="p_pop" style="display: none;"> <li><a href="https://www.aisinei.org/forum.php?mod=guide&view=my">帖子</a></li> <li><a href="https://www.aisinei.org/home.php?mod=space&do=favorite&view=me">收藏</a></li>'''

# Name the parser explicitly. Without it bs4 picks whichever parser happens to
# be installed and emits a warning; 'lxml' is the parser used by the rest of
# these examples.
bs = BeautifulSoup(html, 'lxml')

# prettify() returns the parsed tree as an indented string.
print(bs.prettify())
bs.prettify() 會將解析後的文檔格式化(縮進)輸出,效果如下:
同樣可以用本地的 HTML 文件來創建,也可以指定解析器 lxml:
# Parse a local HTML file. BeautifulSoup treats a plain string as markup, not
# as a path, so the file has to be opened and the handle passed in. Binary
# mode lets bs4 detect the document's encoding itself.
with open('test.html', 'rb') as html_file:
    s = BeautifulSoup(html_file, 'lxml')

print(s.prettify())
效果是一樣的。
# Sample markup used by the following examples: one <li> wrapping a <div>
# that contains two <a> elements.
html2 = ''' <li class="bus_postbd item masonry_brick"> <div class="bus_vtem"> <a href="https://www.aisinei.org/thread-17846-1-1.html" title="XIUREN秀人網 2018.11.13 NO.1228 貓寶 [50+1P]" class="preview" target="_blank"> "hello world" <img src="https://i.asnpic.win/block/a4/a42e6c63ef1ae20a914699f183d5204b.jpg" width="250" height="375" alt="XIUREN秀人網 2018.11.13 NO.1228 貓寶 [50+1P]"/> <span class="bus_listag">XIUREN秀人網</span> </a> <a href="https://www.aisinei.org/thread-17846-1-1.html" title="XIUREN秀人網 2018.11.13 NO.1228 貓寶 [50+1P]" target="_blank"> <div class="lv-face"><img src="https://www.aisinei.org/uc_server/avatar.php?uid=2&size=small" alt="發佈組小樂"/></div> <div class="t">XIUREN秀人網 2018.11.13 NO.1228 貓寶 [50</div> <div class="i"><span><i class="bus_showicon bus_showicon_v"></i>6402</span><span><i class="bus_showicon bus_showicon_r"></i>1</span></div> </a> </div> </li> '''
s2 = BeautifulSoup(html2, 'lxml')

# Attribute-style access (s2.a) returns the first matching tag in the document.
first_link = s2.a
print(first_link)        # the tag itself
print(first_link.name)   # the tag's name
print(first_link.attrs)  # the tag's attributes
節點(Tag)就是 li、a、div 這類標籤。可以看出,通過屬性訪問(如 s2.a)會選出第一個匹配的結果。節點 Tag 也有名字,通過 .name 訪問;通過 .attrs 可以獲取節點的屬性。
獲取節點文本通過 .string 即可;獲取節點及其子孫節點的全部文本可以通過 .text。注意:當節點包含多個子節點時,.string 返回 None。
# Compare the two ways of reading text from the first <a> tag.
first_link = s2.a
# .string yields a value only when the tag has a single child.
print(first_link.string)
# .text concatenates the text of the tag and all of its descendants.
print(first_link.text)
獲取節點的子節點,可以用 .contents,也可以用 .children:.contents 返回列表形式的直接子節點,.children 返回的是一個可迭代對象(迭代器)。
# Inspect the direct children of the first <div>.
outer_div = s2.div
print(outer_div.contents)     # list of direct children
print(outer_div.children)     # iterator over the same children
print(outer_div.contents[0])  # first direct child
for child in outer_div.children:
    print(child)
前兩個輸出對應的是同一組子節點:.contents 直接打印出列表,而 .children 打印出來的是迭代器對象本身,需要遍歷才能看到內容;後面的分別是取第一個子節點,以及遍歷每一個子節點。同樣的道理,子孫節點、父節點、祖先節點、兄弟節點都採用這種方式獲取:
# Navigate the tree starting from the first <div> and the first <a>.
outer_div = s2.div
first_link = s2.a

# All descendants (children, grandchildren, ...)
print(outer_div.descendants)
# All ancestors
print(outer_div.parents)
# Direct parent
print(outer_div.parent)
# Next sibling
print(first_link.next_sibling)
# Previous sibling
print(first_link.previous_sibling)
# Two equivalent ways to read an attribute of the first <a> tag.
first_link = s2.a
print(first_link["href"])      # dictionary-style lookup
print(first_link.get("href"))  # .get() lookup
如上兩種方式都能獲取屬性
常用的篩選函數有 find_all 和 find:find_all 返回所有匹配的結果,find 只返回第一個匹配的結果。
# `re` is needed for the regular-expression filter below and is not imported
# by any earlier snippet.
import re

# find() returns the first match only.
print(s2.find('a'))
# find_all() returns every match; the filter can be a tag name, ...
print(s2.find_all('a'))
# ... a compiled regular expression matched against tag names, ...
print(s2.find_all(re.compile("^div")))
# ... or a list of tag names.
print(s2.find_all(["div", "li"]))
可以看出 find_all 傳遞的參數可以是字符串、正則表達式、列表等等。其他的方法和屬性訪問類似,有 find_parents()、find_next_siblings() 等等,用的時候再查吧。
如果你熟悉 CSS 選擇器的語法,BeautifulSoup 同樣支持(select 方法),而且非常方便。
# Select every <a> element (the original comment said "div", but the selector
# is 'a').
print(s2.select('a'))
# Select elements whose class is bus_vtem.
print(s2.select('.bus_vtem'))
# Select the element whose id is ps. No element in html2 carries an id, so
# this yields an empty result.
print(s2.select('#ps'))
到目前爲止,BeautifulSoup 的基本用法已經介紹完。下面實戰抓取一段 HTML,並用 BeautifulSoup 解析、提取我們需要的數據:這裏解析一個美女圖站的更新首頁,提取其中每個資源的標題和地址。
#-*-coding:utf-8-*-
# Fetch the portal page of www.aisinei.org and print the title and link of
# every entry listed on it.
import re
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
# NOTE(review): hard-coded browser session cookies, including an auth token.
# Replace them with your own values and avoid committing real credentials.
COOKIES = '__cfduid=d78f862232687ba4aae00f617c0fd1ca81537854419; bg5D_2132_saltkey=jh7xllgK; bg5D_2132_lastvisit=1540536781; bg5D_2132_auth=479fTpQgthFjwwD6V1Xq8ky8wI2dzxJkPeJHEZyv3eqJqdTQOQWE74ttW1HchIUZpgsyN5Y9r1jtby9AwfRN1R89; bg5D_2132_lastcheckfeed=7469%7C1541145866; bg5D_2132_ulastactivity=2bbfoTOtWWimnqaXyLbTv%2Buq4ens5zcXIiEAhobA%2FsWLyvpXVM9d; bg5D_2132_sid=wF3g17; Hm_lvt_b8d70b1e8d60fba1e9c8bd5d6b035f4c=1540540375,1540955353,1541145834,1541562930; Hm_lpvt_b8d70b1e8d60fba1e9c8bd5d6b035f4c=1541562973; bg5D_2132_lastact=1541562986%09home.php%09spacecp'


class AsScrapy(object):
    """Small scraper for the site's portal (overview) page.

    Holds a requests session, the request headers and a cookie jar built
    from the module-level COOKIES string.
    """

    def __init__(self, pages=1):
        """Prepare the HTTP session, headers and cookie jar.

        Args:
            pages: Accepted for interface compatibility; nothing in this
                class reads it.
        """
        self.m_session = requests.Session()
        self.m_headers = {
            'User-Agent': USER_AGENT,
            #'referer':'https://www.aisinei.org/',
        }
        self.m_cookiejar = requests.cookies.RequestsCookieJar()
        for fragment in COOKIES.split(';'):
            # Entries are separated by "; ", so strip the fragment; otherwise
            # every cookie name after the first keeps a leading space.
            fragment = fragment.strip()
            if not fragment:
                continue  # tolerate a trailing ';'
            key, sep, value = fragment.partition('=')
            if not sep or not key:
                # Report the malformed entry and keep the remaining cookies
                # instead of abandoning initialisation.
                print('init error!!!', repr(fragment))
                continue
            self.m_cookiejar.set(key, value)

    def getOverView(self):
        """Download the portal page and print each entry's title and URL.

        Entries are the elements with class ``bus_vtem``; for each one the
        first ``<a>`` supplies the ``title`` and ``href``. The pinned
        "urgent notice" entry and entries without a link, title or href are
        skipped. Network, HTTP and decoding failures are reported on stdout
        and end the call without raising.
        """
        try:
            req = self.m_session.get(
                'https://www.aisinei.org/portal.php',
                headers=self.m_headers,
                cookies=self.m_cookiejar,
                timeout=5,
            )
            # Surface HTTP errors instead of parsing an error page.
            req.raise_for_status()
            page = req.content.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError) as err:
            print('get over view error', err)
            return

        soup = BeautifulSoup(page, 'lxml')
        for item in soup.find_all(attrs={'class': 'bus_vtem'}):
            link = item.a
            if link is None:
                continue
            title = link.get('title')
            href = link.get('href')
            if title is None or href is None:
                continue
            if title == "緊急通知!緊急通知!緊急通知!":
                continue
            print(title)
            print(href)
            # Pause one second between entries, as the original loop does.
            time.sleep(1)


if __name__ == "__main__":
    asscrapy = AsScrapy()
    asscrapy.getOverView()