Simply put, Beautiful Soup is a Python library whose main job is scraping data from web pages. The official description is as follows:
''' Beautiful Soup provides a handful of simple, Pythonic functions for navigating, searching, and modifying the parse tree. It is a toolkit that parses a document and hands you the data you need to scrape; because it is so simple, a complete application takes very little code. '''
Installation
# Install Beautiful Soup
pip install beautifulsoup4

# Install a parser.
# Beautiful Soup supports the HTML parser in the Python standard library, plus a
# number of third-party parsers; one of them is lxml. Depending on your operating
# system, lxml can be installed in any of the following ways:
$ apt-get install python-lxml
$ easy_install lxml
$ pip install lxml

# Another option is html5lib, a pure-Python parser that parses HTML the same way
# a browser does. html5lib can be installed in any of the following ways:
$ apt-get install python-html5lib
$ easy_install html5lib
$ pip install html5lib
Parsers
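The parser is selected by the second argument to the BeautifulSoup constructor. A minimal sketch of the three common choices (the sample HTML string is made up for illustration):

from bs4 import BeautifulSoup

html = "<html><body><p>Hello</p></body></html>"

# "html.parser" ships with the Python standard library: no extra install needed.
print(BeautifulSoup(html, "html.parser").p)
# "lxml" is a fast third-party parser written in C (pip install lxml).
print(BeautifulSoup(html, "lxml").p)
# "html5lib" is pure Python and parses pages the way a browser does
# (pip install html5lib); it is the most lenient but also the slowest.
print(BeautifulSoup(html, "html5lib").p)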
Basic usage
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Option 1: parse from a string
# soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# Option 2: parse from a file
# soup = BeautifulSoup(open('a.html'), "lxml")
# print(soup)

soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# prettify() returns a neatly indented copy of the document (it does not modify soup in place)
print(soup.prettify())

# Get the first p tag
print(soup.p)
# Get the b tag inside the p tag
print(soup.p.b)
# Get the text of the b tag inside the p tag
print(soup.p.b.text)
# List all direct children of body
print(soup.body.contents)
# Get the attributes of the p tag
print(soup.p.attrs)
# Get the children of the p tag (returns an iterator)
print(list(soup.p.children))
# Get all descendants of the p tag
print(list(soup.p.descendants))
# Get the parent of the p tag
print(soup.p.parent)
# Get the parent of the p tag, then its parent, and so on up the tree
print(list(soup.p.parents))
# Get the href attribute of the first a tag
print(soup.a.attrs['href'])
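The example above walks down (children, descendants) and up (parent, parents) the tree; bs4 also navigates sideways between siblings. A short sketch, reusing the soup object built from html_doc above:

# The node immediately after the first a tag (here a text node, not a tag)
print(soup.a.next_sibling)
# The node immediately before the first a tag
print(soup.a.previous_sibling)
# All following siblings of the first a tag
print(list(soup.a.next_siblings))
# All preceding siblings of the first a tag
print(list(soup.a.previous_siblings))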
Searching the document tree
from bs4 import BeautifulSoup
import re

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, "lxml")

# Searching the document tree

# 1. Search by name / string
# Find all p tags by name
print(soup.find_all(name='p'))
# Find p tags whose string is $37 (newer bs4 versions call this parameter string=)
print(soup.find_all(name='p', text='$37'))
# Find the a tag whose id is link3
print(soup.find_all(name='a', attrs={"id": "link3"}))

# 2. Search with regular expressions
# Find all tags whose name starts with p
print(soup.find_all(name=re.compile("^p")))
# Find all tags whose name starts with a
print(soup.find_all(name=re.compile("^a")))
# Find all p tags whose id starts with link (none here, so this returns [])
print(soup.find_all(name="p", attrs={"id": re.compile("^link")}))
# Find all a tags whose id starts with link
print(soup.find_all(name="a", attrs={"id": re.compile("^link")}))
# Find all p tags whose class matches story
print(soup.find_all(name="p", attrs={"class": re.compile("story")}))

# 3. Search with lists
# Find all p and a tags
print(soup.find_all(name=['p', 'a']))
# Find all strings matching the regex Elsie
print(soup.find_all(text=[re.compile("Elsie")]))
# Find all a tags whose string matches the regex Elsie
print(soup.find_all(name=['a'], text=[re.compile("Elsie")]))

# 4. Search with True
# Get all tags
print(soup.find_all(name=True))
# Get all a tags that have an id attribute
print(soup.find_all(name="a", attrs={"id": True}))
# Get all a tags that have a class attribute
print(soup.find_all(name="a", attrs={"class": True}))

# 5. Search with a function
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

# Find all tags that have a class attribute but no id attribute
print(soup.find_all(has_class_but_no_id))
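Beyond find_all, bs4 offers find (first match only), a few keyword arguments that shape the search, and CSS selectors via select. A brief sketch, again against the soup object above:

# find returns the first match (or None) instead of a list
print(soup.find(name='a'))
# limit caps the number of results
print(soup.find_all(name='a', limit=2))
# recursive=False searches direct children only, not all descendants
print(soup.html.find_all(name='body', recursive=False))
# select takes a CSS selector and always returns a list
print(soup.select('a.sister'))
print(soup.select('#link2'))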
Upvoting every post on pages 1-100 of Chouti's new-and-hot list
import requests

# Step 1: request the home page to get its cookies
index_res = requests.get('https://dig.chouti.com/', headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
})
# print(index_res.cookies.get_dict())
index_cookies = index_res.cookies.get_dict()

'''
url: https://dig.chouti.com/login
method: POST
headers:
    user-agent:
    referer:
    cookie
form_data:
    phone: 8615622792660
    password: k46709394
    oneMonth: 1
'''

# Step 2: log in, carrying the home-page cookies
login_url = 'https://dig.chouti.com/login'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'referer': 'https://dig.chouti.com/',
}
form_data = {
    'phone': '8615622792660',
    'password': 'k46709394',
    'oneMonth': '1'
}
# Send a POST request to the login URL
login_res = requests.post(login_url, cookies=index_cookies, headers=headers, data=form_data)
print(login_res)
# The response body shows whether the login succeeded
print(login_res.text)

# Upvote the single post with id 25068561
# vote_res = requests.post('https://dig.chouti.com/link/vote?linksId=25068561', headers=headers, cookies=index_cookies)
# print(vote_res.text)

'''
url: https://dig.chouti.com/all/hot/recent/1
headers:
'''

from bs4 import BeautifulSoup

# Step 3: loop over pages 1-100 of the hot list
for line in range(1, 101):
    hot_url = 'https://dig.chouti.com/all/hot/recent/%s' % line
    # GET the hot-list page to fetch all posts on it
    hot_res = requests.get(hot_url, headers=headers)
    # print(hot_res.text)

    soup = BeautifulSoup(hot_res.text, 'lxml')
    # The div with id content-list wraps all posts on the page
    div_1 = soup.find(name='div', attrs={"id": "content-list"})
    # print(div_1)
    # Each post lives in a div with class item
    div_s = soup.find_all(name='div', attrs={"class": "item"})
    for div in div_s:
        # Get the post id: find the div with class part2 and read its share-linkid attribute
        link_id = div.find(name='div', attrs={"class": "part2"}).get("share-linkid")
        # print(link_id)
        # Upvote the post
        vote_res = requests.post('https://dig.chouti.com/link/vote?linksId=%s' % link_id,
                                 headers=headers, cookies=index_cookies)
        print(vote_res.text)

'''
Sample response:
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53191784515","likedTime":"1552034126796000","lvCount":"6","nick":"人生苦短灬及時行樂","uvCount":"430","voteTime":"小於1分鐘前"}}}
'''
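The script above threads the home-page cookies through every request by hand. A requests.Session object tracks cookies automatically, which simplifies the same login flow; a minimal sketch, reusing the URLs and form fields from the script above:

import requests

session = requests.Session()
# Headers set on the session are sent with every request it makes
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'referer': 'https://dig.chouti.com/',
})

# The session records the cookies set by the home page...
session.get('https://dig.chouti.com/')
# ...and sends them back automatically on the login request
login_res = session.post('https://dig.chouti.com/login', data={
    'phone': '8615622792660',
    'password': 'k46709394',
    'oneMonth': '1',
})
print(login_res.text)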