Introduction
The Python standard library provides modules such as urllib, urllib2 and httplib for making HTTP requests, but their APIs are clumsy. They were built for another era and another internet, and they demand a huge amount of work (even overriding various methods) to accomplish the simplest tasks.
Requests is an Apache2-licensed HTTP library written in Python. It is a high-level wrapper around Python's built-in modules, which makes sending network requests from Python far more pleasant: with Requests you can easily do anything a browser can do.
The essence of a crawler: imitate the behavior of a browser and scrape information from web pages.
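As a quick illustration of that idea, here is a minimal sketch (assuming the requests and beautifulsoup4 packages introduced below, and a hypothetical target URL) that fetches a page with a browser-like User-Agent and prints its title:

import requests
from bs4 import BeautifulSoup

# Pretend to be a browser by sending a typical User-Agent header
ret = requests.get(
    'https://example.com/',  # hypothetical target page
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
)
soup = BeautifulSoup(ret.text, 'html.parser')
print(soup.title.string)  # the page title a browser would display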
Request methods

1. GET requests

# 1. Example without parameters
import requests

ret = requests.get('https://github.com/timeline.json')
print(ret.url)
print(ret.text)

# 2. Example with parameters
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)
print(ret.url)
print(ret.text)
2. POST requests
# 1. Basic POST example
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)
print(ret.text)

# 2. Example sending request headers and data
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
print(ret.cookies)
3. Other requests
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the above methods are built on top of this one
requests.request(method, url, **kwargs)
Request parameters
1 url
2 headers
3 cookies
4 params
5 data: send the request body

    requests.post(
        ...,
        data={'user': 'alex', 'pwd': '123'}
    )

    GET /index HTTP/1.1\r\nhost:c1.com\r\n\r\nuser=alex&pwd=123

6 json: send the request body as JSON

    requests.post(
        ...,
        json={'user': 'alex', 'pwd': '123'}
    )

    GET /index HTTP/1.1\r\nhost:c1.com\r\nContent-Type:application/json\r\n\r\n{"user":"alex","pwd":123}

7 proxies: proxy servers

    # Without authentication
    proxie_dict = {
        "http": "61.172.249.96:80",
        "https": "http://61.185.219.126:3128",
    }
    ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxie_dict)

    # With proxy authentication
    from requests.auth import HTTPProxyAuth

    proxyDict = {
        'http': '77.75.105.165',
        'https': '77.75.106.165'
    }
    auth = HTTPProxyAuth('username', 'password')

    r = requests.get("http://www.google.com", data={'xxx': 'ffff'}, proxies=proxyDict, auth=auth)
    print(r.text)

8 files: file upload

    # Send a file
    file_dict = {
        'f1': open('xxxx.log', 'rb')
    }
    requests.request(
        method='POST',
        url='http://127.0.0.1:8000/test/',
        files=file_dict
    )

9 auth: authentication

    Internally: the username and password are encoded and placed in a request header that is sent to the server.
    - "user:password"
    - base64("user:password")
    - "Basic base64("user:password")"
    - Request header: Authorization: "Basic base64("user:password")"

    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
    print(ret.text)

10 timeout

    # ret = requests.get('http://google.com/', timeout=1)
    # print(ret)

    # ret = requests.get('http://google.com/', timeout=(5, 1))
    # print(ret)

11 allow_redirects: whether to follow redirects

    ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
    print(ret.text)

12 stream: download large responses in chunks (a variant that writes the stream to a local file is sketched after this list)

    from contextlib import closing
    with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
        # Process the response here.
        for i in r1.iter_content():
            print(i)

13 cert: client certificate

    - Baidu, Tencent and the like => no certificate needed (the system handles it for you)
    - Custom certificate:
      requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
      requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem", "xxx.xxx.xx.key"))

14 verify: server certificate verification, e.g. verify=False

More parameters and examples in the official documentation: http://cn.python-requests.org/zh_CN/latest/user/quickstart.html#id4
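The stream example above only prints the chunks; a common pattern (a minimal sketch, with a hypothetical URL and output filename) is to write each chunk to a local file so the whole response never has to fit in memory:

from contextlib import closing
import requests

# Stream a large response to disk chunk by chunk instead of holding it all in memory
with closing(requests.get('http://127.0.0.1:8000/bigfile/', stream=True)) as r:  # hypothetical URL
    with open('bigfile.bin', 'wb') as f:  # hypothetical output filename
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)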
About auth authentication
auth authentication: the browser's pop-up login dialog.

Internally: the username and password are encoded and placed in a request header that is sent to the server.
- "user:password"
- base64("user:password")
- "Basic base64("user:password")"
- Request header: Authorization: "Basic base64("user:password")"

requests' HTTPBasicAuth performs these steps for you:

from requests.auth import HTTPBasicAuth, HTTPDigestAuth

ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
print(ret.text)
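To make those steps concrete, here is a minimal sketch (with placeholder credentials, against httpbin's basic-auth endpoint) that builds the Authorization header by hand and sends the same thing HTTPBasicAuth would send:

import base64
import requests
from requests.auth import HTTPBasicAuth

user, password = 'user', 'secret'  # placeholder credentials

# Manually build "Basic base64(user:password)"
token = base64.b64encode((user + ':' + password).encode('utf-8')).decode('ascii')
headers = {'Authorization': 'Basic ' + token}

# Both requests carry the same Authorization header
ret1 = requests.get('http://httpbin.org/basic-auth/user/secret', headers=headers)
ret2 = requests.get('http://httpbin.org/basic-auth/user/secret', auth=HTTPBasicAuth(user, password))
print(ret1.status_code, ret2.status_code)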
def param_method_url():
    ret = requests.request(method='get', url='http://127.0.0.1:8000/test/')
    ret = requests.request(method='post', url='http://127.0.0.1:8000/test/')
import requests

requests.get(url='http://127.0.0.1:8000/test/', params={'k1': 'v1', 'k2': 'v2'})
# Essentially the same as requests.get(url='xxxxx?k1=v1&k2=v2')
# data can be a dict
# data can be a string
# data can be bytes
# data can be a file object

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data={'k1': 'v1', 'k2': '水電費'})

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1; k2=v2; k3=v3; k3=v4"
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1;k2=v2;k3=v3;k3=v4",
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data=open('data_file.py', mode='r', encoding='utf-8'),  # file content: k1=v1;k2=v2;k3=v3;k3=v4
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )
# If the request body is a JSON payload, pass it via the json parameter
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水電費'})
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
)
ret1_cookies = ret1.cookies.get_dict()
# ret1.cookies is the cookie jar returned by the URL we visited;
# get_dict() converts it into a plain dict of cookies
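Those cookies can then be sent back on a follow-up request so the server treats it as the same client. A minimal sketch with placeholder form fields (the login endpoint mirrors the Chouti auto-login example later in this post):

import requests

# First request: capture the cookies the site hands out
ret1 = requests.get('https://dig.chouti.com/')
ret1_cookies = ret1.cookies.get_dict()

# Second request: send those cookies back with the next request
ret2 = requests.post(
    url='https://dig.chouti.com/login',
    data={'phone': '86xxxxxxxxxxx', 'password': 'xxx', 'oneMonth': ''},  # placeholder credentials
    cookies=ret1_cookies,
)
print(ret2.cookies.get_dict())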
# Send request headers to the server
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水電費'},
                 headers={'Content-Type': 'application/x-www-form-urlencoded'}
                 )
# Which request headers are needed depends on the server
# Send a file
# file_dict = {
#     'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename
# file_dict = {
#     'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename and inline content
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename, content type and extra headers
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

pass
Set a timeout: if the request takes longer than this, stop waiting.

# ret = requests.get('http://google.com/', timeout=1)
# print(ret)

# ret = requests.get('http://google.com/', timeout=(5, 1))
# print(ret)

pass
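A timeout raises an exception rather than returning a response, so in practice it is usually wrapped in try/except. A minimal sketch using the tuple form above (5 s connect timeout, 1 s read timeout):

import requests

try:
    # (connect timeout, read timeout) in seconds
    ret = requests.get('http://google.com/', timeout=(5, 1))
    print(ret.status_code)
except requests.exceptions.Timeout:
    print('request timed out')
except requests.exceptions.RequestException as e:
    print('request failed:', e)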
# Whether to follow redirects; defaults to True
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)
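With allow_redirects=False the redirect response itself comes back, so you can inspect where it would have gone. A short sketch against the same local test endpoint, assuming the server answers with a redirect:

ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.status_code)              # e.g. 301/302 if the server redirects
print(ret.headers.get('Location'))  # target URL of the redirect, if any
print(ret.history)                  # empty, because no redirect was followed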
BeautifulSoup

BeautifulSoup is a module that takes an HTML or XML string, parses it into a structured form, and then provides methods for quickly locating specific elements, which makes searching HTML or XML documents straightforward.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story總共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Find the first a tag
tag1 = soup.find(name='a')
# Find all a tags
tag2 = soup.find_all(name='a')
# Find the tag with id=link2
tag3 = soup.select('#link2')
Installation:
pip3 install beautifulsoup4
1. name: the tag's name

# tag = soup.find('a')
# name = tag.name    # get
# print(name)
# tag.name = 'span'  # set
# print(soup)
2. attrs: the tag's attributes

# tag = soup.find('a')
# attrs = tag.attrs          # get
# print(attrs)
# tag.attrs = {'ik': 123}    # set
# tag.attrs['id'] = 'iiiii'  # set
# print(soup)
3. children: all direct child nodes

# body = soup.find('body')
# v = body.children
4. descendants: all descendant nodes (children, grandchildren, and so on)

# body = soup.find('body')
# v = body.descendants
5. clear: empty out all of the tag's children (the tag itself is kept)

# tag = soup.find('body')
# tag.clear()
# print(soup)
6. decompose: recursively remove the tag and all of its children

# body = soup.find('body')
# body.decompose()
# print(soup)
7. extract: recursively remove the tag and all of its children, and return the removed tag

# body = soup.find('body')
# v = body.extract()
# print(soup)
8. decode: convert to a string (including the current tag); decode_contents (excluding the current tag)

# body = soup.find('body')
# v = body.decode()
# v = body.decode_contents()
# print(v)
9. encode: convert to bytes (including the current tag); encode_contents (excluding the current tag)

# body = soup.find('body')
# v = body.encode()
# v = body.encode_contents()
# print(v)
10. find: get the first matching tag

# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
11. find_all: get all matching tags

# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

# ####### Lists #######
# v = soup.find_all(name=['a', 'div'])
# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))

# v = soup.find_all(id=['link1', 'link2'])
# print(v)

# v = soup.find_all(href=['link1', 'link2'])
# print(v)

# ####### Regular expressions #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)

# ####### Function filters #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)

# ## get: retrieve a tag attribute
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
12. has_attr: check whether the tag has a given attribute

# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
13. get_text: get the text content inside the tag

# tag = soup.find('a')
# v = tag.get_text('id')  # the first argument is the separator placed between pieces of text
# print(v)
14. index: get a tag's index position within another tag

# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)

# tag = soup.find('body')
# for i, v in enumerate(tag):
#     print(i, v)
15. is_empty_element: whether the tag is an empty (void) or self-closing element,
i.e. one of the following tags: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'

# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)
16. The current tag's related tags

# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings

# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

# tag.parent
# tag.parents
17. Searching a tag's related tags

# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)

# These take the same parameters as find_all
18. select, select_one: CSS selectors

soup.select("title")
soup.select("p:nth-of-type(3)")
soup.select("body a")
soup.select("html head title")
tag = soup.select("span,a")
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")
soup.select(".sister")
soup.select("[class~=sister]")
soup.select("#link1")
soup.select("a#link2")
soup.select('a[href]')
soup.select('a[href="http://example.com/elsie"]')
soup.select('a[href^="http://example.com/"]')
soup.select('a[href$="tillie"]')
soup.select('a[href*=".com/el"]')

from bs4.element import Tag

def default_candidate_generator(tag):
    for child in tag.descendants:
        if not isinstance(child, Tag):
            continue
        if not child.has_attr('href'):
            continue
        yield child

tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
print(type(tags), tags)

from bs4.element import Tag

def default_candidate_generator(tag):
    for child in tag.descendants:
        if not isinstance(child, Tag):
            continue
        if not child.has_attr('href'):
            continue
        yield child

tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
print(type(tags), tags)
19. A tag's content

# tag = soup.find('span')
# print(tag.string)           # get
# tag.string = 'new content'  # set
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings    # recursively get the text of all tags inside
# print(v)
20. append: append a tag inside the current tag

# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
21. insert: insert a tag at a given position inside the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
22. insert_after, insert_before: insert after or before the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
23. replace_with: replace the current tag with the given tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
24. Creating relationships between tags

# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
25. wrap: wrap the current tag inside the given tag

# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一個新來的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
26. unwrap: remove the current tag but keep whatever it wraps

# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
More parameters in the official documentation: http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
A wave of "auto-login" examples

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests

# ############## Method 1 ##############
"""
# ## 1. First visit any page to obtain a cookie
i1 = requests.get(url="http://dig.chouti.com/help/service")
i1_cookies = i1.cookies.get_dict()

# ## 2. Log in, carrying the cookie from the previous request; the server authorizes the gpsd value in the cookie
i2 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    },
    cookies=i1_cookies
)

# ## 3. Upvote (only the already-authorized gpsd cookie needs to be sent)
gpsd = i1_cookies['gpsd']
i3 = requests.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523",
    cookies={'gpsd': gpsd}
)
print(i3.text)
"""

# ############## Method 2 ##############
"""
import requests

session = requests.Session()
i1 = session.get(url="http://dig.chouti.com/help/service")
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    }
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523"
)
print(i3.text)
"""

抽屜新熱榜 (dig.chouti.com)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

# ############## Method 1 ##############
#
# # 1. Visit the login page and get the authenticity_token
# i1 = requests.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Send the login request with the authenticity_token, username and password
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "wupeiqi@live.com",
#     'password': 'xxoo'
# }
#
# i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "Project: %s(%s); project path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)

# ############## Method 2 ##############
# session = requests.Session()
# # 1. Visit the login page and get the authenticity_token
# i1 = session.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Send the login request with the authenticity_token, username and password
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "wupeiqi@live.com",
#     'password': 'xxoo'
# }
#
# i2 = session.post('https://github.com/session', data=form_data)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = session.get('https://github.com/settings/repositories')
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "Project: %s(%s); project path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)

GitHub
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup

session = requests.Session()

i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })

with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

captcha = input('Open zhihu.gif and enter the captcha it shows: ')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    "captcha": captcha,
    'email': '424662508@qq.com'
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)

知乎 (Zhihu)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import json
import base64

import rsa
import requests


def js_encrypt(text):
    b64der = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    der = base64.standard_b64decode(b64der)

    pk = rsa.PublicKey.load_pkcs1_openssl_der(der)
    v1 = rsa.encrypt(bytes(text, 'utf8'), pk)
    value = base64.encodebytes(v1).replace(b'\n', b'')
    value = value.decode('utf8')
    return value


session = requests.Session()

i1 = session.get('https://passport.cnblogs.com/user/signin')
rep = re.compile("'VerificationToken': '(.*)'")
v = re.search(rep, i1.text)
verification_token = v.group(1)

form_data = {
    'input1': js_encrypt('wptawy'),
    'input2': js_encrypt('asdfasdf'),
    'remember': False
}
i2 = session.post(url='https://passport.cnblogs.com/user/signin',
                  data=json.dumps(form_data),
                  headers={
                      'Content-Type': 'application/json; charset=UTF-8',
                      'X-Requested-With': 'XMLHttpRequest',
                      'VerificationToken': verification_token}
                  )

i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')
print(i3.text)

博客園 (cnblogs)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import requests

all_cookie = {}

# ############### 1. View the login page ###############
r1 = requests.get(
    url='https://passport.lagou.com/login/login.html',
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
)
all_cookie.update(r1.cookies.get_dict())

X_Anti_Forge_Token = re.findall(r"window.X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"window.X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# ############### 2. Log in with username and password ###############
r2 = requests.post(
    url='https://passport.lagou.com/login/login.json',
    headers={
        'Host': 'passport.lagou.com',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    },
    data={
        'isValidate': True,
        'username': '15131255089',
        'password': 'ab18d270d7126ea65915cc22c0d',
        'request_form_verifyCode': '',
        'submit': '',
    },
    cookies=r1.cookies.get_dict()
)
all_cookie.update(r2.cookies.get_dict())

# ############### 3. User authorization ###############
r3 = requests.get(
    url='https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r3.cookies.get_dict())

# ############### 4. User authentication (follow the redirect chain manually) ###############
r4 = requests.get(
    url=r3.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r4.cookies.get_dict())

r5 = requests.get(
    url=r4.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r5.cookies.get_dict())

r6 = requests.get(
    url=r5.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r6.cookies.get_dict())

r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r7.cookies.get_dict())

# ############### 5. View the profile page ###############
r5 = requests.get(
    url='https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    cookies=all_cookie
)
print('武沛齊' in r5.text)

# ############### 6. View account information ###############
r6 = requests.get(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'X-L-REQ-HEADER': "{deviceType:1}",
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
    },
    cookies=all_cookie
)
r6_json = r6.json()
all_cookie.update(r6.cookies.get_dict())

# ############### 7. Update profile information ###############
r7 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
        'X-Anit-Forge-Code': r6_json['submitCode'],
        'X-Anit-Forge-Token': r6_json['submitToken'],
        'X-L-REQ-HEADER': "{deviceType:1}",
    },
    cookies=all_cookie,
    json={"userName": "wupeiqi888", "sex": "MALE", "portrait": "images/myresume/default_headpic.png",
          "positionName": '...', "introduce": '....'}
)
print(r7.text)

拉勾網 (Lagou)
Example: a whitelist-based XSS filter implemented with BeautifulSoup.

from bs4 import BeautifulSoup


class XSSFilter(object):
    __instance = None

    def __init__(self):
        # XSS whitelist
        self.valid_tags = {
            "font": ['color', 'size', 'face', 'style'],
            'b': [],
            'div': [],
            "span": [],
            "table": [
                'border', 'cellspacing', 'cellpadding'
            ],
            'th': [
                'colspan', 'rowspan'
            ],
            'td': [
                'colspan', 'rowspan'
            ],
            "a": ['href', 'target', 'name'],
            "img": ['src', 'alt', 'title'],
            'p': ['align'],
            "pre": ['class'],
            "hr": ['class'],
            'strong': []
        }

    def __new__(cls, *args, **kwargs):
        # Singleton: only ever build one filter instance
        if not cls.__instance:
            obj = object.__new__(cls, *args, **kwargs)
            cls.__instance = obj
        return cls.__instance

    def process(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        # Iterate over all HTML tags
        for tag in soup.find_all():
            # Check whether the tag name is in the whitelist
            if tag.name not in self.valid_tags:
                tag.hidden = True
                if tag.name not in ['html', 'body']:
                    tag.hidden = True
                    tag.clear()
                continue
            # Attribute whitelist for the current tag
            attr_rules = self.valid_tags[tag.name]
            keys = list(tag.attrs.keys())
            for key in keys:
                if key not in attr_rules:
                    del tag[key]

        return soup.decode()  # the decoded string is the filtered content


content = """
<p class='c1' id='i1'>
    asdfaa<span style="font-family:NSimSun;" class='c1'>sdf<a>a</a>sdf</span>sdf
</p>
<p>
    <strong class='c2' id='i2'>asdf</strong>
    <script>alert(123)</script>
</p>
<h2>
    asdf
</h2>
"""

content = XSSFilter().process(content)
print('content', content)