Official documentation: http://cn.python-requests.org/zh_CN/latest/user/quickstart.html#id4
Python's standard library provides urllib, urllib2, httplib and other modules for making HTTP requests, but their APIs are clunky. They were built for another era, another internet, and even the simplest tasks take a huge amount of work, including overriding assorted methods.
Requests is an Apache2-licensed HTTP library written in Python. It is a high-level wrapper over Python's built-in modules, which makes issuing network requests from Python far more pleasant; with Requests you can easily do anything a browser can.
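For a first taste, here is a minimal sketch (httpbin.org is a public echo service used purely for illustration):

import requests

ret = requests.get('http://httpbin.org/get')
print(ret.status_code)              # 200
print(ret.headers['Content-Type'])  # a response header
print(ret.text)                     # the body as text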
1. GET requests
# 1. Without parameters
import requests

ret = requests.get('https://github.com/timeline.json')
print(ret.url)
print(ret.text)

# 2. With parameters
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)
print(ret.url)
print(ret.text)
2. POST requests
# 1. Basic POST
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)
print(ret.text)

# 2. Sending headers and data
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
print(ret.cookies)
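Newer versions of requests can do the same thing with the json= parameter, which calls json.dumps for you and sets the Content-Type header automatically (a minimal sketch of the equivalent call):

import requests

# Equivalent to data=json.dumps(payload) plus a JSON Content-Type header
ret = requests.post('https://api.github.com/some/endpoint', json={'some': 'data'})
print(ret.text)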
3. Other request methods
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the methods above are built on top of this one:
requests.request(method, url, **kwargs)
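For example, the following two calls produce identical requests (a minimal sketch):

import requests

# requests.get is just requests.request with method='GET'
r1 = requests.get('http://httpbin.org/get', params={'k1': 'v1'})
r2 = requests.request('GET', 'http://httpbin.org/get', params={'k1': 'v1'})
print(r1.url == r2.url)  # True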
4. More parameters
Parameters

2.1 url
requests.post("http://httpbin.org/post", data=payload)
requests.request(method='post', url='http://127.0.0.1:8000/test/')

2.2 headers
# Send custom request headers to the server
requests.request(
    method='POST', url='http://127.0.0.1:8000/test/',
    json={'k1': 'v1', 'k2': '水電費'},
    headers={'Content-Type': 'application/x-www-form-urlencoded'}
)

2.3 cookies
# Send cookies to the server
requests.request(
    method='POST',
    url='http://127.0.0.1:8000/test/',
    data={'k1': 'v1', 'k2': 'v2'},
    cookies={'cook1': 'value1'},
)
# A CookieJar can also be used (the dict form is just a wrapper over it)
from http.cookiejar import CookieJar
from http.cookiejar import Cookie
obj = CookieJar()
obj.set_cookie(Cookie(
    version=0, name='c1', value='v1', port=None, domain='', path='/',
    secure=False, expires=None, discard=True, comment=None, comment_url=None,
    rest={'HttpOnly': None}, rfc2109=False, port_specified=False,
    domain_specified=False, domain_initial_dot=False, path_specified=False
))
requests.request(method='POST', url='http://127.0.0.1:8000/test/', data={'k1': 'v1', 'k2': 'v2'}, cookies=obj)
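requests also provides its own jar, requests.cookies.RequestsCookieJar, which is easier to populate than http.cookiejar.Cookie (a minimal sketch against the same assumed local test endpoint):

import requests
from requests.cookies import RequestsCookieJar

jar = RequestsCookieJar()
jar.set('cook1', 'value1', domain='127.0.0.1', path='/')
requests.request(method='POST', url='http://127.0.0.1:8000/test/', data={'k1': 'v1'}, cookies=jar)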
2.4 params

# - can be a dict
# - can be a string
# - can be bytes (ASCII-encodable)
requests.request(method='get', url='http://127.0.0.1:8000/test/', params={'k1': 'v1', 'k2': '水電費'})
requests.request(method='get', url='http://127.0.0.1:8000/test/', params="k1=v1&k2=水電費&k3=v3&k3=vv3")
requests.request(method='get', url='http://127.0.0.1:8000/test/', params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))

2.5 data, the request body
requests.request(
    method='POST',
    url='http://127.0.0.1:8000/test/',
    data="k1=v1;k2=v2;k3=v3;k3=v4",
    headers={'Content-Type': 'application/x-www-form-urlencoded'}
)
requests.request(
    method='POST',
    url='http://127.0.0.1:8000/test/',
    data=open('data_file.py', mode='r', encoding='utf-8'),
    # file contents: k1=v1;k2=v2;k3=v3;k3=v4
    headers={'Content-Type': 'application/x-www-form-urlencoded'}
)

requests.post(..., data={'user': 'root', 'pwd': '123'})
# the dict is urlencoded into the body, e.g. on the wire:
# GET /index HTTP/1.1\r\nhost: c1.com\r\n\r\nuser=root&pwd=123

2.6 json, the request body

# Serializes the given data to a JSON string with json.dumps(...),
# sends it as the request body, and sets Content-Type to application/json
requests.request(
    method='POST', url='http://127.0.0.1:8000/test/', json={'k1': 'v1', 'k2': '水電費'}
)
2.7 proxies
# Without proxy authentication
proxie_dict = {
    "http": "77.75.105.165",
    "https": "77.75.105.123",
}
ret = requests.get("http://www.baidu.com", proxies=proxie_dict)
print(ret.headers)
# Proxy with authentication
from requests.auth import HTTPProxyAuth
proxyDict = { 'http': '77.75.105.165', 'https': '77.75.105.165' }
auth = HTTPProxyAuth('username', 'mypassword')
r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
print(r.text)
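Credentials can also be embedded directly in the proxy URL, which makes HTTPProxyAuth unnecessary (a minimal sketch; the host, port, and credentials are placeholders):

import requests

proxies = {
    'http': 'http://username:mypassword@77.75.105.165:8080',
    'https': 'http://username:mypassword@77.75.105.165:8080',
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.text)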
2.8 uploading files, files
# Upload a file
file_dict = { 'f1': open('xxxx.log','rb') }
requests.request(method='POST', url='http://127.0.0.1:8000/test/', files=file_dict)
# Upload a file with a custom filename
file_dict = {
    'f1': ('test.txt', open('readme', 'rb'))
}
requests.request(method='POST', url='http://127.0.0.1:8000/test/', files=file_dict)

# Upload in-memory content under a custom filename
file_dict = {
    'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
}
requests.request(method='POST', url='http://127.0.0.1:8000/test/', files=file_dict)
# Upload with a custom filename, content type, and per-file headers
file_dict = {
'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
}
requests.request(method='POST', url='http://127.0.0.1:8000/test/', files=file_dict)
2.9 authentication, auth
Internally, the username and password are joined, base64-encoded, and sent to the backend in a request header:
    - "user:password" -> base64("user:password")
    - "Basic base64(user:password)"
    - Request header: Authorization: "Basic base64(user:password)"
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('qwqw', '11213'))
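What HTTPBasicAuth does can be reproduced by hand, which makes the header construction above concrete (a minimal sketch):

import base64
import requests

user, password = 'qwqw', '11213'
token = base64.b64encode(bytes('%s:%s' % (user, password), 'utf8')).decode('utf8')
# HTTPBasicAuth builds exactly this kind of header:
ret = requests.get('https://api.github.com/user', headers={'Authorization': 'Basic %s' % token})
print(ret.status_code)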
2.10 timeouts
ret1 = requests.get('http://google.com/', timeout=1)  # one value: applied to both connect and read, in seconds
print(ret1)
ret2 = requests.get('http://google.com/', timeout=(5, 1))  # tuple: (connect timeout, read timeout)
print(ret2)
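When the limit is exceeded, requests raises requests.exceptions.Timeout, so slow endpoints can be handled explicitly (a minimal sketch):

import requests

try:
    ret = requests.get('http://google.com/', timeout=(5, 1))
    print(ret.status_code)
except requests.exceptions.Timeout:
    print('request timed out')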
2.11 whether to follow redirects, allow_redirects

ret = requests.get("http://127.0.0.1:8000/test/", allow_redirects=False)
print(ret.text)
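With redirects disabled, the 30x response itself comes back and the target can be read from the Location header; with redirects enabled, response.history records the intermediate hops (a minimal sketch against a public URL):

import requests

ret = requests.get('http://github.com', allow_redirects=False)
print(ret.status_code)              # e.g. 301
print(ret.headers.get('Location'))  # where the redirect points

ret = requests.get('http://github.com')  # followed by default
print(ret.history)                       # the intermediate 30x responses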
2.12 streaming large downloads, stream
ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
print(ret.content)
ret.close()
from contextlib import closing
with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
    # process the response here
    for i in r1.iter_content():
        print(i)
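For genuinely large files, iterate in fixed-size chunks and write straight to disk so the whole body never sits in memory (a minimal sketch; the URL and filename are placeholders):

from contextlib import closing
import requests

with closing(requests.get('http://127.0.0.1:8000/bigfile', stream=True)) as r:
    with open('bigfile.bin', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)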
2.13 certificates, cert
- Baidu, Tencent => no certificate needs to be supplied (the system trust store is used automatically)
- custom certificate
requests.get('http://127.0.0.1:8000/test/',cert="xxxx/xxx/xxx.pem")
requests.get('http://127.0.0.1:8000/test/',cert=("xxxx/xxx/xxx.pem","xxx.xx.xx.key"))
2.14 verification, verify

verify = False  # disable TLS certificate verification
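A minimal sketch; disabling verification makes requests emit an InsecureRequestWarning, so it should only be used for testing:

import requests

# Skip TLS certificate verification (testing only)
ret = requests.get('https://127.0.0.1:8000/test/', verify=False)
print(ret.status_code)

# verify can instead point at a CA bundle to trust:
# ret = requests.get('https://127.0.0.1:8000/test/', verify='/path/to/ca-bundle.crt')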
2.15 session
import requests

session = requests.Session()

### 1. Visit any page first to obtain a cookie
i1 = session.get(url="http://dig.chouti.com/help/service")

### 2. Log in carrying the previous cookie; the backend authorizes the gpsd value inside it
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxxxxx",
        'oneMonth': ""
    }
)

i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=8589623",
)
print(i3.text)
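Besides cookies, a Session keeps connection pooling and any defaults set on it, so shared headers only need to be declared once (a minimal sketch):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'my-crawler/1.0'})  # sent with every request
session.cookies.set('cook1', 'value1')                    # persisted across requests

r = session.get('http://httpbin.org/headers')
print(r.text)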
5. Summary
1. Basic crawler operations
    a. Crawlers
        - targeted
        - untargeted
    b. Download the page
        Filtering: regular expressions

========= open-source modules =========

1. The requests module
    - methods
    - parameters
    - session
        session = requests.Session()
        session.get()
        session.post()

    response = requests.get('http://www.autohome.com.cn/news/')
    response.text

    Summary:
        response = requests.get('URL')
        response.text
        response.content
        response.encoding
        response.apparent_encoding
        response.status_code

2. The BeautifulSoup module
    soup = BeautifulSoup(response.text, 'html.parser')
    target = soup.find(id='auto-channel-lazyload-article')

    Summary:
        find      # first match
        find_all  # all matches

        soup = BeautifulSoup('<html>...</html>', features='html.parser')
        v1 = soup.find('div')           # first div among soup's children
        v1 = soup.find(id='i1')         # first element with id='i1'
        v1 = soup.find('div', id='i1')  # first div with id='i1'
        v2 = soup.find_all('div')
        v2 = soup.find_all(id='i1')
        v2 = soup.find_all('div', id='i1')

        obj = v1
        obj = v2[0]
        obj.text   # the element's text
        obj.attrs  # the element's attributes

Logging in:
    Page reloads: form submission
    No page reload: Ajax submission

Task 2: log in to GitHub programmatically with requests

Task 3: crawl GitHub
    - with request headers
    - with cookies
    - request body:
        commit: Sign in
        utf8: ✓
        authenticity_token: iWlPKAsJ9nQNDaqC47P27GWx37a08iBv/0io8C4QPUluL1JxyWJSt0ZlgBBWv3BeFJ4ywbR5dKWzSqwzhILH6Q==
        login: Yun-Wangj
        password: yun258762

Task 4: log in to lagou.com
    - the password is encrypted
        - find the JS and reimplement the encryption in Python
        - or capture the ciphertext and replay it
    - the Referer header carries the previous request's URL and can be used for hotlink protection

Summary:
    Request headers:
        user-agent
        referer
        host
        cookie
    Special headers: inspect what the previous request returned, e.g. lagou.com
    Request body:
        - raw data
        - raw data + token
        - ciphertext
            - find the algorithm, or
            - reuse a captured ciphertext
    Two common patterns:
        - POST the login to get a cookie, then carry that cookie on later requests
        - GET an unauthorized cookie, POST the login carrying it to authorize it, then carry it afterwards
BeautifulSoup is a module that takes an HTML or XML string and parses it; you can then use the methods it provides to quickly locate a given element, which makes finding elements in HTML or XML simple.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story總共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")

# Find the first a tag
tag1 = soup.find(name='a')
# Find all a tags
tag2 = soup.find_all(name='a')
# Find the tag with id=link2
tag3 = soup.select('#link2')
Installation:
pip3 install beautifulsoup4
Usage example:
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
1. name, the tag name
# tag = soup.find('a')
# name = tag.name    # get
# print(name)
# tag.name = 'span'  # set
# print(soup)
2. attrs, tag attributes
# tag = soup.find('a')
# attrs = tag.attrs          # get
# print(attrs)
# tag.attrs = {'ik': 123}    # set
# tag.attrs['id'] = 'iiiii'  # set
# print(soup)
3. children, all direct child tags
# body = soup.find('body')
# v = body.children
4. descendants, all descendant tags
# body = soup.find('body')
# v = body.descendants
5. clear, empty out all of the tag's children (the tag itself is kept)
# tag = soup.find('body')
# tag.clear()
# print(soup)
6. decompose, recursively remove the tag and all of its contents
# body = soup.find('body')
# body.decompose()
# print(soup)
7. extract, recursively remove the tag and all of its contents, returning what was removed
# body = soup.find('body')
# v = body.extract()
# print(soup)
8. decode, serialize to a string (including the current tag); decode_contents (excluding the current tag)
# body = soup.find('body')
# v = body.decode()
# v = body.decode_contents()
# print(v)
9. encode, serialize to bytes (including the current tag); encode_contents (excluding the current tag)
# body = soup.find('body')
# v = body.encode()
# v = body.encode_contents()
# print(v)
10. find, get the first matching tag
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
11. find_all, get all matching tags
# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

# ####### lists #######
# v = soup.find_all(name=['a', 'div'])
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
# v = soup.find_all(id=['link1', 'link2'])
# print(v)
# v = soup.find_all(href=['link1', 'link2'])
# print(v)

# ####### regular expressions #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)

# ####### filter functions #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)

# ## get, read a tag attribute
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
12. has_attr, check whether the tag has a given attribute
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
13. get_text, get the text inside the tag
# tag = soup.find('a')
# v = tag.get_text()
# print(v)
14. index, get a child's position within its parent tag
# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)

# tag = soup.find('body')
# for i, v in enumerate(tag):
#     print(i, v)
15. is_empty_element, whether this is an empty (void/self-closing) tag,
i.e. one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)
16. related tags of the current tag
# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings
#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
#
# tag.parent
# tag.parents
17. searching a tag's related tags
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
# tag.find_parent(...)
# tag.find_parents(...)
# same parameters as find_all
18. select, select_one, CSS selectors
soup.select("title") soup.select("p nth-of-type(3)") soup.select("body a") soup.select("html head title") tag = soup.select("span,a") soup.select("head > title") soup.select("p > a") soup.select("p > a:nth-of-type(2)") soup.select("p > #link1") soup.select("body > a") soup.select("#link1 ~ .sister") soup.select("#link1 + .sister") soup.select(".sister") soup.select("[class~=sister]") soup.select("#link1") soup.select("a#link2") soup.select('a[href]') soup.select('a[href="http://example.com/elsie"]') soup.select('a[href^="http://example.com/"]') soup.select('a[href$="tillie"]') soup.select('a[href*=".com/el"]') from bs4.element import Tag def default_candidate_generator(tag): for child in tag.descendants: if not isinstance(child, Tag): continue if not child.has_attr('href'): continue yield child tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator) print(type(tags), tags) from bs4.element import Tag def default_candidate_generator(tag): for child in tag.descendants: if not isinstance(child, Tag): continue if not child.has_attr('href'): continue yield child tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1) print(type(tags), tags)
19. tag contents
# tag = soup.find('span')
# print(tag.string)           # get
# tag.string = 'new content'  # set
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings  # recursively yields the text of every inner tag
# print(v)
20. append, append a tag inside the current tag
# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
21. insert, insert a tag at a given position inside the current tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
22. insert_after, insert_before, insert after or before the current tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
23. replace_with, replace the current tag with the given tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一個新來的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
24. create relationships between tags
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
25. wrap, wrap the current tag in the given tag
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一個新來的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
26. unwrap, remove the current tag, keeping its contents
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
More parameters in the official documentation: http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
Auto-login examples:
抽屜新熱榜 (dig.chouti.com)
import requests

# ############## Method 1 ##############
"""
# ## 1. Visit any page first to obtain a cookie
i1 = requests.get(url="http://dig.chouti.com/help/service")
i1_cookies = i1.cookies.get_dict()

# ## 2. Log in carrying the previous cookie; the backend authorizes the gpsd value inside it
i2 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "86178#########",
        'password': "xxooxxoo",
        'oneMonth': ""
    },
    cookies=i1_cookies
)

# ## 3. Upvote (only the authorized gpsd cookie needs to be carried)
gpsd = i1_cookies['gpsd']
i3 = requests.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523",
    cookies={'gpsd': gpsd}
)
print(i3.text)
"""

# ############## Method 2 ##############
"""
import requests

session = requests.Session()
i1 = session.get(url="http://dig.chouti.com/help/service")
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    }
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523"
)
print(i3.text)
"""
GitHub
import requests
from bs4 import BeautifulSoup

# ############## Method 1 ##############
#
# # 1. Visit the login page and grab the authenticity_token
# i1 = requests.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Send the credentials together with the authenticity_token
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "XXXXX@163.com",
#     'password': 'xxoo'
# }
#
# i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "項目:%s(%s); 項目路徑:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)

# ############## Method 2 ##############
# session = requests.Session()
# # 1. Visit the login page and grab the authenticity_token
# i1 = session.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Send the credentials together with the authenticity_token
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "XXXXX@163.com",
#     'password': 'xxoo'
# }
#
# i2 = session.post('https://github.com/session', data=form_data)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = session.get('https://github.com/settings/repositories')
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "項目:%s(%s); 項目路徑:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)
知乎 (Zhihu)
import time
import requests
from bs4 import BeautifulSoup

session = requests.Session()

i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })
with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

captcha = input('請打開zhihu.gif文件,查看並輸入驗證碼:')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    "captcha": captcha,  # the captcha string entered above
    'email': '##########@163.com'
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)
拉勾網 (lagou.com):
import re
import requests

# Step 1: visit the login page to get X_Anti_Forge_Token and X_Anti_Forge_Code
# 1. request url: https://passport.lagou.com/login/login.html
# 2. request method: GET
# 3. request headers:
#    User-Agent
r1 = requests.get('https://passport.lagou.com/login/login.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
)

X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
print(X_Anti_Forge_Token, X_Anti_Forge_Code)
# print(r1.cookies.get_dict())

# Step 2: log in
# 1. request url: https://passport.lagou.com/login/login.json
# 2. request method: POST
# 3. request headers:
#    cookie
#    User-Agent
#    Referer: https://passport.lagou.com/login/login.html
#    X-Anit-Forge-Code: 53165984
#    X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
#    X-Requested-With: XMLHttpRequest
# 4. request body:
#    isValidate: true
#    username: 17821281271
#    password: ab18d270d7126ea65915c50288c22c0d
#    request_form_verifyCode: ''
#    submit: ''
r2 = requests.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest'
    },
    data={
        "isValidate": True,
        'username': '17821281271',
        'password': 'ab18d270d7126ea65915c50288c22c0d',
        'request_form_verifyCode': '',
        'submit': ''
    },
    cookies=r1.cookies.get_dict()
)
print(r2.text)
博客園 (cnblogs)
import re
import json
import base64
import rsa
import requests


def js_encrypt(text):
    b64der = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    der = base64.standard_b64decode(b64der)
    pk = rsa.PublicKey.load_pkcs1_openssl_der(der)
    v1 = rsa.encrypt(bytes(text, 'utf8'), pk)
    value = base64.encodebytes(v1).replace(b'\n', b'')
    value = value.decode('utf8')
    return value


session = requests.Session()

i1 = session.get('https://passport.cnblogs.com/user/signin')
rep = re.compile("'VerificationToken': '(.*)'")
v = re.search(rep, i1.text)
verification_token = v.group(1)

form_data = {
    'input1': js_encrypt('wptawy'),
    'input2': js_encrypt('asdfasdf'),
    'remember': False
}
i2 = session.post(url='https://passport.cnblogs.com/user/signin',
    data=json.dumps(form_data),
    headers={
        'Content-Type': 'application/json; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'VerificationToken': verification_token}
)

i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')
print(i3.text)