先查看首頁拿到cookie,而後登錄要攜帶首頁拿到的 cookie 才能夠經過驗證
"""""" # ################################### 示例一:爬取數據(攜帶請起頭) ################################### """ import requests from bs4 import BeautifulSoup r1 = requests.get( url='https://dig.chouti.com/', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ) soup = BeautifulSoup(r1.text,'html.parser') content_list = soup.find(name='div',attrs={"id":"content-list"}) item_list = content_list.find_all(name='div',attrs={'class':'item'}) for item in item_list: a = item.find(name='a',attrs={'class':'show-content color-chag'}) print(a.text.strip()) """ # ################################### 示例二:登錄點贊 ################################### """ import requests # 1. 查看首頁 r1 = requests.get( url='https://dig.chouti.com/', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ) # 2. 提交用戶名和密碼 r2 = requests.post( url='https://dig.chouti.com/login', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' }, data={ 'phone':'8613121758648', 'password':'woshiniba', 'oneMonth':1 }, cookies=r1.cookies.get_dict() # 套路 正經常使用戶必然會先訪問首頁而後再登錄 # 若是你直接登錄必然是爬蟲,所以設計在第一次訪問首頁的時候先建立cookie 而且返回了回去 # 而且要求你第二次訪問的時候要帶着這個 cookie ) # 3. 
點贊 r3 = requests.post( url='https://dig.chouti.com/link/vote?linksId=20435396', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' }, cookies=r1.cookies.get_dict() ) print(r3.text) """ # ############## 方式二 session 方式 ############## """ # 用 session 自動封裝好 cookie 不用在之後本身攜帶 import requests session = requests.Session() i1 = session.get(url="http://dig.chouti.com/help/service") i2 = session.post( url="http://dig.chouti.com/login", data={ 'phone': "8615131255089", 'password': "xxooxxoo", 'oneMonth': "" } ) i3 = session.post( url="http://dig.chouti.com/link/vote?linksId=8589523" ) print(i3.text) """
請求頭中存在自定義的驗證字段,要想辦法拿到才能夠正確爬取,以及 Referer 的使用
import re

# The login page embeds two anti-forgery values in an inline <script>.
# They are not in the response headers and not in any HTML tag attribute,
# so a regex over the raw response body is the way to pull them out.
_TOKEN_RE = re.compile(r"X_Anti_Forge_Token = '(.*?)'", re.S)
_CODE_RE = re.compile(r"X_Anti_Forge_Code = '(.*?)'", re.S)

_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
)


def extract_anti_forge(html):
    """Return ``(token, code)`` scraped from the login-page HTML.

    Raises IndexError if either value is not present in *html*.
    """
    token = _TOKEN_RE.findall(html)[0]
    code = _CODE_RE.findall(html)[0]
    return token, code


def main():
    """Log in to passport.lagou.com, sending the scraped anti-forge headers."""
    # Lazy import so this module can be imported without the dependency.
    import requests

    r1 = requests.get(
        url='https://passport.lagou.com/login/login.html',
        headers={'User-Agent': _USER_AGENT},
    )
    token, code = extract_anti_forge(r1.text)

    r2 = requests.post(
        url='https://passport.lagou.com/login/login.json',
        headers={
            'User-Agent': _USER_AGENT,
            # Header names spelled "Anit" on purpose — that is what the
            # site actually checks for.
            'X-Anit-Forge-Code': code,
            'X-Anit-Forge-Token': token,
            # Many sites require Referer (the previous request's URL)
            # before they will accept the request.
            'Referer': 'https://passport.lagou.com/login/login.html',
        },
        data={
            "isValidate": True,
            'username': '15131255089',
            # The site encrypts the password in JS before POSTing; instead
            # of porting the algorithm to Python, the captured ciphertext
            # is sent as-is.
            'password': 'ab18d270d7126ea65915c50288c22c0d',
            'request_form_verifyCode': '',
            'submit': ''
        },
        cookies=r1.cookies.get_dict()
    )
    print(r2.text)


if __name__ == '__main__':
    main()
csrf_token 的驗證
"""""" # ################################### 示例三:自動登陸GitHub ################################### # 1. GET,訪問登陸頁面 """ - 去HTML中找隱藏的Input標籤獲取csrf token - 獲取cookie """ # 2. POST,用戶名和密碼 """ - 發送數據: - csrf - 用戶名 - 密碼 - 攜帶cookie """ # 3. GET,訪問https://github.com/settings/emails """ - 攜帶 cookie """ import requests from bs4 import BeautifulSoup # ########################################################## # 訪問登錄頁面,獲取 authenticity_token i1 = requests.get( url='https://github.com/login' ) soup1 = BeautifulSoup(i1.text, features='lxml') tag = soup1.find(name='input', attrs={'name': 'authenticity_token'}) authenticity_token = tag.get('value') # authenticity_token 拿到 c1 = i1.cookies.get_dict() i1.close() # 攜帶authenticity_token和用戶名密碼等信息,發送用戶驗證 form_data = { "authenticity_token": authenticity_token, # 放在請求體中發過去 "utf8": "", "commit": "Sign in", "login": "", 'password': '' } i2 = requests.post( url='https://github.com/session', data=form_data, cookies=c1 ) c2 = i2.cookies.get_dict() c1.update(c2) # 將兩次的 cookie 整合一塊兒 i3 = requests.get('https://github.com/settings/repositories', cookies=c1) soup3 = BeautifulSoup(i3.text, features='lxml') list_group = soup3.find(name='div', class_='listgroup') from bs4.element import Tag for child in list_group.children: if isinstance(child, Tag): project_tag = child.find(name='a', class_='mr-1') size_tag = child.find(name='small') temp = "項目:%s(%s); 項目路徑:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, ) print(temp)
user-agent
referer
host
cookie
特殊請求頭,查看上一次請求獲取內容。
'X-Anit-Forge-Code':...
'X-Anit-Forge-Token':...
- 原始數據
- 原始數據 + token - 密文 - 找算法 - 使用密文
- post登陸獲取cookie,之後攜帶cookie
- get獲取未受權cookie,post登陸攜帶cookie去受權,之後攜帶cookie