# ============================ Step 1: visit the login page ============================
import requests

# Fetch the login page. Later steps read r1.text (anti-forgery token/code)
# and r1_cookie_dict (cookies sent along with the following requests).
r1 = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={
        # The original had a stray backslash (a lost line continuation) inside
        # this string; it now matches the User-Agent used by the login POST.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    },
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
print("r1-cookie:===>", r1_cookie_dict)
打印結果:html
# ======================= Step 2: log in to Lagou =======================
import re

import requests

# The login page (r1, fetched in step one) embeds two anti-forgery values in
# inline JavaScript. They are sent back below as custom request headers.
# `[^']*` stops at the closing quote instead of greedily running to the last
# `';` on the line.
_token_match = re.search(r"X_Anti_Forge_Token = '([^']*)';", r1.text)
_code_match = re.search(r"X_Anti_Forge_Code = '([^']*)';", r1.text)
if _token_match is None or _code_match is None:
    # Fail with a clear message rather than an IndexError on an empty list.
    raise ValueError(
        "X_Anti_Forge_Token / X_Anti_Forge_Code not found in the login page"
    )
token = _token_match.group(1)
code = _code_match.group(1)
print(token)
print(code)

r2 = requests.post(
    url="https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Connection": "keep-alive",
        # No hard-coded Content-Length: requests derives it from the encoded
        # form body, so a fixed "111" was at best redundant.
        # The original value was garbled as "https: // passport.lagou.com".
        "Origin": "https://passport.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html",
        # NOTE(review): the "Anit" spelling is kept exactly as the original
        # wrote it (the later PUT request uses the same spelling) —
        # presumably what the server expects; confirm.
        "X-Anit-Forge-Code": code,
        "X-Anit-Forge-Token": token,
    },
    data={
        "isValidate": "true",
        "username": "",
        "password": "4d541689997b5ff6ac90a350b5dd6693",
        "request_form_verifyCode": "",
        "submit": "",
    },
    cookies=r1_cookie_dict,
)
print(r2.text)
打印結果前端
import requests

# Request the invitation page using only the cookies from the first request.
r3 = requests.get(
    url="https://www.lagou.com/mycenter/invitation.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Upgrade-Insecure-Requests": "1",
    },
    # Original author's note: these cookies are not correct, so the page
    # printed below is not the expected one.
    cookies=r1_cookie_dict,
)
print(r3.text)
import requests

# Ask the passport host for a service ticket. Redirects are not followed, so
# the response itself (headers and cookies) can be inspected.
r3 = requests.get(
    url="https://passport.lagou.com/grantServiceTicket/grant.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "passport.lagou.com",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532004536388&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=F241DF2A40C183BA91C33BA6604912F0",
    },
    cookies=r1_cookie_dict,
    allow_redirects=False,  # turn automatic redirects off
)
r3_cookie_dict = r3.cookies.get_dict()
print(r3.text)
print(r3.cookies.get_dict())
打印結果:
# ======================= Step 4: send the "action" request =======================
import requests

# Accumulate the cookies gathered so far. The original called
# `all_cookies_dict.update()` with no argument on a name that was never
# defined; the later steps use `all_cookie_dict`, so that name is used here.
all_cookie_dict = {}
all_cookie_dict.update(r1_cookie_dict)
all_cookie_dict.update(r2.cookies.get_dict())
all_cookie_dict.update(r3_cookie_dict)

r4 = requests.get(
    # Follow the redirect by hand: the target comes from the previous
    # response's Location header. The original hard-coded one captured URL,
    # "https://www.lagou.com/?action=grantST&ticket=ST-f6c670b8a6104480a96cd835d80a8db8".
    # NOTE(review): assumes r3 is the grant.html response and that its ticket
    # is single-use — confirm.
    url=r3.headers["Location"],
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532005741245&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=ED6DE46236FC2638697A5ECC080822F7",
    },
    cookies=all_cookie_dict,
    allow_redirects=False,
)
r4_cookie_dict = r4.cookies.get_dict()
all_cookie_dict.update(r4_cookie_dict)
print("r4===>", r4.text)
# ##################################### Step 5: obtain the authentication info #####################################
# Follow the redirect returned by r4 by hand, without auto-redirecting, and
# merge any cookies this response sets into all_cookie_dict.
r5 = requests.get(
    url=r4.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
    },
    cookies=all_cookie_dict,
    allow_redirects=False,
)
r5_cookie_dict = r5.cookies.get_dict()
all_cookie_dict.update(r5_cookie_dict)
print(r5.headers['Location'])
第六次請求
# ##################################### Step 6: personal invitations #####################################
# Fetch the invitation page with the accumulated cookies and print whether
# the text 'wupeiqi' appears in the returned page.
r = requests.get(
    url='https://www.lagou.com/mycenter/invitation.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
        'Pragma': 'no-cache',
    },
    cookies=all_cookie_dict,
)
print('wupeiqi' in r.text)
# ##################################### Step 7 #####################################
# NOTE(review): `r6` is not defined anywhere in the visible source —
# presumably the response of a redirect step that was omitted; confirm before
# running.
r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
    },
    cookies=all_cookie_dict,
    allow_redirects=False,
)
# Merge any cookies set by this response into the accumulated cookie dict.
r7_cookie_dict = r7.cookies.get_dict()
all_cookie_dict.update(r7_cookie_dict)
# ##################################### Step 9: view my info #####################################
# Sends a PUT with a JSON body to the account endpoint and prints the response.
# NOTE(review): `r8_response_json` is not defined in the visible source —
# presumably the parsed JSON of an omitted step-eight response that carries
# 'submitCode' and 'submitToken'; confirm before running.
r9 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Host': 'gate.lagou.com',
        'Origin': 'https://account.lagou.com',
        'Referer': 'https://account.lagou.com/v2/account/userinfo.html',
        'X-L-REQ-HEADER': '{deviceType:1}',
        'X-Anit-Forge-Code': r8_response_json.get('submitCode'),
        'X-Anit-Forge-Token': r8_response_json.get('submitToken'),
        'Content-Type': 'application/json;charset=UTF-8',
    },
    json={"userName":"wupeiqi999","sex":"MALE","portrait":"images/myresume/default_headpic.png","positionName":"...","introduce":"...."},
    cookies=all_cookie_dict,
)
print(r9.text)
3. requests模塊
參數:
url
params
headers
cookies
data
示例:
requests.post(
data={
'user':'alex',
'pwd':'sb'
}
)
user=alex&pwd=sb
chrome: formdata
json
示例:
requests.post(
json={
'user':'alex',
'pwd':'sb'
}
)
'{"user":"alex","pwd":"sb"}'
chrome: request payload
s10day112
內容回顧:
第一部分:爬蟲相關
1. 談談你對http協議的理解?
規範:
1. Http請求收發數據的格式
GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n
POST /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\nuser=xxx
2. 短連接(無狀態)
一次請求一次響應以後,就斷開連接
3. 基於TCP協議之上
sk = socket()
sk.send(b'GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n')
常見請求頭有哪些?
host
content-type
user-agent
cookie
referer,上一次請求地址
常見的請求方法有哪些?
GET
POST
DELETE
PUT
PATCH
OPTIONS
2. requests
用於僞造瀏覽器發送請求
參數:
- url
- headers
- data
- cookies
響應:
- content
- text
- encoding='gbk'
- headers
- cookies.get_dict()
3. bs
用於解析HTML格式的字符串
方法和屬性:
- find
- find_all
- attrs
- get
- text
4. 套路
- 汽車之家
- 抽屜新聞:攜帶user-agent
- 登陸抽屜:第一次訪問保留cookie,登陸時須要再次攜帶;
- 自動登陸github:獲取csrf_token,到底攜帶哪一個cookie
補充:自動登陸github
第二部分:路飛相關
1. 公司的組織架構?
開發:
- 村長
- 前端姑娘
- 濤
- 雲(產品+開發)
UI:1人
測試:1人
運維:1人
運營:2人
銷售:3人
班主任:1人
全職助教:2人
人事/財務:老男孩共享
2. 項目架構
- 管理後臺(1)
- 權限
- xadmin
- 導師後臺(1)
- 權限
- xadmin
- 主站(1+1+0.5+1)
- restful api
- vue.js
如今開發:題庫系統
3. 涉及技術點:
- django
- django rest framework
- vue.js
- 跨域cors
- redis
- 支付寶支付
- 視頻播放
- CC視頻
- 保利
- 微信消息推送
- 已認證的服務號
- 發送模板消息
- content-type
今日內容:
- 拉勾網
- 抖音
- requests
- bs4
- 初識scrapy框架
內容詳細:
1.拉勾網
- Token和Code存在頁面上,自定義請求頭上
- 重定向:
- 響應頭的Location中獲取要重定向的地址
- 自己去處理
- 請求發送時須要攜帶上次請求的code和token
原則:
- 徹底模擬瀏覽器的行爲
2. 爬抖音視頻
3. requests模塊
參數:
url
params
headers
cookies
data
示例:
requests.post(
data={
'user':'alex',
'pwd':'sb'
}
)
user=alex&pwd=sb
chrome: formdata
json
示例:
requests.post(
json={
'user':'alex',
'pwd':'sb'
}
)
'{"user":"alex","pwd":"sb"}'
chrome: request payload
allow_redirects
stream
files
# Upload a local file under the form field name 'f1'. The context manager
# closes the file handle once the request has been sent; the original left
# it open.
with open('readme', 'rb') as f1_file:
    requests.post(
        url='xxx',
        files={
            'f1': f1_file,
        },
    )
auth
from requests.auth import HTTPBasicAuth, HTTPDigestAuth

# Build the auth object first, then attach it to the request via `auth=`.
basic_auth = HTTPBasicAuth('admin', 'admin')
ret = requests.get('https://api.github.com/user', auth=basic_auth)
print(ret.text)
timeout
# The same URL is requested twice to show the two accepted forms of `timeout`.
google_url = 'http://google.com/'
ret = requests.get(google_url, timeout=1)       # a single number
ret = requests.get(google_url, timeout=(5, 1))  # a (connect, read) tuple, per the requests docs
proxies
# Map each URL scheme to the proxy that should carry it. Proxy URLs need an
# explicit scheme, so "http://" was added to the first entry.
proxies = {
    "http": "http://61.172.249.96:80",
    "https": "http://61.185.219.126:3128",
}
# Alternative form kept from the original notes (key is scheme://host):
# proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}
ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxies)
print(ret.headers)

from requests.auth import HTTPProxyAuth

# Credentials for the proxy itself. The original passed an undefined name
# `proxyDict`; the `proxies` mapping defined above is what is in scope.
auth = HTTPProxyAuth('username', 'mypassword')
r = requests.get("http://www.google.com", proxies=proxies, auth=auth)
證書相關:
cert
verify
session:自動管理cookie和headers(不建議使用)
import requests

# One Session carries cookies from each response into the following requests,
# so nothing is passed by hand. The context manager closes the session (and
# its pooled connections) at the end; the original never closed it.
with requests.Session() as session:
    # First visit: lets the site set its initial cookies on the session.
    i1 = session.get(url="http://dig.chouti.com/help/service")
    # Log in; the session sends the cookies from i1 automatically.
    i2 = session.post(
        url="http://dig.chouti.com/login",
        data={
            'phone': "8615131255089",
            'password': "xxooxxoo",
            'oneMonth': "",
        },
    )
    # Request sent with the session's accumulated cookies.
    i3 = session.post(
        url="http://dig.chouti.com/link/vote?linksId=8589523",
    )
    print(i3.text)
4. bs4
參考示例:https://www.cnblogs.com/wupeiqi/articles/6283017.html
預習:
1. 安裝scrapy
https://www.cnblogs.com/wupeiqi/articles/6229292.html
a. 下載twisted
http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
b. 安裝wheel
pip3 install wheel
c. 安裝twisted
pip3 install Twisted-18.7.0-cp36-cp36m-win_amd64.whl
d. 安裝pywin32
pip3 install pywin32
e. 安裝scrapy
pip3 install scrapy
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)