day 112天,爬蟲(拉勾網,抖音,GitHub)次日

提早準備工作、安裝準備工作(day3用)

 1. 安裝scrapy 11
https://www.cnblogs.com/wupeiqi/articles/6229292.html

a. 下載twisted 
http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted

b. 安裝wheel 
pip3 install wheel

c. 安裝twisted 

pip3 install Twisted‑18.7.0‑cp36‑cp36m‑win_amd64.whl

d. 安裝pywin32
pip3 install pywin32


e. 安裝scrapy 
pip3 install scrapy 
 
 
https://www.cnblogs.com/wupeiqi/articles/6283017.html   武沛齊老師博客。

 

1、訪問登陸界面

# ============================ Step 1: visit the login page ============================
import requests

# The User-Agent is assembled with implicit string concatenation. A backslash
# continuation placed *inside* the literal would splice the next source line's
# leading indentation into the header value.
r1 = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/67.0.3396.87 Safari/537.36"
        ),
    },
)
# Cookies set by the login page, kept as a plain dict so later requests can send them back.
r1_cookie_dict = r1.cookies.get_dict()
# Dump the page source and the cookies for inspection.
print(r1.text)
print("r1-cookie:===>", r1_cookie_dict)

  打印結果:

 

 

2、登陸界面,登陸成功

# ============================ Step 2: log in to lagou ============================

import re

# Scrape the anti-forgery token and code out of the login page text fetched in
# step 1. The groups are non-greedy so each match stops at the first closing
# "';" on the line instead of running on to a later one.
# NOTE: [0] raises IndexError if the page no longer contains these assignments.
token = re.findall(r"X_Anti_Forge_Token = '(.*?)';", r1.text)[0]
code = re.findall(r"X_Anti_Forge_Code = '(.*?)';", r1.text)[0]
print(token)
print(code)

r2 = requests.post(
    url="https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Connection": "keep-alive",
        # No hard-coded Content-Length: requests derives it from the encoded
        # form body, and a fixed "111" would not match other credentials.
        # Origin must be a well-formed origin; the stray spaces are removed.
        "Origin": "https://passport.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html",
        # Header spelling ("Anit") is kept exactly as in the original request.
        # NOTE(review): presumably this is the name the site expects - confirm.
        "X-Anit-Forge-Code": code,
        "X-Anit-Forge-Token": token,
    },
    # Form-encoded login payload; fill in "username" before running.
    # NOTE(review): "password" looks like a pre-hashed value - confirm how the site derives it.
    data={
        "isValidate": "true",
        "username": "",
        "password": "4d541689997b5ff6ac90a350b5dd6693",
        "request_form_verifyCode": "",
        "submit": "",
    },
    # Send back the cookies received with the login page.
    cookies=r1_cookie_dict,
)


print(r2.text)

打印結果:

 

3、登陸邀請(invitation)界面

 

import requests

# First attempt at opening the invitation page, using only the cookies from
# step 1.
r3 = requests.get(
    url="https://www.lagou.com/mycenter/invitation.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Upgrade-Insecure-Requests": "1",
    },
    # These cookies are not the right ones; the printed page is not the expected one.
    # The comment sits on its own line so it cannot swallow the closing parenthesis.
    cookies=r1_cookie_dict,
)
print(r3.text)

打印結果:

登陸失敗。 因此 第三步的流程不對,應該嘗試其他的流程

重新登陸後會發現有個如下的請求

 重定向到新的網站

  

 

 又重定向新的網址

又重定向新的網址

重定向後:

 

 

 

 第三步; grant 登陸

 

 

import requests

# Step 3 (grant): request the service-ticket grant page with redirects turned
# off, so the redirect response itself (headers and cookies) can be inspected
# instead of being followed automatically.
r3 = requests.get(
    url="https://passport.lagou.com/grantServiceTicket/grant.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "passport.lagou.com",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532004536388&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=F241DF2A40C183BA91C33BA6604912F0",
    },
    cookies=r1_cookie_dict,
    # Turn automatic redirect handling off. The comment sits on its own line so
    # it cannot swallow the closing parenthesis.
    allow_redirects=False,
)
r3_cookie_dict = r3.cookies.get_dict()
print(r3.text)
print(r3_cookie_dict)

 

  打印結果:

 

 

 

第四步, 發送action 請求

# ============================ Step 4: send the "action" request ============================
import requests

# One dict accumulates the cookies from every response so far. It is named
# all_cookie_dict because that is the name the later steps read and update.
all_cookie_dict = {}
all_cookie_dict.update(r1_cookie_dict)
# NOTE(review): the login response presumably sets session cookies too - confirm.
all_cookie_dict.update(r2.cookies.get_dict())
all_cookie_dict.update(r3_cookie_dict)

r4 = requests.get(
    # Follow the redirect from the grant request by hand. The ticket embedded
    # in that URL is issued per login, so a value pasted from an earlier
    # browser session cannot be reused.
    url=r3.headers["Location"],
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532005741245&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=ED6DE46236FC2638697A5ECC080822F7",
    },
    cookies=all_cookie_dict,
    # Keep redirects off so the next hop's Location header stays readable.
    allow_redirects=False,
)
r4_cookie_dict = r4.cookies.get_dict()
# Merge the new cookies, as the later steps do after each hop.
all_cookie_dict.update(r4_cookie_dict)
print("r4===>", r4.text)

  

 

第五次請求 

 

# ------------------------------ Step 5: fetch the auth info ------------------------------
# Request the URL named in step 4's Location header, again without letting
# requests follow any further redirect on its own.
r5 = requests.get(
    r4.headers["Location"],
    allow_redirects=False,
    cookies=all_cookie_dict,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Referer": "https://passport.lagou.com/login/login.html",
        "Host": "www.lagou.com",
        "Upgrade-Insecure-Requests": "1",
    },
)

# Fold this hop's cookies into the accumulated dict before moving on.
r5_cookie_dict = r5.cookies.get_dict()
all_cookie_dict.update(r5_cookie_dict)

# Show where the server wants to redirect next.
print(r5.headers["Location"])

  

第六次請求

# ------------------------------ Step 6: "my invitations" page ------------------------------
# Open the invitation page with every cookie collected so far. Redirects are
# left at the library default for this request.
r = requests.get(
    "https://www.lagou.com/mycenter/invitation.html",
    cookies=all_cookie_dict,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Host": "www.lagou.com",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
    },
)
# Prints True when the account name appears in the returned page.
print("wupeiqi" in r.text)

最後登陸成功:

 

第七步:

# ##################################### Step 7 #####################################
# Request the URL from the previous response's Location header, with automatic
# redirects disabled, then merge the returned cookies into all_cookie_dict.
# NOTE(review): `r6` is not defined anywhere in the visible file (step 6 above
# stores its response in `r`) - the request that produces `r6` appears to be
# missing; confirm against the full script before running.
r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer':'https://passport.lagou.com/login/login.html',
        'Host':'www.lagou.com',
        'Upgrade-Insecure-Requests':'1',
    },
    cookies=all_cookie_dict,
    allow_redirects=False

)
# Keep the accumulated cookie dict current for the following steps.
r7_cookie_dict = r7.cookies.get_dict()
all_cookie_dict.update(r7_cookie_dict)

  

 第九步(此處缺少第八步的代碼):

# ##################################### Step 9: view my profile info #####################################
# Sends a PUT with a JSON body to the account endpoint, using the accumulated
# cookies, and prints the response text.
# NOTE(review): `r8_response_json` is not defined anywhere in the visible file;
# it presumably comes from a step-8 request that returns 'submitCode' and
# 'submitToken' - confirm against the full script before running.

r9 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Host':'gate.lagou.com',
        'Origin':'https://account.lagou.com',
        'Referer':'https://account.lagou.com/v2/account/userinfo.html',
        'X-L-REQ-HEADER':'{deviceType:1}',
        # Anti-forgery code/token taken from the previous response's JSON.
        'X-Anit-Forge-Code':r8_response_json.get('submitCode'),
        'X-Anit-Forge-Token':r8_response_json.get('submitToken'),
        'Content-Type':'application/json;charset=UTF-8',
    },
    # json= makes requests serialize the dict as a JSON request body.
    json={"userName":"wupeiqi999","sex":"MALE","portrait":"images/myresume/default_headpic.png","positionName":"...","introduce":"...."},
    cookies=all_cookie_dict
)

print(r9.text)

  

 

爬蟲 抖音視頻

 

3. requests模塊
參數:
url
params
headers
cookies
data
示例:
requests.post(
data={
'user':'alex',
'pwd':'sb'
}
)

user=alex&pwd=sb

chrome: formdata
json
示例:
requests.post(
json={
'user':'alex',
'pwd':'sb'
}
)

'{"user":"alex","pwd":"sb"}'

chrome: request payload

 

 

s10day112 

內容回顧:
	第一部分:爬蟲相關
		1. 談談你對http協議的理解?
			規範:
				1. Http請求收發數據的格式
					GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n
					POST /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\nuser=xxx
					
				2. 短連接(無狀態)
					一次請求一次響應以後,就斷開連接
					
				3. 基於TCP協議之上
					sk = socket()
					sk.send(b'GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n')
			常見請求頭有哪些?
				host 
				content-type 
				user-agent
				cookies
				referer,上一次請求地址
			常見的請求方法有哪些?
				GET
				POST 
				DELETE
				PUT 
				PATCH
				OPTIONS
		2. requests 
			用於僞造瀏覽器發送請求
			參數:
				- url 
				- headers
				- data 
				- cookies 
			響應:
				- content 
				- text 
				- encoding='gbk'
				- headers 
				- cookies.get_dict()
				
		3. bs  
			用於解析HTML格式的字符串 
			方法和屬性:
				- find 
				- find_all 
				- attrs
				- get 
				- text 
				
		4. 套路 
			- 汽車之家
			- 抽屜新聞:攜帶user-agent 
			- 登陸抽屜:第一次訪問保留cookie,登陸時需要再次攜帶;
			- 自動登陸github:獲取csrf_token,到底攜帶哪一個cookie

			
			補充:自動登陸github
			
	第二部分:路飛相關
		1. 公司的組織架構?
			開發:
				- 村長
				- 前端姑娘
				- 濤
				- 雲(產品+開發)
			UI:1人
			測試:1人
			運維:1人
			運營:2人
			銷售:3人
			班主任:1人
			全職助教:2人
			人事/財務:老男孩共享
			
		2. 項目架構
			- 管理後臺(1)
				- 權限
				- xadmin
			- 導師後臺(1)
				- 權限
				- xadmin 
			- 主站(1+1+0.5+1)
				- restful api 
				- vue.js 
			
			如今開發:題庫系統

		3. 涉及技術點:
			- django 
			- django rest framework 
			- vue.js 
			- 跨域cors
			- redis 
			- 支付寶支付
			- 視頻播放
				- CC視頻
				- 保利 
			- 微信消息推送
				- 已認證的服務號
				- 發送模板消息 
			- content-type 
		
		
今日內容:
	- 拉勾網
	- 抖音 
	- requests
	- bs4 
	- 初識scrapy框架
	
	
內容詳細:
	1.拉勾網
		- Token和Code存在頁面上,自定義請求頭上
		- 重定向:
			- 響應頭的Location中獲取要重定向的地址
			- 自己去處理
		- 請求發送時須要攜帶上次請求的code和token 
		
		原則:
			- 完全模擬瀏覽器的行爲
	
	2. 爬抖音視頻 
	
	3. requests模塊 
		參數:	
			url 
			params 
			headers 
			cookies 
			data 
				示例:
					requests.post(
						data={
							'user':'alex',
							'pwd':'sb'
						}
					)
					
					user=alex&pwd=sb 
				
				chrome: formdata
			json 
				示例:
					requests.post(
						json={
							'user':'alex',
							'pwd':'sb'
						}
					)
					
					'{"user":"alex","pwd":"sb"}'
	
				chrome: request payload 
			allow_redirects
			stream
	
			files 
				requests.post(
					url='xxx',
					files={
						'f1': open('readme', 'rb')
					}
				)
			
			auth
				from requests.auth import HTTPBasicAuth, HTTPDigestAuth

				ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('admin', 'admin'))
				print(ret.text)
	
			timeout 
				ret = requests.get('http://google.com/', timeout=1)
				
				ret = requests.get('http://google.com/', timeout=(5, 1))
			proxies
				proxies = {
					"http": "61.172.249.96:80",
					"https": "http://61.185.219.126:3128",
				}
				# proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}
				
				ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxies)
				print(ret.headers)
				
				
				from requests.auth import HTTPProxyAuth
				auth = HTTPProxyAuth('username', 'mypassword')
				
				r = requests.get("http://www.google.com", proxies=proxies, auth=auth)
				
			證書相關:
				cert
				verify
				
		session:自動管理cookie和headers(不建議使用)
			import requests

			session = requests.Session()
			i1 = session.get(url="http://dig.chouti.com/help/service")
			i2 = session.post(
				url="http://dig.chouti.com/login",
				data={
					'phone': "8615131255089",
					'password': "xxooxxoo",
					'oneMonth': ""
				}
			)
			i3 = session.post(
				url="http://dig.chouti.com/link/vote?linksId=8589523"
			)
			print(i3.text)
				
	4. bs4 

		參考示例:https://www.cnblogs.com/wupeiqi/articles/6283017.html
	
	
預習:
	1. 安裝scrapy 
		https://www.cnblogs.com/wupeiqi/articles/6229292.html
		
		a. 下載twisted 
			http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
		
		b. 安裝wheel 
			pip3 install wheel
			
		c. 安裝twisted 
			
			pip3 install Twisted‑18.7.0‑cp36‑cp36m‑win_amd64.whl
			
		d. 安裝pywin32
			pip3 install pywin32
			
			
		e. 安裝scrapy 
			pip3 install scrapy 

  

 

 

requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs) 
相關文章
相關標籤/搜索