爬蟲基礎01

時間 2019-11-11

原文原文鏈接

寫在前面

　　　　逆水行舟html

  1 爬蟲
  2     
  3     - 基本操做
  4         概要：
  5             - 發送Http請求，Python Http請求，requests
  6             - 提取指定信息，Python 正則表達式，beautifulsoup
  7             - 數據持久化，
  8         
  9         Python兩個模塊
 10             - requests
 11             - beautifulsoup
 12         
 13         Http請求相關知識
 14             - 請求：
 15                 請求頭
 16                     - cookie
 17                 請求體
 18                     - 發送內容
 19                     
 20             - 響應：
 21                 響應頭
 22                     - 瀏覽器讀取
 23                 響應體
 24                     - 看到的內容
 25             
 26             特殊：
 27                 - cookie
 28                 - csrftoken
 29                 - content-type:
 30                 
 31                     content-type:application/x-www-form-urlencoded
 32                     name=alex&age=18
 33                     
 34                     content-type:application/json
 35                     {name:'alex',age:18}
 36     - 性能相關
 37         - 串行： 1個人，一個任務一個任務，空餘時間，玩。
 38         - 線程： 10個人，一個任務一個任務，空餘時間，玩。
 39         - 進程： 10個家庭，一個任務一個任務，空餘時間，玩。
 40         - 【協程】異步非阻塞：1個人，充分利用時間。
 41     
 42     - scrapy框架
 43         - 規則
 44         
 45     - redis-scrapy組件
 46     
 47     
 48     
 49 內容詳細：
 50     - 基本操作，python僞造瀏覽器發送請求並獲取指定內容
 51     
 52         pip3 install requests
 53         response = requests.get('http://www.baidu.com')
 54         response.text
 55         
 56         
 57         pip3 install beautifulsoup4
 58         from bs4 import BeautifulSoup
 59         
 60         soup = BeautifulSoup(response.text,'html.parser')
 61         soup.find(name='h3',attrs={'class':'t'})
 62         soup.find_all(name='h3')
 63         
 64         示例：爬取汽車之家新聞
 65         
 66         
 67     - 模塊
 68     
 69         requests
 70             GET:
 71                 requests.get(url="http://www.oldboyedu.com")
 72                 # data="http GET / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 73                 
 74                 requests.get(url="http://www.oldboyedu.com/index.html?p=1")
 75                 # data="http GET /index.html?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 76                 
 77                 requests.get(url="http://www.oldboyedu.com/index.html",params={'p':1})
 78                 # data="http GET /index.html?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 79             
 80             POST:
 81                 requests.post(url="http://www.oldboyedu.com",data={'name':'alex','age':18}) # 默認請求頭：application/x-www-form-urlencoded
 82                 data="http POST / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\nname=alex&age=18"
 83                 
 84                 
 85                 requests.post(url="http://www.oldboyedu.com",json={'name':'alex','age':18}) # 默認請求頭：application/json
 86                 data="http POST / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n{"name": "alex", "age": 18}"
 87 
 88                 
 89                 requests.post(
 90                     url="http://www.oldboyedu.com",
 91                     params={'p':1},
 92                     json={'name':'alex','age':18}
 93                 ) # 默認請求頭：application/json
 94                 
 95                 data="http POST /?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n{"name": "alex", "age": 18}"
 96                 
 97                 
 98                 補充：
 99                     request.body,永遠有值
100                     request.POST，可能沒有值
101                     
102                 
103         beautifulsoup
104             soup = BeautifulSoup('HTML格式字符串','html.parser')
105             
106             tag = soup.find(name='div',attrs={})
107             tags = soup.find_all(name='div',attrs={})
108             
109             
110             tag.find('h3').text
111             tag.find('h3').get('屬性名稱')
112             tag.find('h3').attrs
113     
114     
115         HTTP請求：
116             GET請求：
117                 data="http GET /index?page=1 http1.1\r\nhost:baidu.com\r\n....\r\n\r\n"
118                 
119                 
120             POST請求：
121                 data="http POST /index?page=1 http1.1\r\nhost:baidu.com\r\n....\r\n\r\nname=alex&age=18"
122                 
123                 
124             socket.sendall(data)
125     
126     
127         示例【github和抽屜】：任何一個不用驗證碼的網站，經過代碼自動登陸
128             
129             1. 按理說
130                 r1 = requests.get(url='https://github.com/login')
131                 s1 = BeautifulSoup(r1.text,'html.parser')
132                 val = s1.find(attrs={'name':'authenticity_token'}).get('value')
133                 
134                 r2 = requests.post(
135                         url= 'https://github.com/session',
136                         data={
137                             'commit': 'Sign in',
138                             'utf8': '✓',
139                             'authenticity_token': val,
140                             'login':'xxxxx',
141                             'password': 'xxxx',
142                             
143                         }
144                     )
145                     
146                 r2_cookie_dict = r2.cookies.get_dict() # {'session_id':'asdfasdfksdfoiuljksdf'}
147         
148                 保存登陸狀態，查看任意URL
149                 
150                 r3 = requests.get(
151                     url='xxxxxxxx',
152                     cookies=r2_cookie_dict
153                 )
154         
155                 print(r3.text) # 登陸成功以後，能夠查看的頁面
156                 
157             2. 不按理說
158                 r1 = requests.get(url='https://github.com/login')
159                 s1 = BeautifulSoup(r1.text,'html.parser')
160                 val = s1.find(attrs={'name':'authenticity_token'}).get('value')
161                 # cookie返回給你
162                 r1_cookie_dict = r1.cookies.get_dict()
163                 
164                 
165                 r2 = requests.post(
166                         url= 'https://github.com/session',
167                         data={
168                             'commit': 'Sign in',
169                             'utf8': '✓',
170                             'authenticity_token': val,
171                             'login':'xxxxx',
172                             'password': 'xxxx',
173                             
174                         },
175                         cookies=r1_cookie_dict
176                     )
177                 # 受權
178                 r2_cookie_dict = r2.cookies.get_dict() # {}
179         
180         
181         
182                 保存登陸狀態，查看任意URL
183                 
184                 r3 = requests.get(
185                     url='xxxxxxxx',
186                     cookies=r1_cookie_dict
187                 )
188         
189                 print(r3.text) # 登陸成功以後，能夠查看的頁面
190             
191             
192     - requests
193         """
194         1. method
195         2. url
196         3. params
197         4. data
198         5. json
199         6. headers
200         7. cookies
201         8. files
202         9. auth
203         10. timeout
204         11. allow_redirects
205         12. proxies
206         13. stream
207         14. cert
208         ================ session,保存請求相關信息（不推薦）===================
209         import requests
210 
211         session = requests.Session()
212 
213         i1 = session.get(url="http://dig.chouti.com/help/service")
214         i2 = session.post(
215             url="http://dig.chouti.com/login",
216             data={
217                 'phone': "8615131255089",
218                 'password': "xxooxxoo",
219                 'oneMonth': ""
220             }
221         )
222         i3 = session.post(
223             url="http://dig.chouti.com/link/vote?linksId=8589523"
224         )
225         print(i3.text)
226 
227         """
228     - beautifulsoup
229         - find()
230         - find_all()
231         - get()
232         - attrs
233         - text
234         
235 內容：
236     1. 示例：汽車之家
237     2. 示例：github和chouti
238     3. requests和beautifulsoup
239     4. 輪詢和長輪詢
240     5. Django
241         request.POST
242         request.body
243         
244         # content-type:xxxx
245         
246 做業：web微信
247       功能：
248         1. 二維碼顯示
249         2. 長輪詢：check_login
250         3. 
251             - 檢測是否已經掃碼
252             - 掃碼以後201，頭像： base64:.....
253             - 點擊確認200，response.text     redirect_uri=....
254         4. 可選，獲取最近聯繫人信息
255         
256 安裝：
257     twisted
258     scrapy框架
259     
260     
261

武Sir - 筆記

參考：http://www.cnblogs.com/wupeiqi/articles/6283017.html

爬蟲相關
	- 基本操做
		- 概要
			- 發送http請求	requests模塊
			- 提取指定信息 	正則	Beautifulsoup模塊
			- 數據持久化

		- Python的2個模塊
			- requests
			- Beautifulsoup

		- Http請求相關知識
			- 請求
				- 請求頭 
					- cookie
				- 請求體 
					- 發送的內容
			- 響應 
				- 響應頭 
					- 瀏覽器讀取
				- 響應體
					- 看到的內容

			- 特殊
				- cookie
				- csrf_token
				- content-type 用來指定客戶端按照哪一種格式進行解析


	- 性能相關
		- 進程
		- 線程
		- 協程

		- 【協程】異步非阻塞：充分利用系統資源


	- scrapy框架
		- 學習scrapy的規則


	- redis&scrapy組件：完成一個簡單的分佈式爬蟲



內容詳細
	- 基本操做	Python僞造瀏覽器發送請求

		pip3 install requests
		pip3 install Beautifulsoup4

		import requests
		from bs4 import BeautifulSoup


		response = requests.get("http://www.baidu.com")
		response.text  ->  網頁內容

		soup = BeautifulSoup(response.text,'html.parser')

		# 從上到下第一個 <h3 class='t'> 標籤
		soup.find(name='h3',attrs={'class':'t'})
		# 查找所有 <h3>標籤
		soup.find_all(name='h3')

		...

	模塊
		requests
			response = requests.get(url='url路徑')
			# 解決亂碼問題
			response.encoding = response.apparent_encoding

			GET請求：
				requests.get(url='www.baidu.com')
				data = "http GET / ...."
				requests.get(url='www.baidu.com?page=1')
				data = "http GET /?page=1 ...."
				requests.get(url='www.baidu.com',params={'page':1})


			POST請求：
				requests.post(url='www.baidu.com',data={'name':'alex','age':18}) # 默認攜帶請求頭類型：application/x-www-form-urlencoded

				requests.post(url='www.baidu.com',json={'name':'alex','age':18}) # 默認攜帶請求頭類型：application/json

				# POST請求既能夠在請求體裏傳參，又能夠在url裏傳參
				requests.post(url='www.baidu.com',params={'page':1},json={'name':'alex','age':18})



				補充：
					django裏的 request.POST 裏的值是django根據請求體裏的數據轉換過來的
						因此，若是body裏的數據格式不對，那麼就轉換不了，致使request.POST裏面沒有值
					django裏的 request.body 裏永遠有值
					django裏的 request.POST 可能沒有值



		BeautifulSoup
			soup = BeautifulSoup('html格式字符串','html.parser')
			tag = soup.find(name='div',attrs={...})
			tag = soup.find_all(name='div',attrs={...})

			tag.find('h3').text
			tag.find('h3').contents
			tag.find('h3').get('屬性名稱')
			tag.find('h3').attrs['屬性名稱']







服務器端不能主動給客戶端發消息
但是 websocket 可以

- 【輪詢】     	http協議，客戶端輪詢（每秒1次）請求服務端；一次請求，服務端收到後不管有沒有新消息都立即返回
- 【長輪詢】 	http協議，客戶端發來請求，服務器把客戶端給hang住，直到服務端收到新消息並發送給所有客戶端、才斷開連接；
				客戶端收到消息後，再立即發請求到服務端進行下一次hang住。
				hang住，有一個超時時間，web微信超時時間是25s
				應用：web微信
- 【WebSocket】	不是http協議，建立在tcp之上
				一次連接不斷開，雙工通道，可以互相發送消息
				但是瀏覽器兼容性不太好，以後將會應用得更普遍




瀏覽器有同源策略
ajax發送跨域請求是接收不到結果的





http://www.cnblogs.com/wupeiqi/articles/6283017.html




#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests

# requests.request(method, url, **kwargs) is the generic entry point that the
# per-verb helpers (get/post/put/delete/...) wrap.  It requires at least a
# method and a URL, so it is shown with arguments below rather than called
# bare, which would raise TypeError.

requests.get(url='xxx')
# ...is equivalent to:
requests.request(method='get',url='xxx')

import json
# dict payload: form-encoded body, Content-Type: application/x-www-form-urlencoded
requests.post(url='xxx',data={'name':'alex','age':18})
# Pre-encoded string payload: the body is sent as-is and requests does not
# add a Content-Type header for string bodies.
requests.post(url='xxx',data="name=alex&age=18")
# Half-way form: the body is JSON text, but because it is passed through
# data= as a string, no JSON Content-Type is declared.
requests.post(url='xxx',data=json.dumps({'name':'alex','age':18}))
# Declare the type explicitly through headers.  The header name is
# "Content-Type" (with a hyphen); "Content_type" would be sent as an
# unrelated custom header and the server would not see a JSON content type.
requests.post(url='xxx',data=json.dumps({'name':'alex','age':18}),headers={'Content-Type':'application/json'})  # Content-Type: application/json
# json= serialises the object and sets Content-Type: application/json itself.
requests.post(url='xxx',json={'name':'alex','age':18})


"""
1.method
2.url
3.params
4.data
5.json
6.headers
7.cookies

8.files
9.auth
10.timeout
11.allow_redirects
12.proxies
13.stream
14.cert

=================== session,保存請求相關信息  ==================
session = requests.Session()
session.get(url='xxx')
session.post(...)
"""

"""
8.files 用做文件上傳
"""
# Upload the local file "readme" as the multipart form field "f1".
# The keyword argument is ``files`` (plural); ``file=`` is not accepted by
# requests and raises TypeError.  The handle is opened in a ``with`` block so
# it is closed once the request has been sent.
with open('readme', 'rb') as readme_file:
    file_dict = {
        'f1': readme_file
    }
    requests.post(url='xxx',files=file_dict)
# 發送文件，定製文件名
# file_dict = {
#   'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 發送文件，定製文件名
# file_dict = {
#   'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)



"""
9.auth  基本認證    路由器登陸
"""
from requests.auth import HTTPBasicAuth,HTTPDigestAuth

# HTTP Basic authentication: the credentials are wrapped in HTTPBasicAuth and
# handed to the request through the auth= argument.
requests.get('https://api.github.com/user',auth=HTTPBasicAuth('gypsying','password'))


"""
timeout     (鏈接超時，響應超時)
"""
# A single number is given as the timeout.
requests.get('http://google.com',timeout=3)
# Tuple form; per the note above the two values are
# (connect timeout, response/read timeout).
requests.get('http://google.com',timeout=(5,1))


"""
allow_redirects
"""

"""
proxies 應對IP被封的狀況
"""
# Two example proxy mappings.  Neither dict is passed to a request in this
# snippet; presumably they are meant for the proxies= argument — confirm.
proxyDict = {
    "http": "61.172.249.96:80",
    "https": "http://61.185.219.126:3128",
}
# NOTE(review): this variant is keyed by a full host URL rather than a scheme,
# which looks like a per-host proxy rule — verify against the requests docs.
proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

"""
stream
"""
from contextlib import closing
# The request is made with stream=True and wrapped in closing(), so the
# response's close() is called when the with-block exits.  Each chunk yielded
# by iter_content() is printed as it arrives.
with closing(requests.get('xxx',stream=True)) as f:
    for i in f.iter_content():
        print(i)




# Remaining verb helpers, shown here without arguments as placeholders.
requests.put()
requests.delete()





BeautifulSoup
	- find()
	- find_all()
	- get()
	- attrs
	- text

soup = BeautifulSoup('html格式字符串','html.parser')
soup = BeautifulSoup('html格式字符串',features='lxml')	第三方，需額外安裝，可是速度比'html.parser'更快


soup = BeautifulSoup('html格式字符串','html.parser')
tag = soup.find(attrs={'class':'c1'})
tag.name  ->  標籤名字

tag = soup.find(attrs={'class':'c1'})
等價於：
tag = soup.find(class_='c1')

print(tag.attrs)

tag.attrs['id'] = 1
del tag.attrs['class']
# attrs 進行增刪改查均可以


tag.children  	全部孩子
tag.descendants	全部後代
tag.find_all()	包含的全部標籤，而且遞歸了
tag.find_all(recursive=False)	包含的全部標籤，不遞歸

tag.clear()		清空內部元素，保留本身
tag.decompose()	遞歸刪除全部標籤，包含本身
res = tag.extract()	至關於字典的pop，其他同decompose()


tag = soup.find(class_='c1')	# 對象
tag.decode()	# 對象變成字符串
tag.encode()	# 對象變成字節

tag.find('a')
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)

find_all()
# tags = soup.find_all('a')
# print(tags)
 
# tags = soup.find_all('a',limit=1)
# print(tags)
 
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
 
 
# ####### 列表 #######
# v = soup.find_all(name=['a','div'])
# print(v)
 
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
 
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
 
 
# v = soup.find_all(id=['link1','link2'])
# print(v)
 
# v = soup.find_all(href=['link1','link2'])
# print(v)
 
# ####### 正則 #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
 
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
 
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
 
# ####### 方法篩選 #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)
 
 
# ## get,獲取標籤屬性
# tag = soup.find('a')
# v = tag.get('id')
# print(v)


from bs4.element import Tag

tag.has_attr()
tag.text  等價於 tag.get_text()


v = tag.index(tag.find('div'))


tag.text
tag.string 也能夠獲取內容，並擴展了修改內容
tag.string = "xxxx"
tag.stripped_strings 生成器：遞歸產出標籤內的所有文本片段（已去除首尾空白），相當於把 tag.text 拆分成了 list
tag.children
for item in tag.children:
	print(item,type(item))




from bs4.element import Tag
tag= Tag(name='i',attrs={'id':'it'})
tag.string = "asasasasasasazxzxzx"


soup.find(id='xxx').append(tag)




""" 擴展copy模塊 """
import copy
copy.deepcopy()
...



tag.wrap(tag1)
tag.unwrap()

++++++++++++++++++++++++++++++++++++


內容梳理：
	- 汽車之家新聞爬取示例
	- github和抽屜自動登錄  以及 登錄後的操作
	- requests 和 BeautifulSoup 基本使用
	- 輪詢和長輪詢
	- Django 裏 content-type問題
		request.POST 
		request.body






練習：web微信
	1. 二維碼顯示
	2. 長輪詢 check_login() ：ajax遞歸  （js遞歸沒有層數限制）
	3. 檢測是否已經掃碼
		- 掃碼以後201：替換頭像 base64:...
		src="img_path"
		或者
		src="base64:xxxxxxxx...."
		- 掃碼以後繼續輪詢，等待用戶點擊確認
		- 點擊確認以後，返回200 
			response.text redirect_uri=....
		- 獲取最近聯繫人信息






下節課前安裝
	twisted
	scrapy框架

服務器端不能主動給客戶端發消息
但是 websocket 可以

- 【輪詢】     	http協議，客戶端輪詢（每秒1次）請求服務端；一次請求，服務端收到後不管有沒有新消息都立即返回
- 【長輪詢】 	http協議，客戶端發來請求，服務器把客戶端給hang住，直到服務端收到新消息並發送給所有客戶端、才斷開連接；
				客戶端收到消息後，再立即發請求到服務端進行下一次hang住。
				hang住，有一個超時時間，web微信超時時間是25s
				應用：web微信
- 【WebSocket】	不是http協議，建立在tcp之上
				一次連接不斷開，雙工通道，可以互相發送消息
				但是瀏覽器兼容性不太好，以後將會應用得更普遍

1、爬蟲幾點基礎知識

- 基本操做
	- 概要
		- 發送http請求	requests模塊
		- 提取指定信息 	正則	Beautifulsoup模塊
		- 數據持久化

	- Python的2個模塊
		- requests
		- Beautifulsoup

	- Http請求相關知識
		- 請求
			- 請求頭 
				- cookie
			- 請求體 
				- 發送的內容
		- 響應 
			- 響應頭 
				- 瀏覽器讀取
			- 響應體
				- 看到的內容

		- 特殊
			- cookie
			- csrf_token
			- content-type 用來指定客戶端按照哪一種格式進行解析


- 性能相關
	- 進程
	- 線程
	- 協程

	- 【協程】異步非阻塞：充分利用系統資源


- scrapy框架
	- 學習scrapy的規則


- redis&scrapy組件：完成一個簡單的分佈式爬蟲

2、爬取汽車之家新聞示例

#!/usr/bin/python
# -*- coding:utf-8 -*-

"""
爬取汽車之家的新聞
"""
import os
import requests
from bs4 import BeautifulSoup

# Fetch the Autohome news index page.
response = requests.get('http://www.autohome.com.cn/news/')
# Use the encoding detected from the body (apparent_encoding); otherwise
# response.text may come out garbled.
# For reference: response.text is str, response.content is bytes.
response.encoding = response.apparent_encoding

# BeautifulSoup turns the HTML into a tree of tag objects, so elements can be
# navigated with find()/find_all() and read through attributes.
soup = BeautifulSoup(response.text,'html.parser')
tag = soup.find(name='div',attrs={'id':'auto-channel-lazyload-article'})

# Images are written under ./autohome; create the directory up front so the
# first open() below does not fail with FileNotFoundError.
os.makedirs('autohome', exist_ok=True)

li_list = tag.find_all('li') # [tag object, tag object, ...]
for li in li_list:
    h3 = li.find(name='h3')
    if h3 is None:
        # <li> without a headline is not a news entry; skip it.
        continue

    print(h3.text)
    # Read an attribute; li.find(name='a').attrs['href'] is equivalent.
    print(li.find(name='a').get('href'))
    print(li.find('p').text)

    # Download the thumbnail, if the entry has one.
    img = li.find('img')
    img_url = img.get('src') if img is not None else None
    if not img_url:
        continue
    print(img_url)
    # The page uses scheme-relative URLs ("//host/path"), so prepend a scheme.
    res = requests.get('http:'+img_url)
    img_path = os.path.join('autohome',img_url.split('/')[-1])
    with open(img_path,'wb') as fw:
        fw.write(res.content)

一抹紅的專屬感 Macan Turbo特別版官圖
//www.autohome.com.cn/news/201710/908351.html#pvareaid=102624
[汽車之家 新車官圖]  日前，保時捷發佈了Macan Turbo Exclusive Performance Edition的官圖，做爲一款特別版車...
//www3.autoimg.cn/newsdfs/g10/M0F/B2/EA/120x90_0_autohomecar__wKgH0VnqsC6AYGDFAAGFLm8dSfc007.jpg
還要怎麼輕？ 路特斯Elise Cup 260官圖
//www.autohome.com.cn/news/201710/908350.html#pvareaid=102624
[汽車之家 新車官圖]  日前，路特斯官方宣佈推出Elise Cup 260，這款車相比於已經進行進一步輕量化改造的新款Cup 250要更輕更快，全球...
//www3.autoimg.cn/newsdfs/g18/M0C/B9/7A/120x90_0_autohomecar__wKgH6FnqrhyAH3UDAAFOwoge9w4751.jpg
...

3、自動登陸網站示例

參考：http://www.cnblogs.com/wupeiqi/articles/6283017.html

　　- .2種網站受權登陸的方式

requests.get()  +  requests.post()

    - 方式1

　　　　1.第一次GET請求獲取token

　　　　2.第二次POST請求進行驗證並獲取cookie

　　　　3.第三次GET/POST請求並攜帶cookie實現用戶登陸後的某些操做

 
    - 方式2

　　　　1.第一次GET請求獲取token和未被受權的cookie

　　　　2.第二次POST請求並攜帶cookie進行驗證並受權

　　　　3.第三次GET/POST請求並攜帶受權過的cookie實現用戶登陸後的某些操做

另外可以使用 requests.Session() 更簡單地實現：

session = requests.Session()

session.get()  + session.post()

　　- .自動登錄Github並瀏覽個人主頁

#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup


"""
第二種Python登陸的cookie攜帶方式
以登陸 github帳戶爲例：
    - 第一次去請求 https://github.com/login 這個頁面的時候，服務端就給返回了cookie
    - 第二次去請求 https://github.com/session 進行提交用戶名密碼的時候，要帶上上一次返回的cookie進行受權
    - 第三次去請求用戶登陸後才能看到的頁面（例如我的主頁），須要帶上上面受權好的cookie，才能夠
"""

""" 1.獲取token和cookie """
# Step 1: load the login page to obtain the CSRF token and the initial cookies.
rsp1 = requests.get(url='https://github.com/login')
soup1 = BeautifulSoup(rsp1.text,'html.parser')
# The token lives in the "value" attribute of the element named authenticity_token.
token_input = soup1.find(attrs={'name':'authenticity_token'})
token = token_input.get('value')
# Cookies handed out by the first (unauthenticated) request.
rsp1_cookie_dict = rsp1.cookies.get_dict()
print(token)
print(rsp1_cookie_dict)

# Step 2: submit the login form, sending the cookies from step 1 along.
login_form = {
    'commit':'Sign in',
    'utf8':'✓',
    'authenticity_token':token,
    'login':'gypsying',
    'password':'xxxxxxxxx',
}
rsp2 = requests.post(
    url='https://github.com/session',
    data=login_form,
    cookies=rsp1_cookie_dict
)
# Cookies returned by the login request.
rsp2_cookie_dict = rsp2.cookies.get_dict()
print(rsp2_cookie_dict)

# Merge both cookie sets; values from the login response win on conflicts.
all_cookie_dict = dict(rsp1_cookie_dict)
all_cookie_dict.update(rsp2_cookie_dict)

print(all_cookie_dict)

# Step 3: request the profile page with the merged cookies.
rsp3 = requests.get(
    url='https://github.com/Gypsying',
    cookies=all_cookie_dict
)

soup3 = BeautifulSoup(rsp3.text,'html.parser')
email_link = soup3.find(name='a',attrs={'class':'u-email'})
email = email_link.text
print(email)  # the e-mail address shown on the profile page

　　- .自動登陸抽屜並實施點贊操做

import requests
from bs4 import BeautifulSoup

index_url = "http://dig.chouti.com/"
rsp1 = requests.get(index_url)

soup = BeautifulSoup(rsp1.text,'html.parser')
a_list = soup.find_all(attrs={'class':'digg-a'})
# Collect the id of every news item on the front page.
id_list = [item.find(name='i').text for item in a_list]

# Cookie returned by the front-page GET; at this point it is not yet authorised.
index_cookie = rsp1.cookies.get_dict()
login_url = "http://dig.chouti.com/login"
data = {
    'phone':8600000000000,
    'password':'xxxxxx',
    'oneMonth':1
}
# Submit the credentials together with the unauthorised cookie so that the
# server authorises that same cookie.
login_ret = requests.post(url=login_url,data=data,cookies=index_cookie)
# Kept for inspection only; the vote requests below send index_cookie.
login_cookie = login_ret.cookies.get_dict()
# Parse the body as JSON.  Never eval() a server response: it executes
# arbitrary code and also fails on JSON literals such as true/false/null.
login_result = login_ret.json()
code = login_result.get('result', {}).get('code')
if "9999"  == code:
    print("登陸成功")
else:
    print("登陸失敗")
# Sample login responses:
#   {"result":{"code":"8887", "message":"手機號格式不對", "data":""}}
#   {"result":{"code":"21100", "message":"該手機號未註冊", "data":""}}
#   {"result":{"code":"29998", "message":"手機號或密碼錯誤", "data":{}}}
#   {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_50613120077"}}}

# Voting must carry the cookie that the login request authorised (index_cookie).
for news_id in id_list:
    like_url = "http://dig.chouti.com/link/vote?linksId={}".format(news_id)
    like_ret = requests.post(url=like_url,cookies=index_cookie)
    print(like_ret.text)

"""
{"result":{"code":"30010", "message":"您已經推薦過了", "data":""}}
{"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_50613120077","likedTime":"1509378903908000","lvCount":"8","nick":"gypsy","uvCount":"1","voteTime":"小於1分鐘前"}}}
"""

4、模擬Web版微信相關操做

"""
微信網頁版登陸示例

GET        https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb&redirect_uri=https%3A%2F%2Fwx.qq.com%2Fcgi-bin%2Fmmwebwx-bin%2Fwebwxnewloginpage&fun=new&lang=zh_CN&_=1508052025433
獲得響應：   window.QRLogin.code = 200; window.QRLogin.uuid = "IapQqsoqcA==";

二維碼src   https://login.weixin.qq.com/qrcode/IapQqsoqcA==

長輪詢：     https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?loginicon=true&uuid=IapQqsoqcA==&tip=0&r=-518626217&_=1508052025438
"""

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。