---恢復內容開始---
'''
今日內容:
1 requests之POST請求
session
cookie
token
2 requests高級用法
3 selenium模塊
'''
'''
# 1 requests之POST請求
請求url:
請求方式: POST
請求頭:
referer:(上一次請求)
user-agent:
請求體:
只有post請求才會有請求體
commit: Sign in
utf8:✓
login:
password:
webauthn-support: unsupported
'''
import requests
import re
# 1 Visit the login page to obtain the CSRF token
'''
Request url: https://github.com/login
Request method: GET
Response headers: Set-Cookie
Request headers: Cookie, User-Agent
'''
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}

# BUG FIX: the original requested a cnblogs homepage here but then tried to
# extract GitHub's authenticity_token from it, so re.findall(...)[0] raised
# IndexError. The token only exists on GitHub's own login page.
response = requests.get(url='https://github.com/login', headers=headers)
# print(response.text)

# Turn the cookies set by the login page into a plain dict so they can be
# forwarded with the POST below.
login_cookies = response.cookies.get_dict()

# Extract the hidden CSRF token embedded in the login form.
authenticity_token = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)" />',
    response.text, re.S)[0]
print(authenticity_token)

# 2 POST the credentials to /session
'''
Request url: https://github.com/session
Request method: POST
Request headers: referer (https://github.com/login), Cookie, User-Agent
Request body (only POST requests carry one):
    commit, utf8, authenticity_token, login, password, webauthn-support
'''
# Assemble the request headers
headers2 = {
    'referer': 'http://github.com/login',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
}

# Assemble the request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "tankjam",
    "password": "kermit46709394",
    "webauthn-support": "unsupported",
}

# Send the POST request to the session endpoint, carrying the headers, the
# form body and the cookies obtained from the login page.
response2 = requests.post(url='https://github.com/session',
                          data=form_data,
                          headers=headers2,
                          cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
# response響應
import requests

response = requests.get('https://baidu.com')
# response attributes
print(response.status_code)          # HTTP status code
print(response.url)                  # final url after redirects
print(response.encoding)             # character encoding
response.encoding = 'utf-8'
print(response.text)                 # body decoded as text
print(response.content)              # raw bytes
print(response.headers)              # response headers
print(response.history)              # redirect history (previous responses)
# 1 cookies object  2 cookies as dict / items
print(response.cookies)              # cookies object
print(response.cookies.get_dict())   # cookies converted to a dict
print(response.cookies.items())      # cookies as (name, value) pairs
print(response.encoding)
print(response.elapsed)              # time taken by the request


import requests
# Send a GET request to the media url
url = 'https://vd2.bdstatic.com/mda-ifjegte8t7bxuzbb/hd/mda-ifjegte8t7bxuzbb.mp4'
# stream=True defers the download so the body can be iterated chunk by chunk
response = requests.get(url, stream=True)
# BUG FIX: the original printed response.content here, which downloads the
# entire file into memory and defeats the purpose of stream=True.

with open('像我這樣的人.mp4', 'wb') as f:
    # BUG FIX: iter_content() with no chunk_size yields ONE byte per
    # iteration; a real chunk size makes the copy loop dramatically faster.
    for content in response.iter_content(chunk_size=8192):
        f.write(content)
#2 requests高級用法
'''
# 1 https = http + ssl (carries a certificate)
# Certificate verification (most sites are https)
'''
import requests
# For an ssl request the certificate is checked first; an invalid
# certificate raises an error and aborts the program.
response = requests.get('https://www.xiaohuar.com')
print(response.status_code)

# Improvement 1: suppress the error, but a warning is still emitted
import requests
response = requests.get('https://www.xiaohuar.com', verify=False)
# skip certificate verification: warns, returns 200
print(response.status_code)

# Improvement 2: suppress both the error and the warning
import requests
import urllib3
urllib3.disable_warnings()  # silence the InsecureRequestWarning
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)

# Improvement 3: attach a certificate
# Many https sites (zhihu, baidu, ...) can be visited with or without a
# certificate; some sites strictly require one — e.g. targeted users only
# gain access to a particular site after receiving its certificate.
import requests
import urllib3
# urllib3.disable_warnings()  # silence the warning
# pseudo-code
response = requests.get(
    'https://www.xiaohuar.com',
    # verify=False,
    cert=('/path/server.crt', '/path/key'))
print(response.status_code)


'''
# 2 timeout settings
# Two forms: float or tuple
# timeout=0.1        -> timeout for receiving data
# timeout=(0.1, 0.2) -> 0.1 connect timeout, 0.2 receive timeout
'''
import requests

# NOTE: this deliberately tiny timeout raises ConnectTimeout — it is a demo
# of the timeout mechanism, not a request expected to succeed.
response = requests.get('https://www.baidu.com',
                        timeout=0.0001)


'''
# 3 using proxies
# docs: http://docs.python-requests.org/en/master/user/advanced/#proxies

# Proxy setup: the request goes to the proxy first, which forwards it on
# our behalf (getting an ip banned is a common occurrence).
'''
import requests
proxies = {
    # Proxy with username and password; they go before the @ sign.
    # BUG FIX: the original dict repeated the 'http' key, so the
    # authenticated entry below was silently overwritten by the next line —
    # dict literals keep only the last value for a duplicated key.
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}
response = requests.get('https://www.12306.cn',
                        proxies=proxies)
print(response.status_code)


'''
Crawl the xici free-proxy site:
1. visit the xici free-proxy page
2. parse and extract every proxy with the re module
3. test each scraped proxy against an ip-test site
4. if test_ip raises, the proxy is dead; otherwise it works
5. use the working proxies for a real crawl

<tr class="odd">
  <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
  <td>112.85.131.99</td>
  <td>9999</td>
  <td>
    <a href="/2019-05-09/jiangsu">江蘇南通</a>
  </td>
  <td class="country">高匿</td>
  <td>HTTPS</td>
  <td class="country">
    <div title="0.144秒" class="bar">
      <div class="bar_inner fast" style="width:88%">
      </div>
    </div>
  </td>
  <td class="country">
    <div title="0.028秒" class="bar">
      <div class="bar_inner fast" style="width:97%">
      </div>
    </div>
  </td>
  <td>6天</td>
  <td>19-05-16 11:20</td>
</tr>
re:
<tr class="odd">(.*?)</td>.*?<td>(.*?)</td>
'''
# import requests
# import re
# import time
#
# HEADERS = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
# }
#
#
# def get_index(url):
#     """Fetch one listing page (politely sleeping 1s between requests)."""
#     time.sleep(1)
#     response = requests.get(url, headers=HEADERS)
#     return response
#
#
# def parse_index(text):
#     """Yield 'ip:port' strings extracted from a listing page."""
#     ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
#     for ip_port in ip_list:
#         ip = ':'.join(ip_port)
#         yield ip
#
#
# def test_ip(ip):
#     """Return ip if it works as an https proxy, else print the error."""
#     print('測試ip: %s' % ip)
#     try:
#         proxies = {
#             'https': ip
#         }
#
#         # ip-test site
#         ip_url = 'https://www.ipip.net/'
#
#         # Hit the test site through the proxy; 200 means the proxy works.
#         response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
#
#         if response.status_code == 200:
#             print(f'有用的ip:{ip}')
#             return ip
#
#     # A dead proxy raises; report it and fall through (returns None).
#     except Exception as e:
#         print(e)
#
#
# def spider_nba(good_ip):
#     """Crawl the nba site through a verified proxy."""
#     url = 'https://china.nba.com/'
#
#     proxies = {
#         'https': good_ip
#     }
#
#     response = requests.get(url, headers=HEADERS, proxies=proxies)
#     print(response.status_code)
#     print(response.text)
#
#
# if __name__ == '__main__':
#     base_url = 'https://www.xicidaili.com/nn/{}'
#
#     for line in range(1, 3677):
#         ip_url = base_url.format(line)
#
#         response = get_index(ip_url)
#
#         # Parse the listing page into individual proxies
#         ip_list = parse_index(response.text)
#
#         # Test every proxy
#         for ip in ip_list:
#             good_ip = test_ip(ip)
#
#             if good_ip:
#                 # Working proxy — use it for the real crawl
#                 spider_nba(good_ip)


'''
5 authentication
'''
import requests
# Tested against the github api
url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Test 1: no credentials — fails with 401
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
'''
output:
{
  "message": "Requires authentication",
  "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
}
'''
#
# # Test 2: HTTPBasicAuth from requests.auth; success returns the user info
# from requests.auth import HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
# print(response.text)
#

# Test 3: the auth parameter of requests.get defaults to HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
# print(response.text)


'''
# 6 uploading files
'''

# import requests
#
# # Upload a text file
# files1 = {'file': open('user.txt', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files1)
# print(response.status_code)  # 200
# print(response.text)  # 200

# # Upload an image
# files2 = {'jpg': open('小狗.jpg', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files2)
# print(response.status_code)  # 200
# print(response.text)  # 200
#
# # Upload a video
# files3 = {'movie': open('love_for_GD.mp4', 'rb')}
#
# response = requests.post('http://httpbin.org/post', files=files3)
# print(response.status_code)  # 200
# print(response.text)  # 200
# 3 selenium模塊講解
'''
1 什麼是selenium?
最初是一個自動化測試工具,能夠使用它幫咱們驅動瀏覽器
自動去執行某些自定義好的操做。例如在頁面中執行JS代碼、跳過登陸驗證。
2 爲何要使用selenium
1)優勢:使用requests模塊登陸須要分析大量的複雜通訊流程,使用selenium能夠輕鬆跳過登陸驗證
2)缺點:瀏覽器會加載css、js、圖片、視頻...數據,爬蟲效率相比requests模塊要低
3 如何使用selenium?
下載selenium模塊: pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
  下載瀏覽器驅動: https://npm.taobao.org/mirrors/chromedriver/2.38/
'''
# selenium: first steps
from selenium import webdriver  # drives the browser
# Action-chain object: used e.g. to drag captcha images when cracking
# slider verification codes.
from selenium.webdriver import ActionChains
# Lookup strategies: By.ID, By.CSS_SELECTOR, By.CLASS_NAME, ...
from selenium.webdriver.common.by import By
# BUG FIX: the module name is lowercase `keys`; the original
# `from selenium.webdriver.common.Keys import Keys` raises ImportError.
from selenium.webdriver.common.keys import Keys  # keyboard key constants
# BUG FIX: the code below refers to `EC`, so the import needs that alias;
# the original imported `expected_conditions` without `as EC`.
from selenium.webdriver.support import expected_conditions as EC
# Wait for elements on the page to load
from selenium.webdriver.support.wait import WebDriverWait
import time

# Absolute path of chromedriver.exe (also works if placed in the Python
# interpreter's Scripts folder, in which case webdriver.Chrome() suffices).
CHROMEDRIVER_PATH = r'D:\chromedriver_win32\chromedriver.exe'


def demo_open_blog():
    """Open a blog homepage, pause, and always close the browser."""
    chrome = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        chrome.get('https://cnblogs.com/feiyufei/')
        time.sleep(3)
    # Whatever happens, close the browser
    finally:
        chrome.close()


def demo_baidu_search():
    """Search baidu for '一拳超人' using an explicit wait."""
    # BUG FIX: the original reused the driver already closed by the previous
    # try/finally, so every later chrome.get() failed on a dead session.
    # Each demo now owns a fresh driver instance.
    chrome = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        # arg 1: driver object  arg 2: maximum wait time in seconds
        wait = WebDriverWait(chrome, 10)

        # 1. visit baidu
        chrome.get('https://www.baidu.com/')

        # 2. locate the input box; presence_of_element_located takes a tuple
        #    of (lookup strategy, attribute value)
        input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))

        # 3. search for 一拳超人
        input_tag.send_keys('一拳超人')

        # 4. press the Enter key
        input_tag.send_keys(Keys.ENTER)
        time.sleep(3)
    finally:
        chrome.close()


def demo_jd_search():
    """Search jd.com for '唐詩三百首' and click the search button."""
    chrome = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        wait = WebDriverWait(chrome, 10)

        # 1. visit the jd homepage
        chrome.get('https://www.jd.com/')

        # 2. locate the input box
        input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))

        # 3. search for 唐詩三百首
        input_tag.send_keys('唐詩三百首')

        # 4. locate the search button by its class attribute
        search_button = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'button')))

        # 5. click it
        search_button.click()
        time.sleep(3)
    finally:
        chrome.close()


demo_open_blog()
demo_baidu_search()
demo_jd_search()
# selenium之基本選擇器
from selenium import webdriver  # drives the browser
import time

'''
Implicit wait demo
'''
# Obtain a driver object
browser = webdriver.Chrome()
try:
    # Explicit wait alternative (waits for one specific element):
    # wait = WebDriverWait(chrome, 10)
    browser.get('https://china.nba.com/')

    # Implicit wait: applies while looking up any element on the page
    browser.implicitly_wait(10)

    news_tag = browser.find_element_by_class_name('nav-news')
    print(news_tag)           # the element object itself
    print(news_tag.tag_name)  # its tag name

    time.sleep(10)
finally:
    browser.close()


from selenium import webdriver  # drives the browser
import time

'''
=============== all lookup methods ===================
element finds one tag; elements finds them all
1. find_element_by_link_text        by full link text
2. find_element_by_id               by id
3. find_element_by_class_name
4. find_element_by_partial_link_text
5. find_element_by_name
6. find_element_by_css_selector
7. find_element_by_tag_name
'''
# Obtain a driver object
browser = webdriver.Chrome()
try:
    # Request the baidu homepage
    browser.get('https://www.baidu.com/')
    browser.implicitly_wait(10)

    # 1. find_element_by_link_text — by the link's full text ('登陸')
    # send_tag = browser.find_element_by_link_text('登陸')
    # send_tag.click()

    # 2. find_element_by_partial_link_text — an <a> tag by partial text
    login_button = browser.find_element_by_partial_link_text('登')
    login_button.click()
    time.sleep(1)

    # 3. find_element_by_class_name — by class attribute
    login_tag = browser.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. find_element_by_name — by name attribute
    username = browser.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. find_element_by_id — by id attribute
    password = browser.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. find_element_by_css_selector — by css selector (id selector here)
    login_submit = browser.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    # browser.find_element_by_css_selector('.pass-button-submit')
    login_submit.click()

    # 7. find_element_by_tag_name — by tag name
    div = browser.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)
finally:
    browser.close()
做業: 爬取快代理(參考爬取西刺代理代碼) https://www.kuaidaili.com/free/
熟悉selenium模塊,敲課上例子
自動登陸抽屜新熱榜
'''
from selenium import webdriver
import time

browser = webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe')

# Maximize the browser window
browser.maximize_window()

try:
    browser.get('https://dig.chouti.com/')
    browser.implicitly_wait(10)
    time.sleep(5)

    # 1. click the login button
    login_btn = browser.find_element_by_id('login_btn')
    login_btn.click()
    time.sleep(2)

    # 2. type the phone number
    phone = browser.find_element_by_class_name('login-phone')
    phone.send_keys('15622792660')

    # 3. type the password
    pwd = browser.find_element_by_class_name('pwd-password-input')
    pwd.send_keys('kermit46709394')

    # 4. confirm the login
    login_submit = browser.find_element_by_class_name('btn-large')
    login_submit.click()
    time.sleep(20)

# Catch and print any failure
except Exception as e:
    print(e)
finally:
    browser.close()
---恢復內容結束---