Learning Python — day five.

---恢復內容開始---

'''
今日內容:
1 requests之POST請求
session
cookie
token
2 requests高級用法
3 selenium模塊

'''

'''
# 1 requests之POST請求
請求url:
請求方式: POST
請求頭:
referer:(上一次請求)
user-agent:
請求體:
只有post請求才會有請求體
commit: Sign in
utf8:✓

login:
password:
webauthn-support: unsupported


'''
import requests
import re

# Step 1: GET the GitHub login page to harvest the CSRF token and cookies.
'''
Request url: https://github.com/login
Request method: GET
Response header: Set-Cookie
Request headers: Cookie
                 User-Agent
'''
headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safar'
                 'i/537.36'
}
# BUG FIX: the original fetched http://home.cnblogs.com/u/feiyufei/, which has no
# authenticity_token form field, so re.findall(...)[0] below raised IndexError.
# The login flow needs the GitHub login page (see the POST to /session further down).
response=requests.get(url='https://github.com/login', headers=headers)
# print(response.text)
# Convert the cookies set by the login page into a plain dict.
login_cookies = response.cookies.get_dict()

# Extract the hidden CSRF token embedded in the login form.
authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />',response.text, re.S)[0]
print(authenticity_token)

# Step 2: POST the credentials to GitHub's session endpoint.
'''
Request url: http://github.com/session
Request method: POST
Request headers: referer: http://github.com/login
         Cookie:......
         User-Agent:
Request body: (only POST requests carry a body)
         commit: Sign in
         utf8:
         authenticity_token:

         login:
         password: ******
         webauthn-support: unsupported

'''

# Build the request headers.
headers2={
    'referer':'http://github.com/login',
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
}
# Build the form body.
# NOTE(review): credentials are hard-coded in plain text — acceptable only in a
# throwaway tutorial script, never in real code.
form_data={
    "commit":"Sign in",
    "utf8": "",
    "authenticity_token": authenticity_token,
    "login": "tankjam",
    "password": "kermit46709394",
    "webauthn-support": "unsupported",
}
# POST to the session endpoint, carrying the headers, the form body and the
# cookies captured from the login page.
response2 = requests.post(url='https://github.com/session', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
# response attribute walkthrough
# (recovered from a garbled paste: the leading "NN" listing numbers were scrape
# artifacts, not code, and made this section a syntax error)
import requests

response = requests.get('https://baidu.com')
# response attributes
print(response.status_code)         # HTTP status code
print(response.url)                 # final URL
print(response.encoding)            # detected character encoding
response.encoding = 'utf-8'
print(response.text)                # body decoded as text
print(response.content)             # raw body bytes
print(response.headers)             # response headers
print(response.history)             # redirect history
# cookies: as a jar object, as a dict, or as (name, value) pairs
print(response.cookies)             # cookie jar object
print(response.cookies.get_dict())  # cookies as a dict
print(response.cookies.items())     # cookies as (name, value) pairs
print(response.encoding)
print(response.elapsed)             # time the request took


import requests
# GET the video URL with a streaming body
url = 'https://vd2.bdstatic.com/mda-ifjegte8t7bxuzbb/hd/mda-ifjegte8t7bxuzbb.mp4'
response = requests.get(url, stream=True)  # stream=True: iterate the body chunk by chunk
# NOTE(fix): printing response.content here would download and cache the entire
# body up front, defeating stream=True, so that print was dropped.

with open('像我這樣的人.mp4', 'wb') as f:
    for content in response.iter_content():
        f.write(content)
 

# 2 requests高級用法

  1 '''
  2 # 1 http://http+ssl(攜帶證書)
  3 #證書驗證(大部分網站都是https)
  4 '''
  5 import requests
  6 # 若是是ssl請求,首先檢查證書是否合法,不合法則報錯,程序終端
  7 response = requests.get('https://www.xiaohuar.com')
  8 print(response.status_code)
  9 
 10 # 改進1:去掉報錯,可是會報警告
 11 import requests
 12 response = requests.get('https://www.xiaohuar.com', verify=False)
 13 # 不驗證證書,報警告,返回200
 14 print(response.status_code)
 15 
 16 # 改進2:去掉報錯,而且去掉警報信息
 17 import requests
 18 import urllib3
 19 urllib3.disable_warnings()  # 關閉警告
 20 response = requests.get('https://www.xiaohuar.com', verify=False)
 21 print(response.status_code)
 22 
 23 # 改進3:加上證書
 24 # 不少網站都是https,可是不用證書也能夠訪問,大多數狀況都是能夠攜帶也能夠不攜帶證書
 25 # 知乎\百度等都是可帶可不帶
 26 # 有硬性要求的,則必須帶,好比對於定向的用戶,拿到證書後纔有權限訪問某個特定網站
 27 import requests
 28 import urllib3
 29 # urllib3.disable_warnings()  # 關閉警告
 30 # 僞代碼
 31 response = requests.get(
 32     'https://www.xiaohuar.com',
 33     # verify=False,
 34     cert=('/path/server.crt', '/path/key'))
 35 print(response.status_code)
 36 
 37 
 38 
 39 '''
 40 # 2 超時設置
 41 # 超時設置
 42 # 兩種超時:float or tuple
 43 # timeout=0.1  # 表明接收數據的超時時間
 44 # timeout=(0.1,0.2)  # 0.1表明連接超時  0.2表明接收數據的超時時間
 45 '''
 46 import requests
 47 
 48 response = requests.get('https://www.baidu.com',
 49                         timeout=0.0001)
 50 
 51 
 52 
 53 '''
 54 # 3 使用代理
 55 # 官網連接: http://docs.python-requests.org/en/master/user/advanced/#proxies
 56 
 57 # 代理設置:先發送請求給代理,而後由代理幫忙發送(封ip是常見的事情)
 58 '''
 59 import requests
 60 proxies={
 61     # 帶用戶名密碼的代理,@符號前是用戶名與密碼
 62     'http':'http://tank:123@localhost:9527',
 63     'http':'http://localhost:9527',
 64     'https':'https://localhost:9527',
 65 }
 66 response=requests.get('https://www.12306.cn',
 67                      proxies=proxies)
 68 print(response.status_code)
 69 
 70 
 71 
 72 '''
 73 爬取西刺免費代理:
 74     1.訪問西刺免費代理頁面
 75     2.經過re模塊解析並提取全部代理
 76     3.經過ip測試網站對爬取的代理進行測試
 77     4.若test_ip函數拋出異常表明代理做廢,不然代理有效
 78     5.利用有效的代理進行代理測試
 79 
 80 <tr class="odd">
 81       <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
 82       <td>112.85.131.99</td>
 83       <td>9999</td>
 84       <td>
 85         <a href="/2019-05-09/jiangsu">江蘇南通</a>
 86       </td>
 87       <td class="country">高匿</td>
 88       <td>HTTPS</td>
 89       <td class="country">
 90         <div title="0.144秒" class="bar">
 91           <div class="bar_inner fast" style="width:88%">
 92 
 93           </div>
 94         </div>
 95       </td>
 96       <td class="country">
 97         <div title="0.028秒" class="bar">
 98           <div class="bar_inner fast" style="width:97%">
 99 
100           </div>
101         </div>
102       </td>
103 
104       <td>6天</td>
105       <td>19-05-16 11:20</td>
106     </tr>
107 re:
108     <tr class="odd">(.*?)</td>.*?<td>(.*?)</td>
109 
110 '''
111 # import requests
112 # import re
113 # import time
114 #
115 # HEADERS = {
116 #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
117 # }
118 #
119 #
120 # def get_index(url):
121 #     time.sleep(1)
122 #     response = requests.get(url, headers=HEADERS)
123 #     return response
124 #
125 #
126 # def parse_index(text):
127 #     ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
128 #     for ip_port in ip_list:
129 #         ip = ':'.join(ip_port)
130 #         yield ip
131 #
132 # def test_ip(ip):
133 #     print('測試ip: %s' % ip)
134 #     try:
135 #         proxies = {
136 #             'https': ip
137 #         }
138 #
139 #         # ip測試網站
140 #         ip_url = 'https://www.ipip.net/'
141 #
142 #         # 使用有效與無效的代理對ip測試站點進行訪問,若返回的結果爲200則表明當前測試ip正常
143 #         response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
144 #
145 #         if response.status_code == 200:
146 #             print(f'有用的ip:{ip}')
147 #             return ip
148 #
149 #     # 若ip代理無效則拋出異常
150 #     except Exception as e:
151 #         print(e)
152 #
153 # # 使用代理爬取nba
154 # def spider_nba(good_ip):
155 #     url = 'https://china.nba.com/'
156 #
157 #     proxies = {
158 #         'https': good_ip
159 #     }
160 #
161 #     response = requests.get(url, headers=HEADERS, proxies=proxies)
162 #     print(response.status_code)
163 #     print(response.text)
164 #
165 #
166 # if __name__ == '__main__':
167 #     base_url = 'https://www.xicidaili.com/nn/{}'
168 #
169 #     for line in range(1, 3677):
170 #         ip_url = base_url.format(line)
171 #
172 #         response = get_index(ip_url)
173 #
174 #         # 解析西刺代理獲取每個ip列表
175 #         ip_list = parse_index(response.text)
176 #
177 #         # 循環每個ip
178 #         for ip in ip_list:
179 #             # print(ip)
180 #
181 #             # 對爬取下來的ip進行測試
182 #             good_ip = test_ip(ip)
183 #
184 #             if good_ip:
185 #                 # 真是代理,開始測試
186 #                 spider_nba(good_ip)
187 
188 
189 
190 
191 
'''
5 Authentication
'''
# (listing recovered from a garbled paste: embedded line numbers removed)
import requests
# exercise GitHub's API to demonstrate HTTP basic auth
url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Test 1: unauthenticated — returns 401
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
'''
Output:
    {
      "message": "Requires authentication",
      "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
    }
'''
#
# # Test 2: authenticate via requests.auth.HTTPBasicAuth; success returns the user info
# from requests.auth import HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
# print(response.text)
#

# Test 3: the auth parameter of requests.get defaults to HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
# print(response.text)


'''
# 6 File upload
'''
# (listing recovered from a garbled paste: embedded line numbers removed)

# import requests
#
# # upload a text file
# files1 = {'file': open('user.txt', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files1)
# print(response.status_code)  # 200
# print(response.text)

# # upload an image
# files2 = {'jpg': open('小狗.jpg', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files2)
# print(response.status_code)  # 200
# print(response.text)
#
# # upload a video
# files3 = {'movie': open('love_for_GD.mp4', 'rb')}
#
# response = requests.post('http://httpbin.org/post', files=files3)
# print(response.status_code)  # 200
# print(response.text)

# 3 selenium模塊講解

'''
1 什麼是selenium?
最初是一個自動化測試工具,能夠使用它幫咱們驅動瀏覽器
自動去執行某些自定義好的操做。例如在頁面中執行JS代碼、跳過登陸驗證。

2 爲何要使用selenium
1)優勢:使用requests模塊登陸須要分析大量的複雜通訊流程,使用selenium能夠輕鬆跳過登陸驗證
2)缺點:瀏覽器會加載css、js、圖片、視頻...數據,爬蟲效率相比requests模塊要低
3 如何使用selenium?
下載selenium模塊: pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
下載瀏覽器驅動: http://npm.taobao.org/mirrors/chromedriver/2.38/

'''
# selenium: first contact
from selenium import webdriver   # drives the browser

# ActionChains builds an action chain — used e.g. to drag slider-captcha images
from selenium.webdriver import ActionChains

# lookup strategies: By.ID, By.CSS_SELECTOR, By.CLASS_NAME, ...
from selenium.webdriver.common.by import By
# FIX: the module name is lowercase "keys" — importing "Keys" raised
# ModuleNotFoundError before the script could even start.
from selenium.webdriver.common.keys import Keys    # keyboard key constants

# FIX: the code below uses the EC alias (as the original comment intended),
# so the import must declare it.
from selenium.webdriver.support import expected_conditions as EC

# wait until specific elements appear on the page
from selenium.webdriver.support.wait import WebDriverWait

import time
# open Chrome via its driver
# webdriver.Chrome(r'absolute path to chromedriver.exe')
chrome=webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe')   # pass the absolute path of chromedriver.exe

# alternatively put chromedriver.exe in the interpreter's Scripts folder:
# chrome=webdriver.Chrome()



# if the try body raises
try:
    chrome.get('https://cnblogs.com/feiyufei/')
    time.sleep(3)

    # the browser is closed no matter what happens
finally:
    # close the browser
    chrome.close()


#
# 1 if the try body raises
try:
    # GET the blog homepage
    # chrome.get('https://www.cnblogs.com/feiyufei/')

    # arg1: driver object; arg2: timeout in seconds
    wait = WebDriverWait(chrome, 10)

    # 1. open Baidu
    chrome.get('https://www.baidu.com/')

    # 2. wait for the search input box
    input_tag = wait.until(
        # EC.presence_of_element_located takes a single locator tuple:
        # (lookup strategy, attribute value)
        EC.presence_of_element_located(
            (By.ID, "kw")
        )
    )
    # FIX: the original repeated the identical
    # wait.until(EC.presence_of_element_located((By.ID, "kw"))) on the next
    # line; the duplicate lookup was redundant and has been removed.

    # 3. search for One-Punch Man
    input_tag.send_keys('一拳超人')

    # 4. press the Enter key
    input_tag.send_keys(Keys.ENTER)

    time.sleep(3)

# the browser is closed no matter what happens
finally:
    # close the browser
    chrome.close()


# 2
try:
    # (previously: GET tank's blog homepage)
    # chrome.get('https://www.cnblogs.com/feiyufei/')

    # explicit-wait helper: polls the driver for up to 10 seconds
    waiter = WebDriverWait(chrome, 10)

    # 1. open the JD home page
    chrome.get('https://www.jd.com/')

    # 2. wait for the search box, then 3. type the query
    search_box = waiter.until(EC.presence_of_element_located((By.ID, "key")))
    search_box.send_keys('唐詩三百首')

    # 4. locate the search button by its class attribute
    submit_btn = waiter.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))
    # 5. click it
    submit_btn.click()

    time.sleep(3)

# the browser is closed no matter what happens
finally:
    # close the browser
    chrome.close()
# selenium: basic selectors

from selenium import webdriver  # drives the browser
import time

'''
Implicit waits
'''
# obtain the driver object
driver = webdriver.Chrome()

try:
    # an explicit wait targets one specific element:
    # wait = WebDriverWait(chrome, 10)

    driver.get('https://china.nba.com/')

    # an implicit wait applies to every element lookup on the page
    driver.implicitly_wait(10)

    nav_news_el = driver.find_element_by_class_name('nav-news')
    # the element object itself
    print(nav_news_el)
    # the element's tag name
    print(nav_news_el.tag_name)


    time.sleep(10)

finally:
    driver.close()


from selenium import webdriver  # 用來驅動瀏覽器的
import time

'''
===============全部方法===================
    element是查找一個標籤
    elements是查找全部標籤

    一、find_element_by_link_text  經過連接文本去找
    二、find_element_by_id 經過id去找
    三、find_element_by_class_name
    四、find_element_by_partial_link_text
    五、find_element_by_name
    六、find_element_by_css_selector
    七、find_element_by_tag_name
'''
# 獲取驅動對象、
driver = webdriver.Chrome()

try:

    # 往百度發送請求
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # 一、find_element_by_link_text  經過連接文本去找
    # 根據登陸
    # send_tag = driver.find_element_by_link_text('登陸')
    # send_tag.click()

    # 二、find_element_by_partial_link_text 經過局部文本查找a標籤
    login_button = driver.find_element_by_partial_link_text('')
    login_button.click()
    time.sleep(1)

    # 三、find_element_by_class_name 根據class屬性名查找
    login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 四、find_element_by_name 根據name屬性查找
    username = driver.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 五、find_element_by_id 經過id屬性名查找
    password = driver.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 六、find_element_by_css_selector  根據屬性選擇器查找
    # 根據id查找登陸按鈕
    login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    # driver.find_element_by_css_selector('.pass-button-submit')
    login_submit.click()

    # 七、find_element_by_tag_name  根據標籤名稱查找標籤
    div = driver.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)

finally:
    driver.close()
'''
做業:  爬取快代理(參考爬取西刺代理代碼) https://www.kuaidaili.com/free/
熟悉selenium模塊,敲課上例子
自動登陸抽屜新熱榜
'''
 
from selenium import webdriver
import time

driver = webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe')

# maximize the browser window
driver.maximize_window()

try:
    driver.get('https://dig.chouti.com/')
    driver.implicitly_wait(10)
    time.sleep(5)

    # 1. open the login dialog
    login_link = driver.find_element_by_id('login_btn')
    login_link.click()
    time.sleep(2)

    # 2. type the phone number
    phone_input = driver.find_element_by_class_name('login-phone')
    phone_input.send_keys('15622792660')

    # 3. type the password
    pwd_input = driver.find_element_by_class_name('pwd-password-input')
    pwd_input.send_keys('kermit46709394')

    # 4. submit the login form
    submit_btn = driver.find_element_by_class_name('btn-large')
    submit_btn.click()

    time.sleep(20)

# catch and print any exception raised above
except Exception as e:
    print(e)

finally:
    driver.close()

 



---恢復內容結束---

相關文章
相關標籤/搜索