Last week's assignment
'''
Scrape movie info from the Douban TOP250 list.

Page urls:
    page 1:  https://movie.douban.com/top250?start=0&filter=
    page 2:  https://movie.douban.com/top250?start=25&filter=
    page 3:  https://movie.douban.com/top250?start=50&filter=
    page 4:  https://movie.douban.com/top250?start=75&filter=
    page 10: https://movie.douban.com/top250?start=225&filter=

Request method: GET
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re pattern (captures: detail-page url, image link, title, director & cast, release date, rating, rating count, synopsis):
<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>
'''
import requests
import re


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
base_url = 'https://movie.douban.com/top250?start={}&filter='

n = 0
for line in range(10):
    url = base_url.format(n)
    n += 25
    print(url)

    # 1. Send a GET request to the Douban TOP250 page.
    response = requests.get(url, headers=headers)
    # print(response.text)

    # 2. Extract the data with a regular expression:
    #    detail-page url, image link, title, director & cast, release date, rating, rating count, synopsis
    movie_content_list = re.findall(
        # regex pattern
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',

        # text to parse
        response.text,

        # match mode: let . match newlines as well
        re.S)

    for movie_content in movie_content_list:
        # unpack one movie per tuple
        detail_url, movie_jpg, name, director, release_date, rating, num, desc = movie_content
        data = f'title: {name}, detail url: {detail_url}, image url: {movie_jpg}, director: {director}, release date: {release_date}, rating: {rating}, rating count: {num}, synopsis: {desc}\n'
        print(data)

        # 3. Save the data: append each movie's info to a file.
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
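A note on the save step: appending formatted strings to douban.txt works, but the same tuples are easier to reuse if they go into a CSV. A minimal sketch, assuming movie_content_list holds the eight-field tuples produced by the re.findall call above (the save_as_csv helper and the file name are mine, not part of the original script):

import csv

def save_as_csv(movie_content_list, path='douban.csv'):
    # one row per movie, same field order as the regex capture groups
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for row in movie_content_list:
            writer.writerow(row)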
POST requests with requests
'''
Log in to GitHub with a POST request.
'''
import requests
import re

# 1. GET the login page to obtain the token and cookies.
'''
Request url:
    https://github.com/login
Request method:
    GET
Response headers:
    Set-Cookie
Request headers:
    Cookie
    User-Agent
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

response = requests.get(url='https://github.com/login', headers=headers)
# print(response.text)
# convert the cookies returned by the login page into a dict
login_cookies = response.cookies.get_dict()

authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]

print(authenticity_token)


# 2. POST to the session url.
'''
Request url:
    https://github.com/session

Request method:
    POST

Request headers:
    # where the previous request came from
    Referer: https://github.com/login
    Cookie: ...
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

Request body (only POST requests carry one):
    commit: Sign in
    utf8: ✓
    authenticity_token:
        VX79esFc0YPdR1UFzUM/6MTRZOlYQ0btF5k2/x7uZea0x2E6W4bmRpwHsaCBN+096PaWNkcQjJOsyUzUqsAhIw==
        LLWlTr0qLcYC74hn7OI7IlyeB9rZei9737Lqtzz0sKTgY7Js7pUUhZ6bNC6lCkS+OHfVukkbTejjd0BnjPvGUg==
    login: tankjam1
    password: *****
    webauthn-support: unsupported
'''
# assemble the request headers
headers2 = {
    'Referer': 'https://github.com/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
}

# assemble the request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "tankjam",
    "password": "kermit46709394",
    "webauthn-support": "unsupported",
}

# POST to the session address, carrying the headers, the body
# and the cookies from the login page
response2 = requests.post(url='https://github.com/session', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
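Carrying login_cookies by hand works, but requests also ships a Session object that remembers cookies between requests. A minimal sketch of the same two-step login rewritten with a session (same urls and form fields as above; whether GitHub still accepts this exact flow is not guaranteed):

import requests
import re

session = requests.Session()  # cookies set by one request are sent with the next
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'

# step 1: GET the login page; the session keeps the Set-Cookie values itself
resp = session.get('https://github.com/login')
token = re.findall('name="authenticity_token" value="(.*?)"', resp.text, re.S)[0]

# step 2: POST the form; no explicit cookies= argument is needed
form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'tankjam',
    'password': '******',  # placeholder
    'webauthn-support': 'unsupported',
}
resp2 = session.post('https://github.com/session', data=form_data,
                     headers={'Referer': 'https://github.com/login'})
print(resp2.status_code)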
The requests response object
# import requests
#
# response = requests.get('https://baidu.com')
# # the response object
# print(response.status_code)         # response status code
# print(response.url)                 # the url
# print(response.encoding)            # character encoding
# response.encoding = 'utf-8'
# print(response.text)                # body as text
# print(response.content)             # body as raw bytes
# print(response.headers)             # response headers
# print(response.history)             # redirect history
# print(response.cookies)             # cookies, as a cookies object
# print(response.cookies.get_dict())  # cookies converted to a dict
# print(response.cookies.items())     # cookies as a list of (name, value) tuples
# print(response.encoding)
# print(response.elapsed)             # time taken by the request

# import requests
# # send a GET request to a video url
# url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
# response = requests.get(url, stream=True)  # stream=True turns content into an iterator
# print(response.content)
#
# with open('love_for_GD.mp4', 'wb') as f:
#     for content in response.iter_content():
#         f.write(content)
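One caveat on the video example: iter_content() with no argument yields the body one byte at a time, which is painfully slow for a file of any size. A minimal sketch reading in larger chunks (the 1 MB chunk size is an arbitrary choice):

import requests

url = '...'  # the mp4 url from the block above
response = requests.get(url, stream=True)  # don't load the whole body into memory

with open('love_for_GD.mp4', 'wb') as f:
    # 1 MB per iteration instead of one byte
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        if chunk:
            f.write(chunk)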
Advanced requests usage
'''
Certificate verification (most sites are https)
'''
import requests
# For an ssl request the certificate is checked first; if it is invalid,
# an error is raised and the program terminates.
# response = requests.get('https://www.xiaohuar.com')
# print(response.status_code)

# Improvement 1: no more error, but a warning is still printed
# import requests
# response = requests.get('https://www.xiaohuar.com', verify=False)
# # skip certificate verification; warns, but returns 200
# print(response.status_code)

# Improvement 2: suppress both the error and the warning
# import requests
# import urllib3
# urllib3.disable_warnings()  # silence the warning
# response = requests.get('https://www.xiaohuar.com', verify=False)
# print(response.status_code)

# Improvement 3: attach a certificate
# Many https sites can be visited without a certificate; in most cases
# carrying one is optional (Zhihu, Baidu, ...).
# Some sites enforce it: only designated users who hold the certificate
# get access to the site.
# import requests
# import urllib3
# # urllib3.disable_warnings()  # silence the warning
# # pseudocode
# response = requests.get(
#     'https://www.xiaohuar.com',
#     # verify=False,
#     # /path/server.crt is the certificate, /path/key is the private key
#     cert=('/path/server.crt', '/path/key'))
# print(response.status_code)


'''
Timeout settings
'''

# Two forms of timeout: float or tuple
# timeout=0.1         # timeout for receiving data
# timeout=(0.1, 0.2)  # 0.1 = connect timeout, 0.2 = read timeout

# import requests
# response = requests.get('https://www.baidu.com',
#                         timeout=0.0001)
# # print(response.elapsed)
# print(response.status_code)

'''
Proxy settings: the request is sent to the proxy first, and the proxy forwards
it for you (getting an ip banned is a common thing)
'''
# import requests
# proxies = {
#     # a proxy with username and password; user:password comes before the @
#     'http': 'http://tank:123@localhost:9527',
#     # note: dict keys must be unique, so keep only one entry per scheme
#     # 'http': 'http://localhost:9527',
#     'https': 'https://localhost:9527',
# }
# response = requests.get('https://www.12306.cn',
#                         proxies=proxies)
#
# print(response.status_code)

'''
Scraping free proxies from xicidaili:
1. Visit the xici free-proxy page.
2. Parse out all the proxies with the re module.
3. Test each scraped proxy against an ip-test site.
4. If test_ip raises an exception the proxy is dead; otherwise it works.
5. Use a working proxy for the real request.

Sample row:
<tr class="odd">
  <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
  <td>112.85.131.99</td>
  <td>9999</td>
  <td>
    <a href="/2019-05-09/jiangsu">江蘇南通</a>
  </td>
  <td class="country">高匿</td>
  <td>HTTPS</td>
  <td class="country">
    <div title="0.144秒" class="bar">
      <div class="bar_inner fast" style="width:88%">
      </div>
    </div>
  </td>
  <td class="country">
    <div title="0.028秒" class="bar">
      <div class="bar_inner fast" style="width:97%">
      </div>
    </div>
  </td>
  <td>6天</td>
  <td>19-05-16 11:20</td>
</tr>

re:
<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>
'''
# import requests
# import re
# import time
#
# HEADERS = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
# }
#
#
# def get_index(url):
#     time.sleep(1)
#     response = requests.get(url, headers=HEADERS)
#     return response
#
#
# def parse_index(text):
#     ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
#     for ip_port in ip_list:
#         ip = ':'.join(ip_port)
#         yield ip
#
#
# def test_ip(ip):
#     print('testing ip: %s' % ip)
#     try:
#         proxies = {
#             'https': ip
#         }
#
#         # an ip-test site
#         ip_url = 'https://www.ipip.net/'
#
#         # visit the test site through the proxy; if it returns 200,
#         # the proxy under test is working
#         response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
#
#         if response.status_code == 200:
#             print(f'working ip: {ip}')
#             return ip
#
#     # a dead proxy raises an exception
#     except Exception as e:
#         print(e)
#
# # scrape the NBA site through a working proxy
# def spider_nba(good_ip):
#     url = 'https://china.nba.com/'
#
#     proxies = {
#         'https': good_ip
#     }
#
#     response = requests.get(url, headers=HEADERS, proxies=proxies)
#     print(response.status_code)
#     print(response.text)
#
#
# if __name__ == '__main__':
#     base_url = 'https://www.xicidaili.com/nn/{}'
#
#     for line in range(1, 3677):
#         ip_url = base_url.format(line)
#
#         response = get_index(ip_url)
#
#         # parse the xici page into a list of ips
#         ip_list = parse_index(response.text)
#
#         # loop over every ip
#         for ip in ip_list:
#             # print(ip)
#
#             # test each scraped ip
#             good_ip = test_ip(ip)
#
#             if good_ip:
#                 # a working proxy: start the real scrape
#                 spider_nba(good_ip)


'''
Authentication
'''
import requests
# tested against the github api
url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Test 1: without credentials, returns 401
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
'''
Output:
{
    "message": "Requires authentication",
    "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
}
'''

# Test 2: authenticate with HTTPBasicAuth from requests.auth;
# on success the user info is returned
# from requests.auth import HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
# print(response.text)

# Test 3: the auth parameter of requests.get defaults to HTTPBasicAuth,
# so a plain tuple works too; on success the user info is returned
# response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
# print(response.text)


'''
Uploading files
'''
import requests

# upload a text file
# files1 = {'file': open('user.txt', 'rb')}
# # files is the standard POST parameter for uploads
# response = requests.post('http://httpbin.org/post', files=files1)
# print(response.status_code)  # 200
# print(response.text)         # echoes the uploaded data back

# upload an image file
# files2 = {'jpg': open('一拳.jpg', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files2)
# print(response.status_code)  # 200
# print(response.text)

# upload a video file
# files3 = {'movie': open('love_for_GD.mp4', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files3)
# print(response.status_code)  # 200
# print(response.text)
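Timeouts, dead proxies, and certificate problems all surface as exceptions, and test_ip above swallows them with a bare except Exception. A minimal retry sketch that catches the specific requests exception classes instead (the retry count and timeout values are arbitrary choices):

import requests

def get_with_retry(url, retries=3, timeout=(3, 10), **kwargs):
    # try the GET up to `retries` times; (3, 10) = connect/read timeout in seconds
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, timeout=timeout, **kwargs)
        except (requests.exceptions.ConnectTimeout,
                requests.exceptions.ReadTimeout,
                requests.exceptions.ProxyError) as e:
            print(f'attempt {attempt} failed: {e}')
    return None

If you want a single catch-all clause, requests.exceptions.RequestException is the common base class of all exceptions raised by requests.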
Basic selenium usage
'''
The selenium module

1. What is selenium?
    Originally an automated testing tool. It drives a browser and performs
    user-defined actions in it, e.g. running JS on a page or skipping a
    login flow, so it can also be used for scraping.

2. Why use selenium?
    Pros:
        Logging in with the requests module means reverse-engineering a lot
        of complicated traffic; selenium lets you skip that work easily.
    Cons:
        The browser loads css, js, images, video, ... so scraping is much
        slower than with the requests module.

3. How to use selenium?
    Install the module:
        pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
    Download the browser driver:
        http://npm.taobao.org/mirrors/chromedriver/2.38/
'''

# first contact with selenium
from selenium import webdriver  # drives the browser

# an action-chain object; used e.g. to drag images when cracking slider captchas
from selenium.webdriver import ActionChains

# how to look elements up: By.ID, By.CSS_SELECTOR, By.CLASS_NAME, ...
from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys  # keyboard keys

# used together with WebDriverWait below; EC is an alias for expected_conditions
from selenium.webdriver.support import expected_conditions as EC

# wait for certain elements on the page to load
from selenium.webdriver.support.wait import WebDriverWait
import time

# open Chrome through the Chrome driver
# webdriver.Chrome(r'absolute path to chromedriver.exe')
# chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

# here chromedriver.exe lives in the Scripts folder of the python
# interpreter, so no path argument is needed

# chrome is a driver object
chrome = webdriver.Chrome()

'''
Example 1
'''
# try:
#     # send a GET request to tank's blog home page
#     # chrome.get('https://www.cnblogs.com/kermitjam/')
#
#     # arg 1: driver object  arg 2: wait time in seconds
#     wait = WebDriverWait(chrome, 10)
#
#     # 1. visit Baidu
#     chrome.get('https://www.baidu.com/')
#
#     # 2. find the input box
#     input_tag = wait.until(
#         # call EC's presence_of_element_located()
#         EC.presence_of_element_located(
#             # takes a tuple:
#             # arg 1: how to look the element up
#             # arg 2: the attribute value
#             (By.ID, "kw")
#         )
#     )
#     input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))
#
#     # 3. search for One-Punch Man
#     input_tag.send_keys('一拳超人')
#
#     # 4. press the Enter key
#     input_tag.send_keys(Keys.ENTER)
#
#     time.sleep(3)
#
# # the browser is closed no matter what happens
# finally:
#     # close the browser
#     chrome.close()


'''
Example 2
'''
try:
    # send a GET request to tank's blog home page
    # chrome.get('https://www.cnblogs.com/kermitjam/')

    # arg 1: driver object  arg 2: wait time in seconds
    wait = WebDriverWait(chrome, 10)

    # 1. visit the JD home page
    chrome.get('https://www.jd.com/')

    # 2. find the input box
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))

    # 3. search for "唐詩三百首" (Three Hundred Tang Poems)
    input_tag.send_keys('唐詩三百首')

    # 4. find the button by its class name
    search_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))
    # 5. click the search button
    search_button.click()

    time.sleep(3)

# the browser is closed no matter what happens
finally:
    # close the browser
    chrome.close()
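Since the drawback of selenium is all the page resources the browser loads, it often pays to run Chrome without a visible window. A minimal sketch using ChromeOptions and the standard --headless switch (the options= keyword assumes a reasonably recent selenium 3 release; older ones spell it chrome_options=):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')     # no visible browser window
options.add_argument('--disable-gpu')  # commonly paired with headless on Windows

chrome = webdriver.Chrome(options=options)
try:
    chrome.get('https://www.jd.com/')
    print(chrome.title)  # proves the page loaded without any UI
finally:
    chrome.close()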
selenium basic selectors
# from selenium import webdriver  # drives the browser
# import time
#
# '''
# Implicit waits
# '''
# # get a driver object
# driver = webdriver.Chrome()
#
# try:
#     # explicit wait: wait for one particular element to load
#     # arg 1: driver object  arg 2: wait time in seconds
#     # wait = WebDriverWait(chrome, 10)
#
#     driver.get('https://china.nba.com/')
#
#     # implicit wait: wait for all elements on the page to load
#     driver.implicitly_wait(10)
#     news_tag = driver.find_element_by_class_name('nav-news')
#     # the element object
#     print(news_tag)
#     # the element's tag name
#     print(news_tag.tag_name)
#
#     time.sleep(10)
#
# finally:
#     driver.close()


from selenium import webdriver  # drives the browser
import time

'''
=============== all the selector methods ===================
element finds one element
elements finds all matching elements

1. find_element_by_link_text          find by exact link text
2. find_element_by_id                 find by id
3. find_element_by_class_name         find by class name
4. find_element_by_partial_link_text  find by partial link text
5. find_element_by_name               find by name attribute
6. find_element_by_css_selector       find by css selector
7. find_element_by_tag_name           find by tag name
'''
# get a driver object
driver = webdriver.Chrome()

try:

    # send a request to Baidu
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # 1. find_element_by_link_text: find an <a> by its exact text
    # find the login link
    # send_tag = driver.find_element_by_link_text('登录')
    # send_tag.click()

    # 2. find_element_by_partial_link_text: find an <a> by part of its text
    login_button = driver.find_element_by_partial_link_text('登')
    login_button.click()
    time.sleep(1)

    # 3. find_element_by_class_name: find by class attribute
    login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. find_element_by_name: find by name attribute
    username = driver.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. find_element_by_id: find by id attribute
    password = driver.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. find_element_by_css_selector: find with a css selector
    # find the login button by its id
    login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    # driver.find_element_by_css_selector('.pass-button-submit')
    login_submit.click()

    # 7. find_element_by_tag_name: find by tag name
    div = driver.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)

finally:
    driver.close()
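Every call above uses the singular find_element_*, which returns the first match (and raises if there is none). Each method also has a plural find_elements_* twin that returns a list. A minimal sketch collecting the visible links on the Baidu home page (what the page actually contains is an assumption, not a guarantee):

from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # plural form: returns a (possibly empty) list instead of one element
    links = driver.find_elements_by_tag_name('a')
    for link in links:
        if link.text:  # skip anchors without visible text
            print(link.text, link.get_attribute('href'))
finally:
    driver.close()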