day5

Last week's homework: crawl the Douban Top 250

'''
Crawl movie info from the Douban Top 250

Main pages:
    page 1:
        https://movie.douban.com/top250?start=0&filter=
    page 2:
        https://movie.douban.com/top250?start=25&filter=
    page 3:
        https://movie.douban.com/top250?start=50&filter=
    page 4:
        https://movie.douban.com/top250?start=75&filter=
    page 10:
        https://movie.douban.com/top250?start=225&filter=

    Request method: GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re pattern:
    # detail-page url, poster url, title, director & cast, release info, rating, number of raters, one-line summary
    <div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?導演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人評價.*?<span class="inq">(.*?)</span>
'''
import requests
import re


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
base_url = 'https://movie.douban.com/top250?start={}&filter='

# the start parameter grows by 25 per page: 0, 25, 50, ... 225
n = 0
for page in range(10):
    url = base_url.format(n)
    n += 25
    print(url)

    # 1. Send a GET request to the Douban Top 250 page and collect the response
    response = requests.get(url, headers=headers)

    # print(response.text)

    # 2. Parse the response with a regular expression
    # detail-page url, poster url, title, director & cast, release info, rating, number of raters, summary
    movie_content_list = re.findall(
        # the regex pattern; the capture groups are listed in the comment above
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?導演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人評價.*?<span class="inq">(.*?)</span>',

        # the text to parse
        response.text,

        # matching mode: re.S lets . match newlines too
        re.S)

    for movie_content in movie_content_list:
        # unpack the fields of each movie
        detail_url, movie_jpg, name, director, release_info, point, num, desc = movie_content
        data = f'title: {name}, detail url: {detail_url}, poster url: {movie_jpg}, director: {director}, release info: {release_info}, rating: {point}, raters: {num}, summary: {desc}\n'
        print(data)

        # 3. Save the data: append each movie's info to a file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
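
As a variation, requests can build the query string itself through its params argument, so the start value never needs to be spliced into the URL by hand. A minimal sketch of the same pagination (same headers as above):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

for page in range(10):
    # requests url-encodes the dict and appends it as ?start=...&filter=
    response = requests.get('https://movie.douban.com/top250',
                            params={'start': page * 25, 'filter': ''},
                            headers=headers)
    print(response.url)  # e.g. https://movie.douban.com/top250?start=25&filter=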

 

requests: POST requests

'''
Log in to GitHub with a POST request
'''
import requests
import re

# 1. Request the login page to obtain the token
'''
Request url:
    https://github.com/login
Request method:
    GET
Response headers:
    Set-Cookie
Request headers:
    Cookie
    User-Agent
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

response = requests.get(url='https://github.com/login', headers=headers)
# print(response.text)
# convert the cookies returned by the login page into a dict
login_cookies = response.cookies.get_dict()

# pull the CSRF token out of the hidden form field
authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]

print(authenticity_token)


# 2. Send a POST request to the session url
'''
Request url:
    https://github.com/session

Request method:
    POST

Request headers:
    # where the previous request came from
    Referer: https://github.com/login
    Cookie: ...
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

Request body:
    only POST requests carry a body.
    commit: Sign in
    utf8: ✓
    authenticity_token:
    VX79esFc0YPdR1UFzUM/6MTRZOlYQ0btF5k2/x7uZea0x2E6W4bmRpwHsaCBN+096PaWNkcQjJOsyUzUqsAhIw==
    LLWlTr0qLcYC74hn7OI7IlyeB9rZei9737Lqtzz0sKTgY7Js7pUUhZ6bNC6lCkS+OHfVukkbTejjd0BnjPvGUg==
    login: tankjam1
    password: *****
    webauthn-support: unsupported
'''
# assemble the request headers
headers2 = {
    'Referer': 'https://github.com/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
}

# assemble the request body
form_data = {
    "commit": "Sign in",
    "utf8": "",
    "authenticity_token": authenticity_token,
    "login": "tankjam",
    "password": "kermit46709394",
    "webauthn-support": "unsupported",
}

# POST to the session address,
# carrying the headers, the body, and the cookies from the login page
response2 = requests.post(url='https://github.com/session', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
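
The same two-step flow is tidier with requests.Session, which remembers the cookies from every response and sends them on the next request, so the get_dict()/cookies= plumbing above goes away. A sketch along those lines (credentials are placeholders):

import requests
import re

session = requests.Session()  # one cookie jar shared by every request below

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# step 1: GET the login page; the session stores the Set-Cookie values itself
login_page = session.get('https://github.com/login', headers=headers)
token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />',
                   login_page.text, re.S)[0]

# step 2: POST the form; the cookies from step 1 ride along automatically
form_data = {
    'commit': 'Sign in',
    'utf8': '',
    'authenticity_token': token,
    'login': 'tankjam',
    'password': '*****',  # placeholder credentials
    'webauthn-support': 'unsupported',
}
response = session.post('https://github.com/session', data=form_data,
                        headers={**headers, 'Referer': 'https://github.com/login'})
print(response.status_code)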



requests responses
# import requests
#
# response = requests.get('https://baidu.com')
# # response attributes
# print(response.status_code)  # response status code
# print(response.url)  # the final url
# print(response.encoding)  # character encoding
# response.encoding = 'utf-8'
# print(response.text)  # body as text
# print(response.content)  # body as raw bytes
# print(response.headers)  # response headers
# print(response.history)  # redirect history
# # cookies: 1. as a cookies object  2. as a dict
# print(response.cookies)  # the cookies object
# print(response.cookies.get_dict())  # cookies converted to a dict
# print(response.cookies.items())  # cookies as a list of (name, value) pairs
# print(response.encoding)
# print(response.elapsed)  # time the request took

# import requests
# # send a GET request to a media address
# url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
# response = requests.get(url, stream=True)  # stream=True defers downloading the body until it is read
# print(response.content)
#
# with open('love_for_GD.mp4', 'wb') as f:
#     for content in response.iter_content():
#         f.write(content)
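
iter_content() with no arguments yields the body one byte at a time; giving it a chunk_size reads fixed-size blocks instead, which is the usual way to save a large file without holding it all in memory. A sketch reusing the mp4 url above:

import requests

url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
response = requests.get(url, stream=True)  # nothing is downloaded yet

with open('love_for_GD.mp4', 'wb') as f:
    # read 1 KB at a time instead of byte by byte
    for chunk in response.iter_content(chunk_size=1024):
        f.write(chunk)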




Advanced requests usage
'''
Certificate verification (most sites are https)
'''
import requests
# # for an ssl request, the certificate is checked first; if it is invalid,
# # an error is raised and the program stops
# response = requests.get('https://www.xiaohuar.com')
# print(response.status_code)

# improvement 1: suppress the error, though a warning is still printed
# import requests
# response = requests.get('https://www.xiaohuar.com', verify=False)
# # skips certificate verification; prints a warning and returns 200
# print(response.status_code)

# improvement 2: suppress both the error and the warning
# import requests
# import urllib3
# urllib3.disable_warnings()  # silence the warning
# response = requests.get('https://www.xiaohuar.com', verify=False)
# print(response.status_code)

# improvement 3: supply a client certificate
# many https sites can be visited with or without a certificate;
# Zhihu, Baidu and the like work either way.
# some sites make it mandatory: only designated users who hold the
# certificate are allowed in.
# import requests
# import urllib3
# # urllib3.disable_warnings()  # silence the warning
# # pseudocode
# response = requests.get(
#     'https://www.xiaohuar.com',
#     # verify=False,
#     # paths to the client certificate and its key
#     cert=('/path/server.crt', '/path/key'))
# print(response.status_code)


'''
Timeout settings
'''

# two timeout forms: a float or a tuple
# timeout=0.1  # a single float covers both connecting and reading
# timeout=(0.1, 0.2)  # 0.1 is the connect timeout, 0.2 the read timeout

# import requests
# response = requests.get('https://www.baidu.com',
#                         timeout=0.0001)
# # print(response.elapsed)
# print(response.status_code)
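
When the limit is exceeded, requests raises requests.exceptions.Timeout (covering both connect and read timeouts), so real code wraps the call in try/except. A minimal sketch:

import requests

try:
    response = requests.get('https://www.baidu.com', timeout=(0.1, 0.2))
    print(response.status_code)
except requests.exceptions.Timeout:
    # raised when either the connect or the read phase exceeds its limit
    print('request timed out')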

'''
Proxy settings: the request goes to the proxy first, and the proxy forwards it
for us (getting an ip banned is a common occurrence)
'''
# import requests
# proxies = {
#     # a proxy with credentials: the part before the @ is user:password
#     # 'http': 'http://tank:123@localhost:9527',
#     # note: a dict cannot hold the 'http' key twice, the later entry would win
#     'http': 'http://localhost:9527',
#     'https': 'https://localhost:9527',
# }
# response = requests.get('https://www.12306.cn',
#                         proxies=proxies)
#
# print(response.status_code)
'''
Crawl free proxies from xicidaili:
    1. visit the xici free-proxy pages
    2. parse out every proxy with the re module
    3. test each scraped proxy against an ip-test site
    4. if test_ip raises an exception the proxy is dead, otherwise it works
    5. use a working proxy to make a proxied request

<tr class="odd">
      <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
      <td>112.85.131.99</td>
      <td>9999</td>
      <td>
        <a href="/2019-05-09/jiangsu">江蘇南通</a>
      </td>
      <td class="country">高匿</td>
      <td>HTTPS</td>
      <td class="country">
        <div title="0.144秒" class="bar">
          <div class="bar_inner fast" style="width:88%">

          </div>
        </div>
      </td>
      <td class="country">
        <div title="0.028秒" class="bar">
          <div class="bar_inner fast" style="width:97%">

          </div>
        </div>
      </td>

      <td>6天</td>
      <td>19-05-16 11:20</td>
    </tr>
re:
    <tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>

'''
# import requests
# import re
# import time
#
# HEADERS = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
# }
#
#
# def get_index(url):
#     time.sleep(1)
#     response = requests.get(url, headers=HEADERS)
#     return response
#
#
# def parse_index(text):
#     ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
#     for ip_port in ip_list:
#         ip = ':'.join(ip_port)
#         yield ip
#
#
# def test_ip(ip):
#     print('testing ip: %s' % ip)
#     try:
#         proxies = {
#             'https': ip
#         }
#
#         # the ip-test site
#         ip_url = 'https://www.ipip.net/'
#
#         # visit the test site through the proxy; a 200 response
#         # means the proxy under test works
#         response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
#
#         if response.status_code == 200:
#             print(f'working ip: {ip}')
#             return ip
#
#     # a dead proxy raises an exception
#     except Exception as e:
#         print(e)
#
#
# # crawl the NBA site through the proxy
# def spider_nba(good_ip):
#     url = 'https://china.nba.com/'
#
#     proxies = {
#         'https': good_ip
#     }
#
#     response = requests.get(url, headers=HEADERS, proxies=proxies)
#     print(response.status_code)
#     print(response.text)
#
#
# if __name__ == '__main__':
#     base_url = 'https://www.xicidaili.com/nn/{}'
#
#     for line in range(1, 3677):
#         ip_url = base_url.format(line)
#
#         response = get_index(ip_url)
#
#         # parse this xici page into a list of ips
#         ip_list = parse_index(response.text)
#
#         # loop over every ip
#         for ip in ip_list:
#             # print(ip)
#
#             # test each scraped ip
#             good_ip = test_ip(ip)
#
#             if good_ip:
#                 # the proxy works, use it for the real request
#                 spider_nba(good_ip)


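Testing thousands of proxies one by one is slow; since each test_ip call mostly waits on the network, a thread pool overlaps them nicely. A sketch using the standard library, assuming the test_ip function from the (commented-out) listing above and an iterable of ip:port strings from parse_index:

from concurrent.futures import ThreadPoolExecutor

def test_all(ips, workers=20):
    good = []
    # run up to `workers` test_ip calls at once; map preserves input order
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for result in pool.map(test_ip, ips):
            if result:
                good.append(result)
    return good
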
'''
Authentication
'''
import requests
# tested against the GitHub API
url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# test 1: without credentials the request fails with 401
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
'''
printed result:
    {
      "message": "Requires authentication",
      "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
    }
'''

# test 2: authenticate with HTTPBasicAuth from requests.auth;
# on success the user info is returned
# from requests.auth import HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
# print(response.text)

# test 3: the auth parameter of requests.get defaults to HTTPBasicAuth,
# so a plain tuple works the same way
# response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
# print(response.text)
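
Worth noting: GitHub has since retired password-based Basic Auth for its API, so today the same endpoint is called with a personal access token in the Authorization header. A sketch, where the token value is a placeholder:

import requests

TOKEN = 'ghp_xxxxxxxxxxxxxxxx'  # placeholder personal access token
response = requests.get(
    'https://api.github.com/user',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Authorization': 'token ' + TOKEN,
    })
print(response.status_code)  # 200 once the token is valid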

'''
Uploading files
'''
import requests

# upload a text file
# files1 = {'file': open('user.txt', 'rb')}
# # files is the dedicated parameter for multipart uploads
# response = requests.post('http://httpbin.org/post', files=files1)
# print(response.status_code)  # 200
# print(response.text)

# upload an image file
# files2 = {'jpg': open('一拳.jpg', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files2)
# print(response.status_code)  # 200
# print(response.text)
#
# upload a video file
# files3 = {'movie': open('love_for_GD.mp4', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files3)
# print(response.status_code)  # 200
# print(response.text)
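
requests also accepts a tuple as the dict value, read as (filename, fileobj, content_type), for when the reported name or MIME type matters. A brief sketch:

import requests

# set the filename and MIME type explicitly instead of letting requests guess
files = {'file': ('one_punch.jpg', open('一拳.jpg', 'rb'), 'image/jpeg')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)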





Getting started with selenium
'''
The selenium module
1. What is selenium?
    Originally an automated-testing tool. It drives a real browser for us
    and performs whatever actions we script, e.g. running JS in the page or
    getting past login flows. We can use selenium to build crawlers.

2. Why use selenium?
    1. Advantage:
        logging in with the requests module means reverse-engineering a lot
        of complicated traffic; selenium sidesteps that entirely.

    2. Drawback:
        the browser loads css, js, images, video and so on, so crawling is
        slower than with the requests module.

3. How to use selenium?
    Install the selenium module:
        pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
    Download the browser driver:
        http://npm.taobao.org/mirrors/chromedriver/2.38/
'''

# first contact with selenium
from selenium import webdriver  # drives the browser

# gives an action-chain object; used e.g. to drag images when
# cracking slider captchas
from selenium.webdriver import ActionChains

# lookup strategies: By.ID, By.CSS_SELECTOR, By.CLASS_NAME, ...
from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys  # keyboard keys

# used together with WebDriverWait below; EC is short for expected_conditions
from selenium.webdriver.support import expected_conditions as EC

# waits for elements on the page to load
from selenium.webdriver.support.wait import WebDriverWait
import time

# open Chrome through the Chrome driver
# webdriver.Chrome(r'absolute path to chromedriver.exe')
# chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

# with no argument, chromedriver.exe is looked up on PATH,
# e.g. in the Scripts folder of the Python interpreter

# chrome is a driver object
chrome = webdriver.Chrome()

'''
Example 1
'''
# try:
#     # GET tank's blog home page
#     # chrome.get('https://www.cnblogs.com/kermitjam/')
#
#     # arg 1: the driver object  arg 2: how long to wait
#     wait = WebDriverWait(chrome, 10)
#
#     # 1. visit Baidu
#     chrome.get('https://www.baidu.com/')
#
#     # 2. find the input box
#     input_tag = wait.until(
#         # call EC's presence_of_element_located()
#         EC.presence_of_element_located(
#             # takes a tuple:
#             # element 1: the lookup strategy
#             # element 2: the value to look up
#             (By.ID, "kw")
#         )
#     )
#
#     # 3. search for One-Punch Man
#     input_tag.send_keys('一拳超人')
#
#     # 4. press the Enter key
#     input_tag.send_keys(Keys.ENTER)
#
#     time.sleep(3)
#
# # close the browser no matter what happens
# finally:
#     chrome.close()


'''
Example 2
'''
try:
    # arg 1: the driver object  arg 2: how long to wait
    wait = WebDriverWait(chrome, 10)

    # 1. visit the JD home page
    chrome.get('https://www.jd.com/')

    # 2. find the input box
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))

    # 3. search for "Three Hundred Tang Poems"
    input_tag.send_keys('唐詩三百首')

    # 4. find the search button by its class name
    search_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))
    # 5. click the search button
    search_button.click()

    time.sleep(3)

# close the browser no matter what happens
finally:
    # close the browser
    chrome.close()
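
Since the notes list page rendering as selenium's main cost, it helps to know Chrome can also run headless, i.e. without opening a window. A short sketch using ChromeOptions:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # no visible browser window
chrome = webdriver.Chrome(options=options)

chrome.get('https://www.baidu.com/')
print(chrome.title)  # the page is still fully rendered, just off-screen
chrome.close()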

 

selenium: basic selectors

# from selenium import webdriver  # drives the browser
# import time
#
# '''
# Implicit waits
# '''
# # get a driver object
# driver = webdriver.Chrome()
#
# try:
#     # explicit wait: wait for one specific element to load
#     # arg 1: the driver object  arg 2: how long to wait
#     # wait = WebDriverWait(driver, 10)
#
#     driver.get('https://china.nba.com/')
#
#     # implicit wait: wait for all elements on the page to load
#     driver.implicitly_wait(10)
#     news_tag = driver.find_element_by_class_name('nav-news')
#     # the element object
#     print(news_tag)
#     # the element's tag name
#     print(news_tag.tag_name)
#
#
#     time.sleep(10)
#
# finally:
#     driver.close()


from selenium import webdriver  # drives the browser
import time

'''
===============all the lookup methods===================
    element finds one element
    elements finds all matching elements

    1. find_element_by_link_text  find by the link's full text
    2. find_element_by_id  find by id
    3. find_element_by_class_name
    4. find_element_by_partial_link_text
    5. find_element_by_name
    6. find_element_by_css_selector
    7. find_element_by_tag_name
'''
# get a driver object
driver = webdriver.Chrome()

try:

    # request Baidu
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # 1. find_element_by_link_text  find by the link's full text
    # the login link
    # send_tag = driver.find_element_by_link_text('登陸')
    # send_tag.click()

    # 2. find_element_by_partial_link_text  find an a tag by part of its text
    login_button = driver.find_element_by_partial_link_text('登')  # a fragment of the link text
    login_button.click()
    time.sleep(1)

    # 3. find_element_by_class_name  find by class attribute
    login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. find_element_by_name  find by name attribute
    username = driver.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. find_element_by_id  find by id attribute
    password = driver.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. find_element_by_css_selector  find with a css selector
    # the login button, located by id
    login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    # driver.find_element_by_css_selector('.pass-button-submit')
    login_submit.click()

    # 7. find_element_by_tag_name  find by tag name
    div = driver.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)

finally:
    driver.close()
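
One caveat for readers on newer versions: Selenium 4 removed the find_element_by_* helpers in favour of a single find_element(By..., value) call, so the lookups above translate mechanically. A sketch against the Baidu home page:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # same strategies, one method: the strategy is now the first argument
    driver.find_element(By.ID, 'kw')            # was find_element_by_id('kw')
    driver.find_element(By.CSS_SELECTOR, '#kw') # was find_element_by_css_selector('#kw')
    div = driver.find_element(By.TAG_NAME, 'div')
    print(div.tag_name)
finally:
    driver.close()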