import urllib.request
import urllib.parse
import string


def get_params():
    url = "http://www.baidu.com/s?wd="
    params = {
        "wd": "中文",
        "key": "zhang",
        "value": "san",
    }
    str_params = urllib.parse.urlencode(params)
    print(str_params)
    final_url = url + str_params
    # escape the URL containing Chinese characters into a URL the computer can recognize
    end_url = urllib.parse.quote(final_url, safe=string.printable)
    response = urllib.request.urlopen(end_url)
    data = response.read().decode("utf-8")
    print(data)


get_params()
The interpreter's ASCII set has no Chinese characters, so Chinese characters in a URL must be transcoded.
urllib.parse.quote(final_url, safe=string.printable)
urllib.parse.urlencode()
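Roughly speaking, urlencode() builds a percent-encoded query string from a dict, while quote() percent-escapes an existing string. A minimal sketch of the difference:

import urllib.parse
import string

# urlencode(): dict -> "key=value" query string, non-ASCII percent-encoded as UTF-8
print(urllib.parse.urlencode({"wd": "中文"}))
# wd=%E4%B8%AD%E6%96%87

# quote(): escapes a ready-made string; safe=string.printable leaves printable ASCII untouched
print(urllib.parse.quote("http://www.baidu.com/s?wd=中文", safe=string.printable))
# http://www.baidu.com/s?wd=%E4%B8%AD%E6%96%87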
POST requests:
urllib.request.urlopen(url, data=...)  # data is the request body the server receives (must be bytes in Python 3)
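For instance, a minimal POST sketch (httpbin.org is assumed here purely as a test endpoint and is not part of the original example); since urlopen() only accepts bytes for data, the form dict is urlencoded and then encoded to UTF-8:

import urllib.request
import urllib.parse

# form data must be urlencoded and converted to bytes before posting
post_data = urllib.parse.urlencode({"wd": "中文", "key": "zhang"}).encode("utf-8")
# passing data= makes urlopen() issue a POST instead of a GET
response = urllib.request.urlopen("http://httpbin.org/post", data=post_data)
print(response.read().decode("utf-8"))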
(1) Simulate a real browser when sending requests: e.g. batch searches on Baidu; inspect the requests with Chrome's developer tools
(2) request.add_header() dynamically adds header data
(3) Response headers: response.headers
(4) Create the request object: urllib.request.Request(url)
import urllib.request


def load_baidu():
    url = "http://www.baidu.com/"
    response = urllib.request.urlopen(url)
    print(response)
    # response headers
    print(response.headers)


load_baidu()
E:\python\python.exe H:/code/Python爬蟲/Day02/02-request_header.py
<http.client.HTTPResponse object at 0x000001F64CC88CA0>
Bdpagetype: 1
Bdqid: 0x9829fa7c000a56cf
Cache-Control: private
Content-Type: text/html;charset=utf-8
Date: Tue, 26 Jan 2021 06:35:11 GMT
Expires: Tue, 26 Jan 2021 06:34:11 GMT
P3p: CP=" OTI DSP COR IVA OUR IND COM "
P3p: CP=" OTI DSP COR IVA OUR IND COM "
Server: BWS/1.1
Set-Cookie: BAIDUID=A276C955F91E3B32F4D56ADC1EE37C59:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BIDUPSID=A276C955F91E3B32F4D56ADC1EE37C59; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: PSTM=1611642911; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BAIDUID=A276C955F91E3B3283913B84A5B12CFA:FG=1; max-age=31536000; expires=Wed, 26-Jan-22 06:35:11 GMT; domain=.baidu.com; path=/; version=1; comment=bd
Set-Cookie: BDSVRTM=0; path=/
Set-Cookie: BD_HOME=1; path=/
Set-Cookie: H_PS_PSSID=33425_33507_33437_33257_33273_31253_33395_33398_33321_33265; path=/; domain=.baidu.com
Traceid: 1611642911060665933810964570178293749455
Vary: Accept-Encoding
Vary: Accept-Encoding
X-Ua-Compatible: IE=Edge,chrome=1
Connection: close
Transfer-Encoding: chunked

Process finished with exit code 0
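Single response headers can also be read by name; a small sketch (the header names here are only examples):

import urllib.request

response = urllib.request.urlopen("http://www.baidu.com/")
# response.headers behaves like a mapping, so individual fields can be fetched by name
print(response.headers.get("Content-Type"))
# HTTPResponse also exposes getheader() for the same purpose
print(response.getheader("Server"))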
import urllib.request


def load_baidu():
    url = "http://www.baidu.com/"
    # create the request object
    request = urllib.request.Request(url)
    # request the data over the network
    response = urllib.request.urlopen(request)
    # print(response)
    data = response.read().decode("utf-8")
    # response headers
    # print(response.headers)
    # get the request header information
    request_header = request.headers
    print(request_header)
    with open("02header.html", "w") as f:
        f.write(data)


load_baidu()
E:\python\python.exe H:/code/Python爬蟲/Day02/03-request_header_two.py
{'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36', '3ch0 - nu1l': 's1mpL3'}

Process finished with exit code 0
import urllib.request


def load_baidu():
    url = "http://www.baidu.com/"
    header = {
        # browser version
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36",
        "3cH0 - Nu1L": "s1mpL3",
    }
    # create the request object
    request = urllib.request.Request(url, headers=header)
    # request the data (the headers cannot be added here in urlopen(), which has no such parameter)
    response = urllib.request.urlopen(request)
    data = response.read().decode("utf-8")
    # get the request header information (all headers)
    # request_headers = request.headers
    # print(request_headers)
    # second way to print the header information
    request_headers = request.get_header("User-agent")
    print(request_headers)
    with open("02header.html", "w") as f:
        f.write(data)


load_baidu()
E:\python\python.exe H:/code/Python爬蟲/Day02/03-request_header_two.py
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36

Process finished with exit code 0
Comparing the two return values:
when using the built-in get_header(), the dictionary entry "3cH0 - Nu1L": "s1mpL3" is not returned,
whereas reading request.headers directly returns everything.
In get_header() the first letter of the header name must be uppercase and the rest lowercase; if written all in lowercase, the return value is None.
Code:
import urllib.request


def load_baidu():
    url = "http://www.baidu.com/"
    header = {
        # browser version
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36",
        "3cH0 - Nu1L": "s1mpL3",
    }
    # create the request object
    request = urllib.request.Request(url, headers=header)
    # request the data (the headers cannot be added here in urlopen(), which has no such parameter)
    response = urllib.request.urlopen(request)
    data = response.read().decode("utf-8")
    # get the request header information (all headers)
    # request_headers = request.headers
    # print(request_headers)
    # second way to print the header information -- an all-lowercase key returns None
    request_headers = request.get_header("user-agent")
    print(request_headers)
    with open("02header.html", "w") as f:
        f.write(data)


load_baidu()
Output:
E:\python\python.exe H:/code/Python爬蟲/Day02/03-request_header_two.py
None

Process finished with exit code 0
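The reason is how Request stores header keys: in current CPython versions, the keys are saved via str.capitalize(), so only the first letter stays uppercase, and get_header() looks the key up exactly as you type it. A small sketch:

import urllib.request

request = urllib.request.Request("http://www.baidu.com/",
                                 headers={"User-Agent": "test-agent"})
# keys are stored capitalized, i.e. "User-agent"
print(request.headers)                    # {'User-agent': 'test-agent'}
print(request.get_header("User-agent"))   # test-agent
print(request.get_header("user-agent"))   # None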
import urllib.request


def load_baidu():
    url = "http://www.baidu.com/"
    header = {
        # browser version
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36",
        "3cH0 - Nu1L": "s1mpL3",
    }
    # create the request object
    request = urllib.request.Request(url)
    # dynamically add header information
    request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36")
    # request the data
    response = urllib.request.urlopen(request)
    # print(response)
    data = response.read().decode("utf-8")
    # response headers
    # print(response.headers)
    # get the request header information
    request_header = request.headers
    print(request_header)
    with open("02header.html", "w") as f:
        f.write(data)


load_baidu()
E:\python\python.exe H:/code/Python爬蟲/Day02/03-request_header_two.py
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36

Process finished with exit code 0
import urllib.request


def load_baidu():
    url = "http://www.baidu.com/"
    header = {
        # browser version
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36",
        "3cH0 - Nu1L": "s1mpL3",
    }
    # create the request object
    request = urllib.request.Request(url)
    # dynamically add header information
    request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36")
    # request the data
    response = urllib.request.urlopen(request)
    # print(response)
    data = response.read().decode("utf-8")
    # get the complete URL
    final_url = request.get_full_url()
    print(final_url)


load_baidu()
You need multiple User-Agent strings (just search online for a "user-agent list").
import urllib.request
import random


def load_baidu():
    url = "http://www.baidu.com"
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    ]
    # every request appears to come from a different browser
    random_user_agent = random.choice(user_agent_list)
    request = urllib.request.Request(url)
    # add the corresponding request header (User-Agent)
    request.add_header("User-Agent", random_user_agent)
    # request the data
    response = urllib.request.urlopen(request)
    # get the request header information
    print(request.get_header("User-agent"))


load_baidu()
The User-Agent is picked at random on every run.
(1) Free proxy IPs: short-lived and with a high failure rate
(2) Paid proxy IPs: some of these expire as well
Transparent proxy: the target server knows our real IP.
Anonymous proxy: the target server does not know our real IP, but knows we are using a proxy.
High-anonymity proxy: the target server knows neither our real IP nor that we are using a proxy.
The built-in urlopen() does not support adding a proxy.
You need to create the corresponding handler.
import urllib.request


def handler_openner():
    # the built-in urlopen has no way to add a proxy, so we build that capability ourselves
    # security: SSL (secure sockets layer), third-party CA digital certificates
    # http: port 80
    # https: port 443
    # why urlopen can request data:
    # 1. a handler that does the processing
    # 2. an opener built from it that actually sends the request
    url = "https://www.cnblogs.com/3cH0-Nu1L/"
    # create our own handler
    handler = urllib.request.HTTPHandler()
    # create our own opener
    openner = urllib.request.build_opener(handler)
    # use our own opener's open() to request the data
    response = openner.open(url)
    data = response.read()
    print(response)
    print(data)


handler_openner()
HTTPHandler() cannot add a proxy.
import urllib.request


def create_proxy_handler():
    url = "https://www.cnblogs.com/3cH0-Nu1L/"
    # add a proxy
    proxy = {
        # the free-proxy form: no username or password
        "http": "104.131.109.66:8080"
    }
    # proxy handler
    proxy_handler = urllib.request.ProxyHandler(proxy)
    # create our own opener
    openner = urllib.request.build_opener(proxy_handler)
    # send the request through the proxy IP
    data = openner.open(url).read()
    print(data)


create_proxy_handler()
import urllib.request


def proxy_user():
    proxy_list = [
        {"http": "104.131.109.66:8080"},
        {"http": "88.198.24.108:8080"},
        {"http": "96.113.165.182:3128"},
        {"http": "117.185.17.151:80"},
        {"http": "112.30.164.18:80"},
    ]
    for proxy in proxy_list:
        print(proxy)
        # build a handler from each proxy IP in turn
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # create the opener
        openner = urllib.request.build_opener(proxy_handler)
        try:
            openner.open("http://www.baidu.com", timeout=1)
            print("s1mpL3")
        except Exception as e:
            print(e)


proxy_user()
E:\python\python.exe H:/code/Python爬蟲/Day02/07-random-user-proxy.py
{'http': '104.131.109.66:8080'}
s1mpL3
{'http': '88.198.24.108:8080'}
<urlopen error timed out>
{'http': '96.113.165.182:3128'}
s1mpL3
{'http': '117.185.17.151:80'}
s1mpL3
{'http': '112.30.164.18:80'}
<urlopen error timed out>

Process finished with exit code 0
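To confirm that a proxy really hides the real IP, one option is to request an IP-echo service through the opener (a sketch only; httpbin.org/ip is an assumed echo endpoint, and these free proxy addresses may long since have expired):

import urllib.request

proxy_handler = urllib.request.ProxyHandler({"http": "104.131.109.66:8080"})
openner = urllib.request.build_opener(proxy_handler)
try:
    # httpbin.org/ip echoes back the IP it sees, which should be the proxy's address
    data = openner.open("http://httpbin.org/ip", timeout=5).read().decode("utf-8")
    print(data)
except Exception as e:
    print(e)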
import urllib.request


# sending requests through a paid proxy
# method 1: carry the username and password directly in the proxy string
def money_proxy_user():
    # 1. the proxy IP, with the credentials embedded
    money_proxy = {
        "http": "username:passwd@192.168.12.1:8080"
    }
    # 2. the proxy handler
    proxy_handler = urllib.request.ProxyHandler(money_proxy)
    # 3. build the opener from the handler
    openner = urllib.request.build_opener(proxy_handler)
    # 4. send the request with open()
    openner.open("http://www.baidu.com/")


money_proxy_user()
import urllib.request


# sending requests through a paid proxy
# method 2: use a password manager plus a proxy auth handler
def money_proxy_user():
    user_name = "abcname"
    passwd = "123456"
    proxy_money = "123.158.62.120:8080"
    # 2. create the password manager and add the username and password
    password_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    # a URI locates a resource; URI is broader than URL (URL: uniform resource locator)
    password_manager.add_password(None, proxy_money, user_name, passwd)
    # 3. create a handler that can authenticate against the proxy IP
    handler_auth_proxy = urllib.request.ProxyBasicAuthHandler(password_manager)
    # 4. build the opener from the handler
    openner_auth = urllib.request.build_opener(handler_auth_proxy)
    # 5. send the request
    response = openner_auth.open("http://www.baidu.com")
    print(response.read())


money_proxy_user()
Crawling data from your own (internal) site for analysis works much like the paid-proxy-IP flow above.
import urllib.request


def auth_neiwang():
    # 1. username and password
    user = "admin"
    password = "admin123"
    nei_url = "http://192.168.179.66"
    # 2. create the password manager
    pwd_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pwd_manager.add_password(None, nei_url, user, password)
    # 3. create the basic-auth handler
    auth_handler = urllib.request.HTTPBasicAuthHandler(pwd_manager)
    openner = urllib.request.build_opener(auth_handler)
    response = openner.open(nei_url)
    print(response)


auth_neiwang()