1、urllib庫request
"""urllib.request basics: urlopen and urlretrieve."""
import urllib.request

url1 = "http://www.baidu.com"
image_url = ('https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality'
             '=100&size=b4000_4000&sec=1561451694&di=4123b89e27e7f8d6091dfedc3e222d5a&src'
             '=http://b-ssl.duitang.com/uploads/item/201711/01/20171101201000_UBjmK.jpeg')

# --- Approach 1: urlopen — send the request and receive the response ---
rep = urllib.request.urlopen(url=url1)
# print(rep)
# print(rep.read().decode())   # read body bytes, decode to str
# print(rep.geturl())          # the URL that was fetched
# print(rep.getheaders())      # response headers
# print(rep.getcode())         # HTTP status code
# print(rep.readlines())       # read the body line by line

rep2 = urllib.request.urlopen(url=image_url)
# with open('gaolu.jpeg', 'wb') as fp:   # save the downloaded bytes to a file
#     fp.write(rep2.read())

# --- Approach 2: urlretrieve — download straight into a local file ---
rep3 = urllib.request.urlretrieve(image_url, 'gl.jpeg')
2、urllib庫parse
"""urllib.parse basics: quote/unquote and urlencode."""
import urllib.parse

# --- Approach 1: quote and unquote ---
url = 'http://www.baidu.com/index.html?name=狗蛋&pwd=123456'
ret = urllib.parse.quote(url)      # percent-encode non-ASCII/reserved characters
print(ret)
ret2 = urllib.parse.unquote(ret)   # decode back to the original URL
print(ret2)

# --- Approach 2: urlencode ---
url2 = 'http://www.baidu.com/index.html'
data = {'name': '狗蛋',
        'age': 15}

# Build the query string by hand.
lt = []
for k, v in data.items():
    lt.append(k + '=' + str(v))
query_string = '&'.join(lt)
print(query_string)
url3 = url2 + '?' + query_string
# BUG FIX: the original printed `url` here, leaving the freshly built
# `url3` unused; print the concatenated URL instead.
print(url3)

# urlencode does the dict -> query-string conversion in one call.
query_string2 = urllib.parse.urlencode(data)
print(query_string2)
3、request.Request建立請求對象
"""Build a Request object with custom headers — step one of evading anti-crawler checks."""
import urllib.request
import urllib.parse

url1 = 'http://www.baidu.com/'

# Spoof the User-Agent so the server sees a regular browser.
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

# Build the request object, then send it.
req = urllib.request.Request(url=url1, headers=headers1)
rep = urllib.request.urlopen(req)
# print(rep.read().decode())
print(rep.getheaders())
4、post請求
"""POST request demo against Baidu Translate's `sug` suggestion endpoint."""
import urllib.request
import urllib.parse

post_url = 'https://fanyi.baidu.com/sug'

word = input('>>>:')

# Build the POST form body: urlencode the dict, then encode str -> bytes.
data = {'kw': word}
form_data = urllib.parse.urlencode(data).encode()

headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/73.0.3683.86 Safari/537.36'}

req = urllib.request.Request(url=post_url, headers=headers1)

# Passing data= switches urlopen to a POST request.
rep = urllib.request.urlopen(req, data=form_data)
ret = rep.read().decode()
print(ret)
5、Ajax-get請求
"""Ajax-style GET: fetch JSON from the Douban movie chart endpoint."""
import urllib.request
import urllib.parse

# Full endpoint shape:
# https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&action=&start=120&limit=20
url = 'https://movie.douban.com/j/chart/top_list?type' \
      '=24&interval_id=100%3A90&action=&'

# Paging parameters, urlencoded and appended to the query string.
dic_data = {'start': 4,
            'limit': 5}
data = urllib.parse.urlencode(dic_data)
url += data

headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/73.0.3683.86 Safari/537.36'}

req = urllib.request.Request(url=url, headers=headers1)
rep = urllib.request.urlopen(req)
print(rep.read().decode())
6、百度貼吧例子
"""Download the pages of a named Baidu Tieba forum and save each page's HTML."""
import urllib.request
import urllib.parse
import os

url = 'http://tieba.baidu.com/f?ie=utf-8&'
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/73.0.3683.86 Safari/537.36'}

ba_name = input("請輸入貼吧名:")
start_page = int(input("請輸入起始頁碼:"))
end_page = int(input("請輸入結束頁碼:"))

# One output directory per forum.
if not os.path.exists(ba_name):
    os.mkdir(ba_name)

for page in range(start_page, end_page + 1):
    # BUG FIX: 'pn' was the literal string '(page-1)*50', so every request
    # fetched the first page. It must be the computed post offset
    # (Tieba lists 50 posts per page).
    data = {'kw': ba_name,
            'pn': (page - 1) * 50}
    form_data = urllib.parse.urlencode(data)   # dict -> query string
    post_url = url + form_data                 # final page URL
    req = urllib.request.Request(url=post_url, headers=headers1)
    rep = urllib.request.urlopen(req)
    ret = rep.read()
    # Save raw bytes as <page>.html inside the forum's directory.
    filename = str(page) + '.html'
    filepath = ba_name + '/' + filename
    with open(filepath, 'wb') as fp:
        fp.write(ret)
7、URLError
"""URLError demo: request an unreachable host and catch the failure."""
import urllib.request
import urllib.parse
import urllib.error

url = 'http://www.maodan.com'

# A bare `except Exception` would also work, but it swallows everything;
# urllib.error.URLError targets exactly the network-level failures.
try:
    rep = urllib.request.urlopen(url)
    print(rep)
except urllib.error.URLError as e:
    print(e)
8、HTTPError
"""HTTPError vs URLError demo.

HTTPError is a subclass of URLError, so both clauses can catch an HTTP
failure. When catching both, the HTTPError clause must come first —
otherwise it is unreachable (the child goes first; if the child can't
handle it, the parent steps in).
"""
import urllib.request
import urllib.parse
import urllib.error

url = 'https://www.cnblogs.com/java-chen-hao/p/1108374.html'

# Precise capture: the more specific HTTPError before the generic URLError.
try:
    rep = urllib.request.urlopen(url)
    print(rep)
except urllib.error.HTTPError as e:
    print(e)
except urllib.error.URLError as e:
    print(e)
9、handler與opener
"""handler/opener plumbing: the machinery urlopen uses under the hood."""
import urllib.request
import urllib.parse

url = 'http://www.baidu.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

# Create an HTTP handler, then build an opener around it.
my_handler = urllib.request.HTTPHandler()
my_opener = urllib.request.build_opener(my_handler)

# Build the request object and send it through the opener.
req = urllib.request.Request(url, headers=headers)
rep = my_opener.open(req)
print(rep.read().decode())
10、代碼配置代理
"""Route requests through an HTTP proxy configured in code via ProxyHandler."""
import urllib.request
import urllib.parse

# Opener whose HTTP traffic goes through the given proxy.
my_handler = urllib.request.ProxyHandler({'http': '114.215.95.188:3128'})
my_opener = urllib.request.build_opener(my_handler)

# Baidu "what is my IP" search URL — shows which address the server saw.
url = 'https://www.baidu.com/s?ie=utf-8&wd=IP'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

req = urllib.request.Request(url, headers=headers)
rep = my_opener.open(req)
with open('ip.html', 'wb') as fp:
    fp.write(rep.read())