3.0版本中已經將urllib2、urlparse、和robotparser併入了urllib中,而且修改urllib模塊,其中包含5個子模塊,便是help()中看到的那五個名字。
Python2中的urllib模塊,在Python3中被修改成:
20.5. urllib.request — Extensible library for opening URLs 20.6. urllib.response — Response classes used by urllib 20.7. urllib.parse — Parse URLs into components 20.8. urllib.error — Exception classes raised by urllib.request 20.9. urllib.robotparser — Parser for robots.txt
這幾個模塊,經常使用的urllib.urlopen()方法變成了urllib.request.urlopen()方法,其它方法的改變,能夠參考Python3的文檔。
Python3文檔的互聯網協議與支持部分:http://docs.python.org/py3k/library/internet.html
Python2使用庫:
urllib http://docs.python.org/library/urllib.html【下載】
urllib2 http://docs.python.org/library/urllib2.html【抓取】
urlparse http://docs.python.org/library/urlparse.html【url切分用到】
sgmllib http://docs.python.org/library/sgmllib.html【html解析用到】
# Python urllib2遞歸抓取某個網站下圖片
#!/usr/bin/python # -*- coding:utf-8 -*- # author: wklken # 2012-03-17 wklken@yeah.net #1實現url解析 #2實現圖片下載 #3優化重構 #4多線程 還沒有加入 import os,sys,urllib,urllib2,urlparse from sgmllib import SGMLParser img = [] class URLLister(SGMLParser): def reset(self): SGMLParser.reset(self) self.urls=[] self.imgs=[] def start_a(self, attrs): href = [ v for k,v in attrs if k=="href" and v.startswith("http")] if href: self.urls.extend(href) def start_img(self, attrs): src = [ v for k,v in attrs if k=="src" and v.startswith("http") ] if src: self.imgs.extend(src) def get_url_of_page(url, if_img = False): urls = [] try: f = urllib2.urlopen(url, timeout=1).read() url_listen = URLLister() url_listen.feed(f) if if_img: urls.extend(url_listen.imgs) else: urls.extend(url_listen.urls) except urllib2.URLError, e: print e.reason return urls #遞歸處理頁面 def get_page_html(begin_url, depth, ignore_outer, main_site_domain): #如果設置排除外站 過濾之 if ignore_outer: if not main_site_domain in begin_url: return if depth == 1: urls = get_url_of_page(begin_url, True) img.extend(urls) else: urls = get_url_of_page(begin_url) if urls: for url in urls: get_page_html(url, depth-1) #下載圖片 def download_img(save_path, min_size): print "download begin..." for im in img: filename = im.split("/")[-1] dist = os.path.join(save_path, filename) #此方式判斷圖片的大小太浪費了 #if len(urllib2.urlopen(im).read()) < min_size: # continue #這種方式先拉頭部,應該好多了,不用再下載一次 connection = urllib2.build_opener().open(urllib2.Request(im)) if int(connection.headers.dict['content-length']) < min_size: continue urllib.urlretrieve(im, dist,None) print "Done: ", filename print "download end..." 
if __name__ == "__main__":
    # First page to crawl for images.
    url = "http://www.baidu.com/"
    # Directory where images are saved.
    # BUG FIX: the path was misspelled "./downlaod" in the original.
    save_path = os.path.abspath("./download")
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Minimum image size in bytes; smaller files are skipped.
    min_size = 92
    # Crawl depth.
    max_depth = 1
    # Whether to skip pages outside the start domain.
    ignore_outer = True
    main_site_domain = urlparse.urlsplit(url).netloc

    get_page_html(url, max_depth, ignore_outer, main_site_domain)
    download_img(save_path, min_size)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import sys
import gzip
import socket
import urllib.request, urllib.parse, urllib.error
import http.cookiejar


class HttpTester:
    """Small urllib-based HTTP client: GET/POST with cookie, proxy and basic-auth support."""

    def __init__(self, timeout=10, addHeaders=True):
        socket.setdefaulttimeout(timeout)  # global socket timeout for all requests
        self.__opener = urllib.request.build_opener()
        urllib.request.install_opener(self.__opener)
        if addHeaders:
            self.__addHeaders()

    def __error(self, e):
        """Error handler: just print the exception."""
        print(e)

    def __addHeaders(self):
        """Install default browser-like request headers on the opener."""
        self.__opener.addheaders = [
            ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'),
            ('Connection', 'keep-alive'),
            ('Cache-Control', 'no-cache'),
            # BUG FIX: the header name had a stray trailing colon
            # ('Accept-Language:'), producing a malformed request header.
            ('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3'),
            ('Accept-Encoding', 'gzip, deflate'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]

    def __decode(self, webPage, charset):
        """Gunzip the payload if it is gzip-compressed, then decode it with `charset`."""
        # BUG FIX: the gzip magic number is the two bytes \x1f\x8b (RFC 1952);
        # the original compared against the literal text b'x1fx8b' (lost
        # backslashes), so gzip responses were never decompressed.
        if webPage.startswith(b'\x1f\x8b'):
            return gzip.decompress(webPage).decode(charset)
        else:
            return webPage.decode(charset)

    def addCookiejar(self):
        """Attach a CookieJar handler to self.__opener so cookies persist across calls."""
        cj = http.cookiejar.CookieJar()
        self.__opener.add_handler(urllib.request.HTTPCookieProcessor(cj))

    def addProxy(self, host, type='http'):
        """Route requests of the given scheme through the proxy `host`."""
        proxy = urllib.request.ProxyHandler({type: host})
        self.__opener.add_handler(proxy)

    def addAuth(self, url, user, pwd):
        """Register HTTP basic-auth credentials for `url`."""
        pwdMsg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        pwdMsg.add_password(None, url, user, pwd)
        auth = urllib.request.HTTPBasicAuthHandler(pwdMsg)
        self.__opener.add_handler(auth)

    def get(self, url, params={}, headers={}, charset='UTF-8'):
        """HTTP GET; `params` are urlencoded into the query string.

        Returns the decoded body, or None on HTTPError (which is printed).
        """
        if params:
            url += '?' + urllib.parse.urlencode(params)
        request = urllib.request.Request(url)
        for k, v in headers.items():
            request.add_header(k, v)  # extra headers for this request only
        try:
            response = urllib.request.urlopen(request)
        except urllib.error.HTTPError as e:
            self.__error(e)
        else:
            return self.__decode(response.read(), charset)
def post(self, url, params={}, headers={}, charset='UTF-8'): '''HTTP POST 方法''' params = urllib.parse.urlencode(params) request = urllib.request.Request(url, data=params.encode(charset)) # 帶 data 參數的 request 被認爲是 POST 方法。 for k,v in headers.items(): request.add_header(k, v) try: response = urllib.request.urlopen(request) except urllib.error.HTTPError as e: self.__error(e) else: return self.__decode(response.read(), charset) def download(self, url, savefile): '''下載文件或網頁''' header_gzip = None for header in self.__opener.addheaders: # 移除支持 gzip 壓縮的 header if 'Accept-Encoding' in header: header_gzip = header self.__opener.addheaders.remove(header) __perLen = 0
def reporthook(a, b, c): # a:已經下載的數據大小; b:數據大小; c:遠程文件大小; if c > 1000000: nonlocal __perLen per = (100.0 * a * b) / c if per>100: per=100 per = '{:.2f}%'.format(per) print('b'*__perLen, per, end='') # 打印下載進度百分比 sys.stdout.flush() __perLen = len(per)+1 print('--> {}t'.format(url), end='') try: urllib.request.urlretrieve(url, savefile, reporthook) # reporthook 爲回調鉤子函數,用於顯示下載進度 except urllib.error.HTTPError as e: self.__error(e) finally: self.__opener.addheaders.append(header_gzip) print() 2、應用實例 在OSC上動彈一下 ht = HttpTester() ht.addCookiejar() # 爲了隱私,把有些關鍵字隱藏了 ht.get('https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2F') ht.post(url = 'https://www.oschina.net/action/user/hash_login', params = {'email': '****@foxmail.com','pwd': 'e4a1425583d37fcd33b9*************','save_login': '1'})#密碼哈希,Firefox開發工具抓取的
ht.get('http://www.oschina.net/')
ht.post(url='http://www.oschina.net/action/tweet/pub',
        params={'user_code': '8VZTqhkJOqhnuugHvzBtME4***********',
                'user': '102*****',
                'msg': '你們在動彈什麼? via:(python3, uillib) ->{t}'.format(t=time.time())})

# Daily sign-in on Kuaipan (kuaipan.cn) to earn free storage space.
# Some keys are masked for privacy.
ht = HttpTester()
ht.addCookiejar()
ht.get('https://www.kuaipan.cn/account_login.htm')
ht.post(url='https://www.kuaipan.cn/index.php?ac=account&op=login',
        params={'username': '****@qq.com', 'userpwd': 'lyb********', 'isajax': 'yes'})
ht.get('http://www.kuaipan.cn/index.php?ac=zone&op=taskdetail')
ht.get('http://www.kuaipan.cn/index.php?ac=common&op=usersign')