urllib最經常使用的兩大功能(我的理解:urllib用於輔助urllib2)
1. urllib.urlopen()
2. urllib.urlencode() #適當的編碼,可用於後面的post提交數據
import urllib

# Form fields to be serialised into an application/x-www-form-urlencoded
# query string (the format later passed as POST data).
params = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
encoded = urllib.urlencode(params)
print(encoded)
urllib2經常使用的函數
1.最基本的打開讀取一個網頁函數
import urllib2

# Simplest possible fetch: open the URL and read the whole body into memory.
PAGE_URL = 'http://www.baidu.com/'

response = urllib2.urlopen(PAGE_URL)
html = response.read()
2.用地址建立一個Request對象
# Wrap the address in a Request object first; urlopen() accepts either a
# plain URL string or a Request.
PAGE_URL = 'http://www.baidu.com/'

req = urllib2.Request(PAGE_URL)
response = urllib2.urlopen(req)
the_page = response.read()
3.Data數據利用post方式提交網站
import urllib
import urllib2

# Placeholder target: replace with the address of the form handler to post to.
url = 'http://www.example.com/'

# Form fields to submit. The original bound this dict to `value` but then
# encoded `values`, which raised NameError.
values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
data = urllib.urlencode(values)

# Supplying `data` turns the request into a POST.
# Request also accepts an optional third argument, headers:
#     request = urllib2.Request(url, data, headers)
request = urllib2.Request(url, data)
response = urllib2.urlopen(request)
print(response.read())
4.在 HTTP Request 中加入特定的 Header
import urllib2

PAGE_URL = 'http://www.baidu.com/'

# Build the request, then attach a custom User-Agent header before sending.
request = urllib2.Request(PAGE_URL)
request.add_header('User-Agent', 'fake-client')

response = urllib2.urlopen(request)
body = response.read()
print(body)
5.Cookie
import urllib2
import cookielib

# The CookieJar collects cookies set by responses that pass through an
# opener built with an HTTPCookieProcessor bound to it.
cookie = cookielib.CookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookie_handler)

response = opener.open('http://www.baidu.com')

# Dump every cookie the server handed back.
for item in cookie:
    print('Name = ' + item.name)
    print('Value = ' + item.value)
6.獲得 HTTP 的返回碼
import urllib2

# urlopen() raises HTTPError for non-success responses; the numeric status
# is available on the exception as `.code`.
try:
    response = urllib2.urlopen('http://bbs.csdn.net/why')
except urllib2.HTTPError as e:
    # `except X as e` (Python 2.6+) replaces the legacy `except X, e` form,
    # which is a syntax error on Python 3.
    print(e.code)
7.Timeout 設置
import urllib2

# Give up if the server has not answered within this many seconds.
TIMEOUT_SECONDS = 10

response = urllib2.urlopen('http://www.baidu.com/', timeout=TIMEOUT_SECONDS)
8.Redirect動作
import urllib2

# urlopen() follows redirects automatically. A redirect happened exactly when
# the final URL reported by geturl() differs from the one requested.
# (The original compared with `==`, so it printed True when NO redirect
# had occurred.)
for my_url in ('http://www.google.cn', 'http://rrurl.cn/b1UZuP'):
    response = urllib2.urlopen(my_url)
    redirected = response.geturl() != my_url
    print(redirected)
9.使用 HTTP 的 PUT 和 DELETE 方法
import urllib2

# NOTE: `uri` and `data` are not defined in this snippet; they are expected
# to be supplied by the surrounding code.
request = urllib2.Request(uri, data=data)

# urllib2 picks the HTTP verb by calling request.get_method(); overriding it
# on the instance forces a different verb.
request.get_method = lambda: 'PUT'  # or 'DELETE'

response = urllib2.urlopen(request)
10.Debug Log
import urllib2

# Handlers created with debuglevel=1, one for http:// and one for https://.
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)

# Install an opener built from them globally, so that plain urllib2.urlopen()
# calls go through these handlers.
opener = urllib2.build_opener(httpHandler, httpsHandler)
urllib2.install_opener(opener)

response = urllib2.urlopen('http://www.google.com')
11.表單的處理
# -*- coding: utf-8 -*-
import urllib
import urllib2

# Fields of the sign-in form, serialised as POST data.
form_fields = {
    'username': '汪小光',
    'password': 'why888',
    'continueURI': 'http://www.verycd.com/',
    'fk': '',
    'login_submit': '登陸',
}
postdata = urllib.urlencode(form_fields)

# Passing `data` makes urllib2 send the request as a POST.
req = urllib2.Request(url='http://secure.verycd.com/signin', data=postdata)
result = urllib2.urlopen(req)
print(result.read())
最後附上一段抓取某網站妹子圖片的代碼
import os
import urllib

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved the urllib2 API used here (Request, urlopen) into
    # urllib.request, so the same code runs on both versions.
    import urllib.request as urllib2

_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) '
               'Gecko/20100101 Firefox/46.0')
_IMG_MARKER = 'img src='
_PAGE_MARKER = 'current-comment-page'


def url_open(url):
    """Return the raw body of *url*, requested with a browser User-Agent."""
    req = urllib2.Request(url)
    req.add_header('User-Agent', _USER_AGENT)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()


def get_page(url):
    """Return the current comment-page number found in the page at *url*.

    The number is returned as a string: the text between the position 23
    characters after the start of 'current-comment-page' and the next ']'.

    Raises:
        ValueError: if the marker or the closing ']' is not present.
    """
    html = url_open(url).decode('utf-8')
    marker_pos = html.find(_PAGE_MARKER)
    if marker_pos == -1:
        raise ValueError("'current-comment-page' not found in " + url)
    # 23 = the 20-character marker plus 3 more characters; presumably the
    # `">[` that precedes the number in the page markup.
    start = marker_pos + 23
    end = html.find(']', start)
    if end == -1:
        raise ValueError("no closing ']' after page marker in " + url)
    return html[start:end]


def _extract_jpg_addrs(html):
    """Return the address of every 'img src=' tag in *html* ending in .jpg."""
    img_addrs = []
    addr_offset = len(_IMG_MARKER) + 1  # skip the marker and the opening quote
    tag = html.find(_IMG_MARKER)
    while tag != -1:
        addr_start = tag + addr_offset
        # Look for '.jpg' within 255 characters of the tag, but never past
        # the closing quote. Otherwise a non-jpg image would be glued to the
        # '.jpg' of a following tag.
        limit = tag + 255
        quote = html[addr_start - 1:addr_start]
        if quote in ('"', "'"):
            closing = html.find(quote, addr_start, limit)
            if closing != -1:
                limit = closing
        ext = html.find('.jpg', addr_start, limit)
        if ext != -1:
            img_addrs.append(html[addr_start:ext + 4])
            resume = ext
        else:
            resume = addr_start
        tag = html.find(_IMG_MARKER, resume)
    return img_addrs


def find_imgs(url):
    """Return the list of .jpg image addresses found in the page at *url*."""
    html = url_open(url).decode('utf-8')
    return _extract_jpg_addrs(html)


def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the directory *folder*.

    Each file is named after the last path component of its address.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so that a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(img)


def download_mm(folder='OOXX', pages=10):
    """Download the images of the newest *pages* comment pages into *folder*.

    *folder* is created if it does not exist yet. Files are written into it
    by path; the process working directory is left unchanged.
    """
    if not os.path.isdir(folder):
        os.makedirs(folder)
    url = "http://jandan.net/ooxx/"
    newest = int(get_page(url))
    for offset in range(pages):
        # Walk back one page per iteration. The original `page_num -= i`
        # subtracted cumulatively and skipped pages: n, n-1, n-3, n-6, ...
        page_num = newest - offset
        if page_num < 1:
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        save_imgs(folder, find_imgs(page_url))


if __name__ == '__main__':
    download_mm()