僅僅列出我用到的,不全。
劃重點:
1. urllib2 用 urllib.request 代替
2. urllib.urlencode 用 urllib.parse.urlencode 代替
3. cookielib 用 http.cookiejar 代替
4. print " " 用 print(" ") 代替
5. urllib2.URLError 用 urllib.error.URLError 代替
6. urllib2.HTTPError 用 urllib.error.HTTPError 代替
7. except urllib2.URLError, e: 用 except urllib.error.URLError as e: 代替
在python3.4.3自帶的IDLE中寫代碼,常常出現縮進錯誤,很難查找。
解決方案:拷貝到Notepad++裏面,視圖中顯示空格和製表符,就能夠明顯看出問題在哪了。
設置了header的網絡請求,在Python2.x中的寫法
# Python 2.x version: POST a login form with a custom User-Agent header.
# NOTE: uses the Python 2 modules urllib/urllib2 — this snippet will not run
# under Python 3; it is kept verbatim for comparison with the 3.x version.
import urllib
import urllib2

url = 'http://www.server.com/login'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'username' : 'kzy', 'password' : '123' }
headers = { 'User-Agent' : user_agent }
# urlencode produces 'username=kzy&password=123'; in Python 2 a str is fine as POST data
data = urllib.urlencode(values)
request = urllib2.Request(url, data, headers)
response = urllib2.urlopen(request)
page = response.read()
在Python3.x中的寫法
# Python 3.x version: POST a form with a custom User-Agent header.
import urllib.parse
import urllib.request

url = 'http://www.baidu.com'
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
form = {'username':'kzy','password':'123'}
# Unlike Python 2, the POST body must be bytes — hence the explicit encode.
body = urllib.parse.urlencode(form).encode(encoding='UTF8')
req = urllib.request.Request(url, body, {'User-Agent':user_agent})
page = urllib.request.urlopen(req).read()
我在學習靜覓的爬蟲教程,照着把裏面的基礎部分的代碼都寫了一遍。
教程地址:http://cuiqingcai.com/1052.html
裏面本來的代碼都是2.x的,我所有用3.x學着寫了一遍。以下:
# Worked examples from a Python 2 crawler tutorial, re-written for Python 3.
# Each triple-quoted string below is a disabled (commented-out) example;
# only the simulated-login code at the bottom is live.
import urllib.parse
import urllib.request

"""
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read())
"""

"""
#設置了header和data的請求
url = 'http://www.baidu.com'
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
values = {'username':'kzy','password':'123'}
headers = {'User-Agent':user_agent}
data = urllib.parse.urlencode(values).encode(encoding='UTF8')
request = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(request)
page = response.read()
"""

"""
#設置代理 避免由於某個IP的訪問次數過多致使的禁止訪問
enable_proxy = True
proxy_handler = urllib.request.ProxyHandler({"http":'http://some-proxy.com:8080'})
null_proxy_handler = urllib.request.ProxyHandler({})
if enable_proxy:
    opener = urllib.request.build_opener(proxy_handler)
else:
    opener = urllib.request.build_opener(null_proxy_handler)
urllib.request.install_opener(opener)
"""

"""
#設置Timeout
response = urllib.request.urlopen('http://www.baidu.com', timeout = 10)
"""

"""
#使用http的 put或delete方法
url = 'http://www.baidu.com'
request = urllib.request.Request(url, data=data)
request.get_method = lambda:'PUT' #or 'DELETE'
response = urllib.request.urlopen(request)
"""

"""
#使用DebugLog 把收發包的內容在屏幕上打印出來 方便調試
httpHandler = urllib.request.HTTPHandler(debuglevel=1)
httpsHandler = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httpHandler, httpsHandler)
urllib.request.install_opener(opener)
response = urllib.request.urlopen('https://its.pku.edu.cn/netportal/netportal_UTF-8.jsp', timeout = 5)
"""

"""
#URLError異常處理
from urllib.error import URLError, HTTPError
request = urllib.request.Request('http://www.baidu.com')
try:
    urllib.request.urlopen(request, timeout = 5)
except HTTPError as e:
    print('Error code:', e.code)
except URLError as e:
    print('Reason:', e.reason)
"""

"""
#URLError異常處理 屬性判斷
request = urllib.request.Request('https://its.pku.edu.cn/netportal/netportal_UTF-8.jsp')
try:
    urllib.request.urlopen(request, timeout = 5)
except urllib.error.URLError as e:
    if hasattr(e, "code"): #hasattr 判斷變量是否有某個屬性
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    print("OK")
"""

"""
#獲取cookie保存到變量
import http.cookiejar
#聲明一個CookieJar對象實例來保存cookie
cookie = http.cookiejar.CookieJar()
#利用HTTPCookieProcessor對象來建立cookie處理器
handler = urllib.request.HTTPCookieProcessor(cookie)
#經過handler來構建opener
opener = urllib.request.build_opener(handler)
#此處的open方法同urlopen
response = opener.open('https://its.pku.edu.cn/netportal/netportal_UTF-8.jsp')
for item in cookie:
    print('Name = '+item.name)
    print('Value = '+item.value)
"""

"""
#獲取cookie保存到文件
import http.cookiejar
#設置保存的文件
filename = 'cookie.txt'
#聲明一個MozillaCookieJar對象實例來保存cookie,以後寫入文件
cookie = http.cookiejar.MozillaCookieJar(filename)
#建立cookie處理器
handler = urllib.request.HTTPCookieProcessor(cookie)
#構建opener
opener = urllib.request.build_opener(handler)
response = opener.open("https://its.pku.edu.cn/netportal/netportal_UTF-8.jsp")
#保存到cookie文件
cookie.save(ignore_discard=True,ignore_expires=True)
"""

"""
#從文件中獲取cookie並訪問
import http.cookiejar
#建立MozillaCookieJar實例對象
cookie = http.cookiejar.MozillaCookieJar()
#從文件中讀取cookie內容到變量
cookie.load('cookie.txt',ignore_discard=True,ignore_expires=True)
#建立請求的request
req = urllib.request.Request('https://its.pku.edu.cn/netportal/netportal_UTF-8.jsp')
#建立opener
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open(req)
print(response.read())
"""

# Simulated login — author notes the login does not actually succeed yet.
import http.cookiejar

# Save received session cookies to this file via a MozillaCookieJar.
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
# How does one know the form field names are 'stuid' and 'pwd'???
# (presumably taken from the login page's HTML form — TODO confirm)
postdata = urllib.parse.urlencode({'stuid':'******','pwd':'******'}).encode(encoding='UTF8')
loginUrl = 'http://xxxxxx.com'
# POSTing the credentials; the opener stores any Set-Cookie responses in `cookie`.
result = opener.open(loginUrl, postdata)
cookie.save(ignore_discard=True, ignore_expires=True)
# Second request reuses the saved session cookie.
gradeUrl='http://xxxxxx.com'
result = opener.open(gradeUrl)
print(result.read())