依然採用IE的F12開發者工具分析抓取到的數據。
javascript
通常網站登錄成功後,跳轉方式主要有兩種:(1)服務器返回的響應頭中包含 Location header,該 header 的值即爲重定向地址,獲取該 header 內容後訪問即可。(2)服務器返回的響應內容中,包含使用 javascript 方法生成的重定向地址,可使用正則表達式獲取 window.location.replace("redirected URL") 中的內容。
然而,這兩種方式都不能獲取淘寶的重定向地址。經分析,想拿到淘寶中的個人數據要分三步:(1)淘寶登錄,獲取token值。(2)根據獲取的token值,獲得st值。(3)根據得到的st值,獲取重定向地址。
html
得到重定向地址後,後面的事情就簡單多了。打開重定向地址,從返回的html信息中提取相應的地址信息即可。
java
下面紅線圈出的是比較重要的信息,需要仔細分析。第一個POST方法是提交登錄參數,返回參數中包含token值,那麼下面緊跟着的兩個GET方法作用是什麼呢?還記得上面提到說要獲取淘寶個人數據分三步吧?沒錯!下面兩個GET請求分別用於得到st值及重定向地址。
再來看看第一個GET方法的詳細信息,可看到傳遞的參數中有token值。
其響應信息如下,是一段js腳本。
接着,看第二個GET請求的詳細信息,地址中包含剛剛獲得的st值及其他參數值。
其響應如下,返回值包含一個url。
對比發現,與下面打開的url一致,即爲重定向地址。
# -*- coding: utf-8 -*-
"""Log in to Taobao and print the page behind the post-login redirect.

The flow has three steps:

1. POST the login form and read the ``token`` from the response.
2. Exchange the token for an ``st`` value (``mini_apply_st.js``).
3. Open ``vst.htm`` with the ``st`` value to obtain the redirect page.

The module runs on Python 2 (urllib2 / cookielib) and on Python 3
(urllib.request / http.cookiejar). Importing it has no side effects;
credentials are only requested when ``loginToTaobao()`` is called.
"""
import getpass
import os
import re
import sys
import tempfile
import urllib
import zlib

try:  # Python 2
    import urllib2
    import cookielib
    _urlencode = urllib.urlencode
    _readLine = raw_input
except ImportError:  # Python 3
    import urllib.parse
    import urllib.request as urllib2
    import http.cookiejar as cookielib
    _urlencode = urllib.parse.urlencode
    _readLine = input


class LoginError(Exception):
    """Raised when the login flow cannot continue."""


# Login endpoint.
tbLoginUrl = "https://login.taobao.com/member/login.jhtml"
# Captcha image URL; filled in by loginToTaobao() from the login page.
checkCodeUrl = ''
# File the captcha image is saved to so the user can open it.
checkCodePath = os.path.join(tempfile.gettempdir(), "checkcode.jpeg")
# Error codes for which the script fetches a captcha and retries.
captchaErrorCodes = ("3425", "1000")
# Upper bound on login POSTs, so a persistent failure cannot loop forever.
maxLoginAttempts = 5

# Headers sent with the login POST.
# NOTE(review): 'x-requestted-with', 'ContentType' and 'chartset' look like
# misspellings of 'X-Requested-With', 'Content-Type' and 'charset'. They are
# kept as-is because the server's reaction to the corrected names has not
# been verified.
headers = {
    'x-requestted-with': 'XMLHttpRequest',
    'Accept-Language': 'zh-cn',
    'Accept-Encoding': 'gzip, deflate',
    'ContentType': 'application/x-www-form-urlencoded; chartset=UTF-8',
    'Host': 'login.taobao.com',
    'DNT': 1,
    'Cache-Control': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
    'Referer': 'https://login.taobao.com/member/login.jhtml?redirectURL=http%3A%2F%2Fwww.taobao.com%2F',
    'Connection': 'Keep-Alive',
}

# Credentials; filled in on the first call to loginToTaobao().
username = ''
password = ''

# Login form fields. TPL_username / TPL_password are filled in at login
# time; getCheckCode() adds TPL_checkcode when a captcha is required.
postData = {
    'TPL_username': username,
    'TPL_password': password,
    "need_check_code": "false",
    "loginsite": 0,
    "newlogin": 1,
    'TPL_redirect_url': '',
    'from': 'tbTop',
    'fc': "default",
    'style': 'default',
    'css_style': '',
    'tid': '',
    'support': '000001',
    'CtrlVersion': '1,0,0,7',
    'loginType': 3,
    'minititle': '',
    'minipara': '',
    "umto": "NAN",
    'pstrong': 2,
    'llnick': '',
    'sign': '',
    'need_sign': '',
    "isIgnore": '',
    "full_redirect": '',
    'popid': '',
    'callback': '1',
    'guf': '',
    'not_duplite_str': '',
    'need_user_id': '',
    'poy': '',
    'gvfdcname': 10,
    'from_encoding': '',
    "sub": '',
    "allp": '',
    'action': 'Authenticator',
    'event_submit_do_login': 'anything',
    'longLogin': 0,
}


def _ensureCredentials():
    """Prompt for any credential that is not yet present in postData."""
    global username, password
    if not postData["TPL_username"]:
        username = _readLine("Please input your username in taobao: ")
        postData["TPL_username"] = username
    if not postData["TPL_password"]:
        # getpass keeps the password off the screen where the terminal allows.
        password = getpass.getpass("Please input your password of taobao: ")
        postData["TPL_password"] = password


def _decodeBody(raw, contentEncoding, charset):
    """Return the response body as text.

    raw is the body as read from the socket, contentEncoding is the value
    of the Content-Encoding response header (or None), and charset is the
    text encoding of the decompressed body.
    """
    encoding = (contentEncoding or "").lower()
    if encoding in ("gzip", "x-gzip", "deflate"):
        try:
            # wbits | 32 auto-detects a gzip or zlib header.
            raw = zlib.decompress(raw, zlib.MAX_WBITS | 32)
        except zlib.error:
            # Some servers send a raw deflate stream without any header.
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw.decode(charset)


def loginToTaobao():
    """Run the full login flow and print the redirect page.

    Installs a cookie-aware opener globally, loads the login page to find
    the captcha URL, posts the login form (asking for a captcha when the
    server demands one), exchanges the returned token for an st value and
    finally opens the vst.htm URL, printing its body.

    Raises:
        LoginError: the server rejected the login for a reason other than
            a captcha, the retry limit was reached, a captcha was demanded
            but no captcha URL was found, or no st value was returned.
    """
    global checkCodeUrl
    _ensureCredentials()

    # LWPCookieJar stores the cookies so later requests share the session.
    cookieJar = cookielib.LWPCookieJar()
    cookieSupport = urllib2.HTTPCookieProcessor(cookieJar)
    opener = urllib2.build_opener(cookieSupport, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    # Open the login page.
    taobao = urllib2.urlopen(tbLoginUrl)
    try:
        resp = taobao.read().decode("gbk")
    finally:
        taobao.close()
    displayCookies(cookieJar)

    # Extract the captcha image address; the page may not contain one.
    pattern = r'img id="J_StandardCode_m" src="https://s.tbcdn.cn/apps/login/static/img/blank.gif" data-src="(\S*)"'
    checkCodeUrlList = re.findall(pattern, resp)
    checkCodeUrl = checkCodeUrlList[0] if checkCodeUrlList else ''
    print("checkCodeUrl: %s" % checkCodeUrl)

    # First attempt is sent without a captcha.
    result = sendPostData(tbLoginUrl, postData, headers)
    print("result:  %s" % result)
    attempts = 1
    while not result["state"]:
        print("failed to login in, error message: %s" % result["message"])
        if result["code"] not in captchaErrorCodes:
            # Re-posting the same form cannot succeed; stop instead of
            # repeating the request forever.
            raise LoginError("login rejected, error code: %s" % result["code"])
        if attempts >= maxLoginAttempts:
            raise LoginError("giving up after %d login attempts" % attempts)
        if not checkCodeUrl:
            raise LoginError("a captcha is required but no captcha URL was found")
        getCheckCode(checkCodeUrl)
        result = sendPostData(tbLoginUrl, postData, headers)
        print("result:  %s" % result)
        attempts += 1
    print("successfully login in!")

    # Exchange the token for the st value.
    url = "https://passport.alipay.com/mini_apply_st.js?site=0"
    url = url + "&token=" + result["token"] + "&callback=vstCallback519"
    stResponse = urllib2.urlopen(url)
    try:
        text = stResponse.read().decode("utf-8", "replace")
    finally:
        stResponse.close()
    print(text)
    displayCookies(cookieJar)
    stMatch = re.search(r'"st"\s*:\s*"([^"]*)"', text)
    if stMatch is None:
        raise LoginError("no st value found in the mini_apply_st response")
    st = stMatch.group(1)
    print(st)

    # Open the redirect address with the st value and the logged-in user.
    # NOTE(review): the key 'TPL_uesrname' is kept exactly as in the original
    # request; it may be a typo for 'TPL_username' - confirm against a
    # captured request.
    myTaobaoUrl = "http://login.taobao.com/member/vst.htm?"
    myTaobaoUrl = myTaobaoUrl + _urlencode(
        [("st", st), ("TPL_uesrname", postData["TPL_username"])])
    myTaobao = urllib2.urlopen(myTaobaoUrl)
    try:
        # Printed undecoded: the page's encoding is not known here.
        print(myTaobao.read())
    finally:
        myTaobao.close()
    displayCookies(cookieJar)


def displayCookies(cookiejar):
    """Print every cookie currently held in cookiejar."""
    print("+" * 20 + "displayCookies" + "+" * 20)
    for cookie in cookiejar:
        print(cookie)


def sendPostData(url, data, header):
    """POST the form ``data`` to ``url`` and parse the login response.

    Args:
        url: address to post to.
        data: mapping of form fields; it is URL-encoded before sending.
        header: mapping of request headers.

    Returns:
        The dict produced by handleResponseText() for the response body.
    """
    print("+" * 20 + "sendPostData" + "+" * 20)
    body = _urlencode(data)
    if not isinstance(body, bytes):
        # Python 3 requires the request body as bytes.
        body = body.encode("ascii")
    request = urllib2.Request(url, body, header)
    response = urllib2.urlopen(request)
    try:
        raw = response.read()
        info = response.info()
        status = response.getcode()
    finally:
        response.close()
    # The request advertises gzip/deflate, so undo it before decoding.
    text = _decodeBody(raw, info.get("Content-Encoding"), "gbk")
    print(status)
    print(info)
    print("Response: %s" % text)
    return handleResponseText(text)


def handleResponseText(text):
    """Extract the login outcome from the response text.

    Args:
        text: body of the login response, a JSON-like string.

    Returns:
        A dict with the keys "state" (bool), "message", "code" and "token"
        (strings, empty when absent). "token" is only filled in when the
        state is true; "message" and "code" only when it is not.
    """
    print("+" * 20 + "handleResponseText" + "+" * 20)
    responseData = {"state": False, "message": "", "code": "", "token": ""}

    stateMatch = re.search(r'"state"\s*:\s*"?(\w+)"?', text)
    if stateMatch is not None and stateMatch.group(1) == "true":
        responseData["state"] = True
        # Extract the token.
        tokenMatch = re.search(r'"token"\s*:\s*"(\w*)"', text)
        if tokenMatch is not None:
            responseData["token"] = tokenMatch.group(1)
        return responseData

    # The message is matched up to its closing quote, so commas and spaces
    # inside it are preserved.
    messageMatch = re.search(r'"message"\s*:\s*"((?:[^"\\]|\\.)*)"', text)
    if messageMatch is not None:
        responseData["message"] = messageMatch.group(1)
    else:
        print("failed to get the error message")

    # The code may be quoted or bare, and may be the last key of its object.
    codeMatch = re.search(r'"code"\s*:\s*"?(\w+)"?', text)
    if codeMatch is not None:
        responseData["code"] = codeMatch.group(1)
    else:
        print("failed to get the error code")
    return responseData


def getCheckCode(url):
    """Download the captcha image and ask the user to type its text.

    The image is written to checkCodePath. On success the answer is stored
    in postData["TPL_checkcode"] and postData["need_check_code"] is set to
    "true". If the download does not return status 200, postData is left
    unchanged.

    Args:
        url: address of the captcha image.
    """
    print("+" * 20 + "getCheckCode" + "+" * 20)
    response = urllib2.urlopen(url)
    try:
        status = response.getcode()
        picData = response.read()
    finally:
        response.close()
    if status != 200:
        print("failed to get Check Code, status: %s" % status)
        return

    localPic = open(checkCodePath, "wb")
    try:
        localPic.write(picData)
    finally:
        localPic.close()
    print("請到%s,打開驗證碼圖片" % checkCodePath)
    checkCode = _readLine("請輸入驗證碼:").strip()
    print("%s %s" % (checkCode, type(checkCode)))
    postData["TPL_checkcode"] = checkCode
    postData["need_check_code"] = "true"


if __name__ == "__main__":
    try:
        loginToTaobao()
    except LoginError as err:
        print("login aborted: %s" % err)
        sys.exit(1)
在前兩篇的基礎上,增加了一些功能:(1)驗證碼輸入錯誤時,可以重複輸入。(2)獲取淘寶中的個人相關信息,目前還不完善,只能獲得登錄後頁面的html信息,相關信息還未提取,後面補充。