反反爬蟲系列將會陸續地介紹一些爬蟲的高級玩法,主要就是分析網站的加密方式,通過 Python 代碼實現並爬取數據!
OK,今天要分析的網站爲同程網,目標是獲取其酒店的用戶評論,評論信息是通過一個返回 JSON 的 API 提供的。頁面大概長這個樣子。
抓包調試一下~按 F12 卻出現這個蛋疼的東西,網站對 Chrome 控制檯動了手腳。
很明顯地就能發現是這段 JS 在搞鬼。
沒辦法,只能用 Ctrl+S 把 HTML 保存到本地,然後找到這段 JS,把裏面的函數刪除掉即可。
這裏有坑:這些 JS 文件都是壓縮過的,所以務必要保持結構的完整性。在用 Sublime 打開的時候,只刪除這段函數的內容即可,不要試圖格式化這個 JS 文件。
OK!這個時候再打開本地保存的 HTML 文件,F12 就可以進行抓包分析了。(當然了,如果你和我一樣用 Fiddler 抓包的話,上面的就相當於白說。對於這種對控制檯動手腳的反爬網站,建議用 Fiddler 進行抓包分析!)
# HTTP request headers sent with the comment-API request.
headers = {
    'Host': 'www.ly.com',
    "Referer": "https://www.ly.com/HotelInfo-92515879.html",
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/71.0.3578.98 Safari/537.36',
}

# Query-string parameters of the comment-API request; `antitoken` is the
# value the rest of the article works out how to generate.
params = {
    'hotelid': "92515879",
    'page': "1",
    'pageSize': '10',
    'commentType': '0',
    'roomTypeId': '',
    'tripPurposeId': '',
    'RankType': '1',
    'mainTagId': '',
    'subTagId': '',
    'antitoken': "54fc51cc98d934d4b2e054c58ca905f6",
}
很明顯就能發現一個關鍵參數 antitoken。接下來就是獲取這個 antitoken。搜索一下 antitoken 關鍵字。
搜索出來的結果都指向 last.js 文件,那就沒啥好說的,直接觀察這個文件裏面的函數。將這個文件全部複製下來,然後在 Sources 裏新建一個 Snippet,將代碼複製進去,繼續搜索 antitoken。
這就是一段 ajax 代碼,從 cookie 中獲取「wangba」。在 cookie 中找到 wangba,發現它是一個類似於時間戳的東西;後面其實也介紹到了,e 其實就是一個時間戳。那就好說了,直接模擬。
// Current time in milliseconds since the Unix epoch, as a decimal string.
// The article assigns this to `e` in place of the value read from the "wangba" cookie.
e=(new Date).getTime().toString()
直接在 Snippet 中修改,然後在 debug 模式下調試 JS,找到 antitoken 的生成函數。兩步找到函數~
// Bundle module function (dumped verbatim from the browser console).
//   e - module object; its `exports` property is assigned at the end.
//   t - not used in this body.
//   a - lookup function, called with the numeric ids 29, 12 and 30 to obtain
//       n (byte/word helpers), i / s (the `utf8` and `bin` encoders) and o.
// The inner function r(e, t) normalises its input to bytes (strings go through
// s when t.encoding === "binary", otherwise through i; values for which o(e) is
// truthy are copied to a plain array; other non-arrays are stringified), pads the
// word array, then processes it 16 words at a time with the r._ff / r._gg /
// r._hh / r._ii step functions and returns n.endian([c, d, p, u]).
// NOTE(review): the four initial state values and the additive constants match
// those of the MD5 algorithm (RFC 1321).
// The exported function throws Error("Illegal argument ...") for undefined/null
// input and returns the digest as a byte array (t.asBytes), a string built with
// s.bytesToString (t.asString), or otherwise n.bytesToHex output.
function(e, t, a) { var n, i, o, s, r; n = a(29), i = a(12).utf8, o = a(30), s = a(12).bin, (r = function(e, t) { e.constructor == String ? e = t && "binary" === t.encoding ? s.stringToBytes(e) : i.stringToBytes(e) : o(e) ? e = Array.prototype.slice.call(e, 0) : Array.isArray(e) || (e = e.toString()); for (var a = n.bytesToWords(e), l = 8 * e.length, c = 1732584193, d = -271733879, p = -1732584194, u = 271733878, m = 0; m < a.length; m++) a[m] = 16711935 & (a[m] << 8 | a[m] >>> 24) | 4278255360 & (a[m] << 24 | a[m] >>> 8); a[l >>> 5] |= 128 << l % 32, a[14 + (l + 64 >>> 9 << 4)] = l; var f = r._ff , h = r._gg , v = r._hh , g = r._ii; for (m = 0; m < a.length; m += 16) { var y = c , _ = d , b = p , $ = u; d = g(d = g(d = g(d = g(d = v(d = v(d = v(d = v(d = h(d = h(d = h(d = h(d = f(d = f(d = f(d = f(d, p = f(p, u = f(u, c = f(c, d, p, u, a[m + 0], 7, -680876936), d, p, a[m + 1], 12, -389564586), c, d, a[m + 2], 17, 606105819), u, c, a[m + 3], 22, -1044525330), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 4], 7, -176418897), d, p, a[m + 5], 12, 1200080426), c, d, a[m + 6], 17, -1473231341), u, c, a[m + 7], 22, -45705983), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 8], 7, 1770035416), d, p, a[m + 9], 12, -1958414417), c, d, a[m + 10], 17, -42063), u, c, a[m + 11], 22, -1990404162), p = f(p, u = f(u, c = f(c, d, p, u, a[m + 12], 7, 1804603682), d, p, a[m + 13], 12, -40341101), c, d, a[m + 14], 17, -1502002290), u, c, a[m + 15], 22, 1236535329), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 1], 5, -165796510), d, p, a[m + 6], 9, -1069501632), c, d, a[m + 11], 14, 643717713), u, c, a[m + 0], 20, -373897302), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 5], 5, -701558691), d, p, a[m + 10], 9, 38016083), c, d, a[m + 15], 14, -660478335), u, c, a[m + 4], 20, -405537848), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 9], 5, 568446438), d, p, a[m + 14], 9, -1019803690), c, d, a[m + 3], 14, -187363961), u, c, a[m + 8], 20, 1163531501), p = h(p, u = h(u, c = h(c, d, p, u, a[m + 13], 5, 
-1444681467), d, p, a[m + 2], 9, -51403784), c, d, a[m + 7], 14, 1735328473), u, c, a[m + 12], 20, -1926607734), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 5], 4, -378558), d, p, a[m + 8], 11, -2022574463), c, d, a[m + 11], 16, 1839030562), u, c, a[m + 14], 23, -35309556), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 1], 4, -1530992060), d, p, a[m + 4], 11, 1272893353), c, d, a[m + 7], 16, -155497632), u, c, a[m + 10], 23, -1094730640), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 13], 4, 681279174), d, p, a[m + 0], 11, -358537222), c, d, a[m + 3], 16, -722521979), u, c, a[m + 6], 23, 76029189), p = v(p, u = v(u, c = v(c, d, p, u, a[m + 9], 4, -640364487), d, p, a[m + 12], 11, -421815835), c, d, a[m + 15], 16, 530742520), u, c, a[m + 2], 23, -995338651), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 0], 6, -198630844), d, p, a[m + 7], 10, 1126891415), c, d, a[m + 14], 15, -1416354905), u, c, a[m + 5], 21, -57434055), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 12], 6, 1700485571), d, p, a[m + 3], 10, -1894986606), c, d, a[m + 10], 15, -1051523), u, c, a[m + 1], 21, -2054922799), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 8], 6, 1873313359), d, p, a[m + 15], 10, -30611744), c, d, a[m + 6], 15, -1560198380), u, c, a[m + 13], 21, 1309151649), p = g(p, u = g(u, c = g(c, d, p, u, a[m + 4], 6, -145523070), d, p, a[m + 11], 10, -1120210379), c, d, a[m + 2], 15, 718787259), u, c, a[m + 9], 21, -343485551), c = c + y >>> 0, d = d + _ >>> 0, p = p + b >>> 0, u = u + $ >>> 0 } return n.endian([c, d, p, u]) } )._ff = function(e, t, a, n, i, o, s) { var r = e + (t & a | ~t & n) + (i >>> 0) + s; return (r << o | r >>> 32 - o) + t } , r._gg = function(e, t, a, n, i, o, s) { var r = e + (t & n | a & ~n) + (i >>> 0) + s; return (r << o | r >>> 32 - o) + t } , r._hh = function(e, t, a, n, i, o, s) { var r = e + (t ^ a ^ n) + (i >>> 0) + s; return (r << o | r >>> 32 - o) + t } , r._ii = function(e, t, a, n, i, o, s) { var r = e + (a ^ (t | ~n)) + (i >>> 0) + s; return (r << o | r >>> 32 - o) + t } 
, r._blocksize = 16, r._digestsize = 16, e.exports = function(e, t) { if (e === undefined || null === e) throw new Error("Illegal argument " + e); var a = n.wordsToBytes(r(e, t)); return t && t.asBytes ? a : t && t.asString ? s.bytesToString(a) : n.bytesToHex(a) } }
這個函數接收 e、t、a 三個參數,並定義了 n、i、o、s、r 幾個變量(var n, i, o, s, r;),最後就是根據這些變量返回 t 對象,調用 t.getantitoken 從而獲取到 antitoken 值。OK,思路有了,接下來就是僞造這些 n、i、o、s。那麼如何進行僞造呢?很簡單:繼續調試,找到這幾個變量的生成函數。
這幾個參數都是由 a 這個對象生成的,所以接下來找 a。
將之前的斷點取消,在 n 處打斷點,運行程序。然後在控制檯輸入 a(29),果然得到一個函數~
// Bundle module function (console dump of a(29)): assigns the helper object n to e.exports.
//   a - the 64-character base64 alphabet used by bytesToBase64 / base64ToBytes.
//   n - bit and byte utilities: rotl / rotr (32-bit rotations), endian (byte-order
//       swap of a number, or of every element of an array in place), randomBytes,
//       bytesToWords / wordsToBytes (pack/unpack bytes into 32-bit words, most
//       significant byte first), bytesToHex / hexToBytes, bytesToBase64 / base64ToBytes.
function(e, t) { var a, n; a = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", n = { rotl: function(e, t) { return e << t | e >>> 32 - t }, rotr: function(e, t) { return e << 32 - t | e >>> t }, endian: function(e) { if (e.constructor == Number) return 16711935 & n.rotl(e, 8) | 4278255360 & n.rotl(e, 24); for (var t = 0; t < e.length; t++) e[t] = n.endian(e[t]); return e }, randomBytes: function(e) { for (var t = []; e > 0; e--) t.push(Math.floor(256 * Math.random())); return t }, bytesToWords: function(e) { for (var t = [], a = 0, n = 0; a < e.length; a++, n += 8) t[n >>> 5] |= e[a] << 24 - n % 32; return t }, wordsToBytes: function(e) { for (var t = [], a = 0; a < 32 * e.length; a += 8) t.push(e[a >>> 5] >>> 24 - a % 32 & 255); return t }, bytesToHex: function(e) { for (var t = [], a = 0; a < e.length; a++) t.push((e[a] >>> 4).toString(16)), t.push((15 & e[a]).toString(16)); return t.join("") }, hexToBytes: function(e) { for (var t = [], a = 0; a < e.length; a += 2) t.push(parseInt(e.substr(a, 2), 16)); return t }, bytesToBase64: function(e) { for (var t = [], n = 0; n < e.length; n += 3) for (var i = e[n] << 16 | e[n + 1] << 8 | e[n + 2], o = 0; o < 4; o++) 8 * n + 6 * o <= 8 * e.length ? t.push(a.charAt(i >>> 6 * (3 - o) & 63)) : t.push("="); return t.join("") }, base64ToBytes: function(e) { e = e.replace(/[^A-Z0-9+\/]/gi, ""); for (var t = [], n = 0, i = 0; n < e.length; i = ++n % 4) 0 != i && t.push((a.indexOf(e.charAt(n - 1)) & Math.pow(2, -2 * i + 8) - 1) << 2 * i | a.indexOf(e.charAt(n)) >>> 6 - 2 * i); return t } }, e.exports = n }
再次輸入a(12)
// Bundle module function (console dump of a(12)): assigns the encoder object a to e.exports.
//   a.bin  - stringToBytes keeps the low 8 bits of each char code; bytesToString
//            maps each byte back through String.fromCharCode.
//   a.utf8 - wraps a.bin with unescape(encodeURIComponent(...)) on the way in and
//            decodeURIComponent(escape(...)) on the way out.
function(e, t) { var a = { utf8: { stringToBytes: function(e) { return a.bin.stringToBytes(unescape(encodeURIComponent(e))) }, bytesToString: function(e) { return decodeURIComponent(escape(a.bin.bytesToString(e))) } }, bin: { stringToBytes: function(e) { for (var t = [], a = 0; a < e.length; a++) t.push(255 & e.charCodeAt(a)); return t }, bytesToString: function(e) { for (var t = [], a = 0; a < e.length; a++) t.push(String.fromCharCode(e[a])); return t.join("") } } }; e.exports = a }
a(30) = Null
所以 n、i、o、s 這幾個參數都找到了,接下來就是構建 antitoken 函數,帶上關鍵參數生成 token 了。
// Token input: the current time in milliseconds since the Unix epoch, as a decimal string.
var e = (new Date()).getTime().toString();

/**
 * Generate the antitoken for a message: its MD5 digest, by default as a
 * 32-character lowercase hexadecimal string.
 *
 * Self-contained port of the site's bundled digest routine together with the
 * byte/word helpers and character encoders it depends on.
 *
 * @param {*} e - Message to hash. Strings are UTF-8 encoded (or taken byte-wise
 *   when options.encoding is "binary"); plain arrays are used as byte arrays;
 *   Buffer-like values are copied into a plain array; anything else is
 *   converted with toString().
 * @param {Object} [options] - Optional settings: `encoding` ("binary" to skip
 *   UTF-8 encoding of strings), `asBytes` (return an array of byte values),
 *   `asString` (return a binary string).
 * @returns {string|number[]} The digest: hex string by default.
 * @throws {Error} "Illegal argument ..." when e is undefined or null.
 */
function antitoken(e, options) {
  if (e === undefined || e === null) {
    throw new Error("Illegal argument " + e);
  }

  // Additive constants for the 64 steps, in execution order.
  var STEP_CONSTANTS = [
    -680876936, -389564586, 606105819, -1044525330,
    -176418897, 1200080426, -1473231341, -45705983,
    1770035416, -1958414417, -42063, -1990404162,
    1804603682, -40341101, -1502002290, 1236535329,
    -165796510, -1069501632, 643717713, -373897302,
    -701558691, 38016083, -660478335, -405537848,
    568446438, -1019803690, -187363961, 1163531501,
    -1444681467, -51403784, 1735328473, -1926607734,
    -378558, -2022574463, 1839030562, -35309556,
    -1530992060, 1272893353, -155497632, -1094730640,
    681279174, -358537222, -722521979, 76029189,
    -640364487, -421815835, 530742520, -995338651,
    -198630844, 1126891415, -1416354905, -57434055,
    1700485571, -1894986606, -1051523, -2054922799,
    1873313359, -30611744, -1560198380, 1309151649,
    -145523070, -1120210379, 718787259, -343485551
  ];

  // One entry per round of 16 steps: the bitwise mixing function, the four
  // rotation amounts that repeat within the round, and the stride/offset that
  // select which message word each step consumes ((stride * step + offset) % 16).
  var ROUNDS = [
    { mix: function (b, c, d) { return b & c | ~b & d; }, shifts: [7, 12, 17, 22], stride: 1, offset: 0 },
    { mix: function (b, c, d) { return b & d | c & ~d; }, shifts: [5, 9, 14, 20], stride: 5, offset: 1 },
    { mix: function (b, c, d) { return b ^ c ^ d; }, shifts: [4, 11, 16, 23], stride: 3, offset: 5 },
    { mix: function (b, c, d) { return c ^ (b | ~d); }, shifts: [6, 10, 15, 21], stride: 7, offset: 0 }
  ];

  // Keep the low 8 bits of every char code.
  function binaryStringToBytes(text) {
    var bytes = [];
    for (var k = 0; k < text.length; k++) {
      bytes.push(text.charCodeAt(k) & 255);
    }
    return bytes;
  }

  // Map every byte back to a single character.
  function bytesToBinaryString(bytes) {
    var chars = [];
    for (var k = 0; k < bytes.length; k++) {
      chars.push(String.fromCharCode(bytes[k]));
    }
    return chars.join("");
  }

  // unescape(encodeURIComponent(...)) yields one character per UTF-8 byte.
  function utf8StringToBytes(text) {
    return binaryStringToBytes(unescape(encodeURIComponent(text)));
  }

  // Stand-in for the bundle's module 30, which the article could not recover
  // (it was observed as null and calling it threw a TypeError).
  // NOTE(review): assumed to be a Buffer detector — confirm against the bundle.
  function isBufferLike(value) {
    return value != null && !!value.constructor &&
      typeof value.constructor.isBuffer === "function" &&
      value.constructor.isBuffer(value);
  }

  function rotateLeft(word, bits) {
    return word << bits | word >>> 32 - bits;
  }

  // Swap the byte order of a 32-bit number, or of every element of an array in place.
  function swapEndian(value) {
    if (value.constructor === Number) {
      return 16711935 & rotateLeft(value, 8) | 4278255360 & rotateLeft(value, 24);
    }
    for (var k = 0; k < value.length; k++) {
      value[k] = swapEndian(value[k]);
    }
    return value;
  }

  // Pack bytes into 32-bit words, most significant byte first.
  function bytesToWords(bytes) {
    var packed = [];
    for (var k = 0, bit = 0; k < bytes.length; k++, bit += 8) {
      packed[bit >>> 5] |= bytes[k] << 24 - bit % 32;
    }
    return packed;
  }

  // Unpack 32-bit words into bytes, most significant byte first.
  function wordsToBytes(packed) {
    var bytes = [];
    for (var bit = 0; bit < 32 * packed.length; bit += 8) {
      bytes.push(packed[bit >>> 5] >>> 24 - bit % 32 & 255);
    }
    return bytes;
  }

  function bytesToHex(bytes) {
    var digits = [];
    for (var k = 0; k < bytes.length; k++) {
      digits.push((bytes[k] >>> 4).toString(16));
      digits.push((bytes[k] & 15).toString(16));
    }
    return digits.join("");
  }

  // Normalise the input to a sequence of byte values.
  var message = e;
  if (message.constructor === String) {
    message = options && options.encoding === "binary"
      ? binaryStringToBytes(message)
      : utf8StringToBytes(message);
  } else if (isBufferLike(message)) {
    message = Array.prototype.slice.call(message, 0);
  } else if (!Array.isArray(message)) {
    message = message.toString();
  }

  var words = bytesToWords(message);
  var bitLength = 8 * message.length;

  // Convert every word to little-endian order.
  for (var w = 0; w < words.length; w++) {
    words[w] = 16711935 & (words[w] << 8 | words[w] >>> 24) | 4278255360 & (words[w] << 24 | words[w] >>> 8);
  }

  // Padding: a single 1 bit after the message, and the bit length in the
  // second-to-last word of the final 16-word block.
  words[bitLength >>> 5] |= 128 << bitLength % 32;
  words[14 + (bitLength + 64 >>> 9 << 4)] = bitLength;

  var a = 1732584193;
  var b = -271733879;
  var c = -1732584194;
  var d = 271733878;

  for (var base = 0; base < words.length; base += 16) {
    var startA = a;
    var startB = b;
    var startC = c;
    var startD = d;

    for (var step = 0; step < 64; step++) {
      var round = ROUNDS[step >>> 4];
      var position = step & 15;
      var word = words[base + (round.stride * position + round.offset) % 16];
      var shift = round.shifts[position & 3];
      // Holes left by the padding read as undefined; `>>> 0` turns them into 0.
      var sum = a + round.mix(b, c, d) + (word >>> 0) + STEP_CONSTANTS[step];
      var next = (sum << shift | sum >>> 32 - shift) + b;
      // Rotate the state registers: the freshly computed value becomes b.
      a = d;
      d = c;
      c = b;
      b = next;
    }

    a = a + startA >>> 0;
    b = b + startB >>> 0;
    c = c + startC >>> 0;
    d = d + startD >>> 0;
  }

  var digestBytes = wordsToBytes(swapEndian([a, b, c, d]));
  if (options && options.asBytes) {
    return digestBytes;
  }
  if (options && options.asString) {
    return bytesToBinaryString(digestBytes);
  }
  return bytesToHex(digestBytes);
}

console.log(antitoken(e));
OK~成功拿到了這個 antitoken。那麼是不是就可以去獲取酒店的評論信息啦?答案是否定的,因爲這個 antitoken 是全局變量。
這裏有個最簡單的方法去拿數據:打開你的 Fiddler,直接拿到 headers 以及 cookies,然後直接去請求即可,不需要你通過代碼去獲取 cookies。反正我通過 selenium 以及 requests 獲取到的 cookies 都是不全的,有一個關鍵信息始終是獲取不到的,那就是它。
這個 sessionID。現在你知道 session 與 cookie 的區別了嗎?
最後,這是我拿到的數據:
總結一下:
這次獲取同程的這個 antitoken,對於現在的我來說挺難的,關鍵是對於前端 JS 如何進行混淆,以及如何獲取到想要的函數都還是不懂呀。分析的思路主要是原作者的思路,我只能跟着他的腳步一步一步地做,這就是經驗上的差距吧。以後需要多加練習類似的具有反爬措施、且 token 是經過加密的網站。爬這種網站真的收穫挺大的。
還有,這篇分析 JS 的文章是我按照原作者的步驟一步一步地執行寫出來的。大家可以去知乎上看看原作者寫的,他的思路更加清晰明瞭。
傳送門: