js代碼已失效。
近來對爬蟲比較感興趣,加上本渣的英文水平就是個渣渣,因此寫了百度翻譯網頁版的爬蟲。
本人技術有限,大神勿噴,如有不當之處歡迎指出。
爬蟲環境爲 python3.x,如想運行在 python2.7,請自行修改。
需要安裝第三方的套件包:requests、pyexecjs(在代碼中以 execjs 導入)
建議直接按如下方式安裝:
pip3 install requests
pip3 install pyexecjs
廢話不多說,貼代碼。
#!/usr/bin/env python3
# encoding: utf-8
# Source adapted from code by HONGQUAN on CSDN.
import json
import re
import urllib.parse  # also binds ``urllib``; makes sure the ``parse`` submodule is loaded

import execjs
import requests


class Baidufanyi(object):
    """Scraper-style client for the Baidu Translate web page (fanyi.baidu.com).

    Constructing an instance downloads the home page and extracts the ``gtk``
    and ``token`` values embedded in it.  ``sign`` runs the JavaScript hash
    stored in ``signCode`` (through ``execjs``) to build the ``sign`` query
    parameter, and ``fanyi`` sends the actual request to ``v2transapi``.
    """

    # JavaScript source evaluated by ``sign``; it exposes ``hash(query, gtk)``.
    signCode = 'function a(r,o){for(var t=0;t<o.length-2;t+=3){var a=o.charAt(t+2);a=a>="a"?a.charCodeAt(0)-87:Number(a),a="+"===o.charAt(t+1)?r>>>a:r<<a,r="+"===o.charAt(t)?r+a&4294967295:r^a}return r}var C=null;var hash=function(r,_gtk){var o=r.length;o>30&&(r=""+r.substr(0,10)+r.substr(Math.floor(o/2)-5,10)+r.substr(-10,10));var t=void 0,t=null!==C?C:(C=_gtk||"")||"";for(var e=t.split("."),h=Number(e[0])||0,i=Number(e[1])||0,d=[],f=0,g=0;g<r.length;g++){var m=r.charCodeAt(g);128>m?d[f++]=m:(2048>m?d[f++]=m>>6|192:(55296===(64512&m)&&g+1<r.length&&56320===(64512&r.charCodeAt(g+1))?(m=65536+((1023&m)<<10)+(1023&r.charCodeAt(++g)),d[f++]=m>>18|240,d[f++]=m>>12&63|128):d[f++]=m>>12|224,d[f++]=m>>6&63|128),d[f++]=63&m|128)}for(var S=h,u="+-a^+6",l="+-3^+b+-f",s=0;s<d.length;s++)S+=d[s],S=a(S,u);return S=a(S,l),S^=i,0>S&&(S=(2147483647&S)+2147483648),S%=1e6,S.toString()+"."+(S^h)}'

    def __init__(self):
        """Fetch the home page and cache the ``gtk`` / ``token`` found in it.

        ``gtk`` and ``token`` stay ``None`` when the page does not contain
        them; a message is printed in that case instead of raising.
        Network errors from ``requests.get`` propagate to the caller.
        """
        self.gtk = None
        self.token = None
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
            # NOTE(review): the value starts and ends with a literal single
            # quote, which looks like a copy/paste artefact -- confirm against
            # a real browser request before removing them.
            "Cookie": "'locale=zh;BAIDUID=DB7712AF47C959AC2BF32ECFF8BD8F71:FG=1; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1;'"
        }
        self.html = requests.get('http://fanyi.baidu.com', headers=self.header)
        self.html.encoding = 'utf-8'

        page = self.html.text
        # Both values are embedded in the page's inline JavaScript.
        self.gtk = self._first_group(r"window\.gtk = '(.*?)';", page)
        self.token = self._first_group(r"token: '(.*?)'", page)
        if self.gtk is None or self.token is None:
            print('沒有獲取到「gtk」,或者「token」')

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, else None."""
        found = re.search(pattern, text, re.S)
        return found.group(1) if found else None

    def shuru(self, source):
        """Pick the translation direction from the first character of *source*.

        Returns ``('zh', 'en')`` when the first character lies in
        U+4E00..U+9FFF, otherwise ``('en', 'zh')``.  An empty string is
        treated as non-Chinese instead of raising ``IndexError``.
        """
        # Slicing (not indexing) keeps the comparison safe for ''.
        if '\u4e00' <= source[:1] <= '\u9fff':
            return 'zh', 'en'
        return 'en', 'zh'

    def sign(self, source):
        """Return the ``sign`` value for *source* computed by the JS ``hash``."""
        return execjs.compile(self.signCode).call('hash', source, self.gtk)

    def fanyi(self, fromLanguage, toLanguage, source, sign):
        """Request the translation of *source* and return the raw response.

        The return value is the ``requests`` response object; decoding the
        JSON body is left to the caller.
        """
        v2transapi = 'http://fanyi.baidu.com/v2transapi?from=%s&to=%s&query=%s&transtype=translang&simple_means_flag=3&sign=%s&token=%s' % (
            fromLanguage, toLanguage, urllib.parse.quote(source), sign, self.token)
        translate_result = requests.get(v2transapi, headers=self.header)
        return translate_result
if __name__ == '__main__':
    # Interactive loop: read a line, translate it, print the result.
    Baidu = Baidufanyi()
    print('請輸入要翻譯的str(按 123 退出翻譯):')
    while True:
        source = input()
        if source == '123':
            break
        if not source:
            # Nothing to translate on an empty line.
            continue
        sign = Baidu.sign(source)
        fromLanguage, toLanguage = Baidu.shuru(source)
        response = Baidu.fanyi(fromLanguage, toLanguage, source, sign)
        try:
            print(json.loads(response.text)['trans_result']['data'][0]['dst'])
        except (KeyError, IndexError, TypeError, ValueError):
            # The endpoint answered with something other than a translation
            # (e.g. an error payload); show it instead of crashing the loop.
            print('翻譯失敗:', response.text)