最近使用 Scrapy 模擬登錄知乎,發現所有接口都發生了變化,驗證碼也有很大改動。經過抓包分析,在此記錄改版後的知乎模擬登錄流程。廢話不多說,直接上代碼(親測有效):
# -*- coding: utf-8 -*-
import base64
import json

import scrapy
from PIL import Image
from scrapy.exceptions import CloseSpider


class ZhihuSpider(scrapy.Spider):
    """Log in to Zhihu through its v3 OAuth endpoints, asking the user for the captcha.

    Request flow:
        1. GET the sign-up page so the cookie jar receives the session cookies.
        2. GET the captcha endpoint to learn whether a captcha is required.
        3. If required: PUT to fetch the captcha image, show it, read the
           answer from stdin and POST it back for validation.
        4. POST the credentials to the sign-in endpoint.

    ``username``, ``password``, ``signature`` and ``timestamp`` are plain class
    attributes, so they can be overridden per run with spider arguments
    (``scrapy crawl zhihu -a username=... -a password=...``).
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    # Let 401/403 responses reach the callbacks instead of being dropped
    # by the HTTP error middleware.
    handle_httpstatus_list = [401, 403]

    client_id = 'c3cef7c66a1843f8b3a9e6a1e3160e20'  # fixed value
    # signature and timestamp belong together; both were copied from a
    # captured browser login request.
    signature = 'b858d0c8b1f2e86c6cb0d93d4055963bcf1121ec'
    timestamp = '1519567594106'

    # Placeholder credentials; replace them or override with ``-a``.
    username = '+86你的手機號碼'
    password = '密碼'

    signup_url = 'https://www.zhihu.com/signup?next=%2F'
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
    sign_in_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
    captcha_path = 'zhihucaptcha.jpg'

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/signup?next=%2F",
        # NOTE(review): the trailing "Name" looks like a copy-paste artifact;
        # left untouched until confirmed.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36Name",
        "authorization": "oauth " + client_id,
    }

    def parse(self, response):
        """Default callback; intentionally does nothing."""
        pass

    def start_requests(self):
        """Request the sign-up page so the cookie jar gets its cookies set.

        :return: a one-element list with the initial request
        """
        return [
            scrapy.Request(
                url=self.signup_url,
                headers=self.headers,
                method="GET",
                meta={'cookiejar': 1},
                callback=self.post_captchareq,
                dont_filter=True,
            )
        ]

    def post_captchareq(self, response):
        """Ask the captcha endpoint whether a captcha is required.

        :param response: response of the sign-up page request
        :return: a one-element list with the captcha status request
        """
        return [
            scrapy.Request(
                url=self.captcha_url,
                headers=self.headers,
                meta={'cookiejar': response.meta['cookiejar']},
                dont_filter=True,
                callback=self.deal_captchareq,
            )
        ]

    def deal_captchareq(self, response):
        """Decide whether to fetch a captcha or to sign in directly.

        :param response: JSON response of the captcha status request
        :return: a one-element list with either the captcha image request
            (when ``show_captcha`` is truthy) or the sign-in request
        :raises CloseSpider: if the response body is not a JSON object
        """
        json_res = self._parse_json(response)
        post_data = {
            "client_id": self.client_id,
            "grant_type": "password",
            "timestamp": self.timestamp,
            "source": "com.zhihu.web",
            "signature": self.signature,
            "username": self.username,
            "password": self.password,
            "captcha": '',
            "lang": "en",
            "ref_source": "homepage",
            "utm_source": "",
        }

        if json_res.get("show_captcha"):
            # Carry the form along in meta; the captcha answer is filled in
            # once the user has typed it.
            return [
                scrapy.Request(
                    url=self.captcha_url,
                    headers=self.headers,
                    method='PUT',
                    meta={'cookiejar': response.meta['cookiejar'], 'post_data': post_data},
                    callback=self.get_captchaimg,
                )
            ]

        return [
            scrapy.FormRequest(
                url=self.sign_in_url,
                formdata=post_data,
                method="POST",
                headers=self.headers,
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.check_login,
                dont_filter=True,
            )
        ]

    def get_captchaimg(self, response):
        """Save and display the captcha image, then submit the typed answer.

        :param response: JSON response carrying the image in ``img_base64``
        :return: a one-element list with the captcha validation request
        :raises CloseSpider: if the image cannot be decoded, saved, shown,
            or the answer cannot be read
        """
        post_data = response.meta['post_data']
        # Any failure while obtaining the captcha makes the login impossible,
        # so everything is converted into CloseSpider with the cause chained.
        try:
            json_img = json.loads(response.text)
            img_stream = base64.b64decode(json_img["img_base64"].encode('utf-8'))
            with open(self.captcha_path, 'wb') as f:
                f.write(img_stream)
            # The context manager closes the image even if input() fails.
            with Image.open(self.captcha_path) as img:
                img.show()
                input_captcha = input("請輸入圖中驗證碼:").strip()
        except Exception as e:
            raise CloseSpider('獲取驗證碼發生錯誤:{error}'.format(error=e)) from e

        post_data['captcha'] = input_captcha
        post_code = {
            "input_text": input_captcha,
        }
        return [
            scrapy.FormRequest(
                url=self.captcha_url,
                formdata=post_code,
                headers=self.headers,
                method='POST',
                meta={'cookiejar': response.meta['cookiejar'], 'post_data': post_data},
                callback=self.post_captcha,
                dont_filter=True,
            )
        ]

    def post_captcha(self, response):
        """Send the credentials once the captcha answer has been accepted.

        :param response: JSON response of the captcha validation request
        :return: a one-element list with the sign-in request
        :raises CloseSpider: if the captcha was rejected or the response body
            is not a JSON object
        """
        post_data = response.meta.get('post_data')
        if not self._parse_json(response).get('success'):
            raise CloseSpider('驗證碼不正確')

        return [
            scrapy.FormRequest(
                url=self.sign_in_url,
                formdata=post_data,
                headers=self.headers,
                method='POST',
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.check_login,
                dont_filter=True,
            )
        ]

    def check_login(self, response):
        """Check the sign-in result; HTTP 201 is treated as success.

        :param response: response of the sign-in request
        :raises CloseSpider: if the status code is anything other than 201
        """
        self.logger.debug("sign-in response (%s): %s", response.status, response.text)
        if response.status != 201:
            raise CloseSpider('登錄信息有誤!')
        self.logger.info("登錄成功!")

    @staticmethod
    def _parse_json(response):
        """Decode the response body as a JSON object, or stop the spider.

        :param response: response whose ``text`` should contain a JSON object
        :return: the decoded dict
        :raises CloseSpider: if the body is not valid JSON or not an object
        """
        try:
            payload = json.loads(response.text)
        except ValueError as err:
            raise CloseSpider('響應不是有效的JSON:{error}'.format(error=err)) from err
        if not isinstance(payload, dict):
            raise CloseSpider('響應不是JSON對象')
        return payload
其中,client_id、oauth 等其它參數都是固定的;signature 與 timestamp 則隨時間戳變化。signature 是用於驗證合法用戶的 token,實質上是由客戶端的一段 JS 運行生成的。這裏爲了方便,直接通過抓包獲取某個固定時間戳所對應的 signature。
先在 PC 端輸入錯誤的帳戶信息,抓包獲取 timestamp 與 signature,再替換代碼中對應的值即可。