# 參考 https://github.com/zkqiang/Zhihu-Login
# -*- coding: utf-8 -*-
"""Scrapy spider that signs in to Zhihu through its OAuth API, then crawls."""
import base64
import hashlib
import hmac
import json
import re
import time

import matplotlib.pyplot as plt
import scrapy
from PIL import Image


class ZhihuSpider(scrapy.Spider):
    """Sign in to Zhihu (solving a captcha interactively if asked), then crawl.

    Request flow:
    ``start_requests`` -> ``_is_need_captcha`` -> (``_get_captcha`` ->
    ``captcha_login``)? -> ``check_login`` -> ``parse``.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    login_url = 'https://www.zhihu.com/signup'
    login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'

    # Base form fields for the sign-in POST. Treated as read-only: the
    # per-request fields (captcha, timestamp, signature) are added to a copy.
    login_data = {
        'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
        'grant_type': 'password',
        'source': 'com.zhihu.web',
        'username': "+86xxxxxx",
        'password': "xxxxxx",
        # Passing 'cn' selects the upside-down Chinese character captcha.
        'lang': 'en',
        'ref_source': 'homepage'
    }

    headers = {
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.100 Safari/537.36'
    }

    def start_requests(self):
        """Ask the captcha endpoint whether a captcha is needed for this login.

        Any ``lang`` other than ``'cn'`` falls back to the ``'en'`` captcha.
        """
        lang = 'cn' if self.login_data["lang"] == 'cn' else 'en'
        api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=' + lang
        yield scrapy.Request(url=api,
                             headers=self.headers,
                             callback=self._is_need_captcha)

    def _is_need_captcha(self, response):
        """Fetch the captcha image if one is required, otherwise log in directly.

        :param response: response of the GET on the captcha endpoint
        """
        # NOTE(review): this matches the text 'true' anywhere in the body;
        # presumably the body is a small JSON flag -- confirm before tightening.
        show_captcha = re.search(r'true', response.text)
        if show_captcha:
            # A PUT on the same URL returns the captcha image (see _get_captcha).
            yield scrapy.Request(url=response.url,
                                 headers=self.headers,
                                 method="PUT",
                                 callback=self._get_captcha)
        else:
            yield self._build_login_request("")

    def _get_captcha(self, response):
        """Show the captcha to the user, read the answer and submit it.

        The image is written to ``./captcha.jpg``. In ``'cn'`` mode the user
        clicks the upside-down characters in a matplotlib window; otherwise
        the image is opened in a viewer and the answer is typed on stdin.

        :param response: response of the PUT on the captcha endpoint; its JSON
            body carries the image under ``img_base64``
        """
        json_data = json.loads(response.text)
        # Strip line breaks from the payload. The original only removed the
        # literal two-character sequence backslash + 'n' (raw string), which
        # never matches a real newline; remove both forms.
        img_base64 = (json_data['img_base64']
                      .replace(r'\n', '')
                      .replace('\n', ''))
        with open('./captcha.jpg', 'wb') as f:
            f.write(base64.b64decode(img_base64))
        img = Image.open('./captcha.jpg')

        if self.login_data["lang"] == 'cn':
            plt.imshow(img)
            print('點擊全部倒立的漢字,按回車提交')
            points = plt.ginput(7)
            # Click coordinates are halved before submission -- presumably the
            # displayed image is twice the declared 200x44 size; TODO confirm.
            capt = json.dumps({'img_size': [200, 44],
                               'input_points': [[x / 2, y / 2]
                                                for x, y in points]})
        else:
            img.show()
            capt = input('請輸入圖片裏的驗證碼:')

        # The answer must first be POSTed to the captcha endpoint; the actual
        # sign-in happens afterwards in captcha_login.
        yield scrapy.FormRequest(url=response.url,
                                 formdata={'input_text': capt},
                                 headers=self.headers,
                                 callback=self.captcha_login,
                                 meta={"captcha": capt})

    def captcha_login(self, response):
        """Sign in using the captcha answer carried in ``response.meta``.

        :param response: response of the captcha-answer POST
        """
        yield self._build_login_request(response.meta['captcha'])

    def check_login(self, response):
        """Request the login page after the sign-in POST and pass it to parse.

        The sign-in response itself is not inspected here.
        """
        yield scrapy.Request(
            url=self.login_url,
            headers=self.headers,
            callback=self.parse
        )

    def _build_login_request(self, captcha):
        """Build the sign-in POST for the given captcha answer.

        Works on a copy of ``login_data`` so the class-level dict, which is
        shared by every instance, is never mutated.

        :param captcha: captcha answer, or ``""`` when none was required
        :return: the ``scrapy.FormRequest`` targeting ``login_api``
        """
        timestamp = str(int(time.time() * 1000))
        formdata = dict(
            self.login_data,
            captcha=captcha,
            timestamp=timestamp,
            signature=self._get_signature(timestamp),
        )
        return scrapy.FormRequest(
            url=self.login_api,
            formdata=formdata,
            headers=self.headers,
            callback=self.check_login
        )

    def _get_signature(self, timestamp):
        """Return the request signature computed with HMAC-SHA1.

        The signed message is just a few fixed strings plus the timestamp:
        ``grant_type + client_id + source + timestamp``.

        :param timestamp: timestamp string (milliseconds since the epoch)
        :return: hexadecimal signature
        """
        ha = hmac.new(b'd1b964811afb40118a12068ff74a12f4',
                      digestmod=hashlib.sha1)
        message = (self.login_data['grant_type']
                   + self.login_data['client_id']
                   + self.login_data['source']
                   + timestamp)
        ha.update(message.encode('utf-8'))
        return ha.hexdigest()

    def parse(self, response):
        """Print the body of the page fetched after login."""
        print(response.text)