知乎改版api接口之scrapy自動登錄

最近使用scrapy模擬登錄知乎,發現全部接口都發生變化了,包括驗證碼也發生了很大變化。經過抓包分析,記錄下改版後的知乎模擬登錄流程。廢話不多說,直接上代碼,親測有效。

# -*- coding: utf-8 -*-
from PIL import Image
from scrapy.exceptions import CloseSpider
import scrapy
import json
import base64


class ZhihuSpider(scrapy.Spider):
    """Log in to Zhihu through its ``/api/v3/oauth`` endpoints.

    Request chain (every request shares one cookie jar):

    1. ``start_requests``   -- GET the signup page so the session cookies are set.
    2. ``post_captchareq``  -- GET the captcha endpoint.
    3. ``deal_captchareq``  -- if ``show_captcha`` is truthy, PUT the captcha
       endpoint to obtain the image; otherwise sign in directly.
    4. ``get_captchaimg``   -- save and display the image, read the captcha text
       from the operator, POST it back to the captcha endpoint.
    5. ``post_captcha``     -- if the captcha was accepted, sign in.
    6. ``check_login``      -- HTTP 201 means the login succeeded.

    ``username`` and ``password`` are class attributes holding placeholders.
    Replace them, or override them per run with spider arguments, e.g.
    ``scrapy crawl zhihu -a username=+86... -a password=...``.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    # Let 401/403 responses reach the callbacks instead of being dropped by
    # Scrapy's HttpError middleware.
    handle_httpstatus_list = [401, 403]
    client_id = 'c3cef7c66a1843f8b3a9e6a1e3160e20'    # fixed value
    signature = 'b858d0c8b1f2e86c6cb0d93d4055963bcf1121ec'    # captured from browser traffic
    timestamp = '1519567594106'       # captured from browser traffic; pairs with `signature`
    # Placeholder credentials: "+86<your phone number>" and "<password>".
    username = '+86你的手機號碼'
    password = '密碼'
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/signup?next=%2F",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36Name",
        # Built from client_id so the two values cannot drift apart.
        "authorization": "oauth " + client_id,
    }

    # Endpoints used by the login flow.
    _SIGNUP_URL = 'https://www.zhihu.com/signup?next=%2F'
    _CAPTCHA_URL = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
    _SIGN_IN_URL = 'https://www.zhihu.com/api/v3/oauth/sign_in'
    # Local file the captcha image is written to before being displayed.
    _CAPTCHA_FILE = 'zhihucaptcha.jpg'

    def parse(self, response):
        """Unused default callback; every request sets its own callback."""
        pass

    def start_requests(self):
        """Fetch the signup page so the server sets the session cookies.

        :return: a one-element list with the initial request.
        """
        return [scrapy.Request(url=self._SIGNUP_URL,
                               headers=self.headers,
                               method="GET",
                               meta={'cookiejar': 1},
                               callback=self.post_captchareq,
                               dont_filter=True,
                               )]

    def post_captchareq(self, response):
        """Ask the captcha endpoint whether a captcha is required.

        :param response: response of the signup page request.
        :return: a one-element list with the captcha status request.
        """
        return [scrapy.Request(
            url=self._CAPTCHA_URL,
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            dont_filter=True,
            callback=self.deal_captchareq,
        )]

    def deal_captchareq(self, response):
        """Decide whether a captcha must be solved before signing in.

        :param response: JSON response of the captcha status request.
        :return: a one-element list with either the captcha image request
            (when ``show_captcha`` is truthy) or the sign-in request.
        :raises CloseSpider: if the response body is not a JSON object.
        """
        json_res = self._parse_json(response)
        post_data = {
            "client_id": self.client_id,
            "grant_type": "password",
            "timestamp": self.timestamp,
            "source": "com.zhihu.web",
            "signature": self.signature,
            "username": self.username,
            "password": self.password,
            "captcha": '',
            "lang": "en",
            "ref_source": "homepage",
            "utm_source": ""
        }
        if json_res.get("show_captcha", None):
            return [
                scrapy.Request(
                    url=self._CAPTCHA_URL,
                    headers=self.headers,
                    method='PUT',
                    # Carry the form along so the captcha text can be added later.
                    meta={'cookiejar': response.meta['cookiejar'],
                          'post_data': post_data},
                    callback=self.get_captchaimg,
                    dont_filter=True,
                )
            ]
        return [self._sign_in_request(response, post_data)]

    def get_captchaimg(self, response):
        """Show the captcha image and submit the text typed by the operator.

        The base64 image from the response is written to a local file and
        opened with PIL; the captcha text is then read from standard input.

        :param response: JSON response carrying ``img_base64``.
        :return: a one-element list with the captcha submission request.
        :raises CloseSpider: if the image cannot be decoded, saved, shown, or
            the captcha text cannot be read.
        """
        post_data = response.meta['post_data']
        try:
            json_img = json.loads(response.text)
            img_stream = base64.b64decode(json_img["img_base64"].encode('utf-8'))
            with open(self._CAPTCHA_FILE, 'wb') as f:
                f.write(img_stream)
            # The context manager releases the image file even when showing
            # the image or reading the input fails.
            with Image.open(self._CAPTCHA_FILE) as img:
                img.show()
                input_captcha = input("請輸入圖中驗證碼:").strip()
        except Exception as e:
            # Boundary handler: any failure here makes the login impossible,
            # so stop the spider and keep the original cause attached.
            raise CloseSpider('獲取驗證碼發生錯誤:{error}'.format(error=e)) from e

        post_data['captcha'] = input_captcha
        post_code = {
            "input_text": input_captcha,
        }
        return [
            scrapy.FormRequest(
                url=self._CAPTCHA_URL,
                formdata=post_code,
                headers=self.headers,
                method='POST',
                meta={'cookiejar': response.meta['cookiejar'],
                      'post_data': post_data},
                callback=self.post_captcha,
                dont_filter=True,
            )
        ]

    def post_captcha(self, response):
        """Send the credentials once the captcha has been accepted.

        :param response: JSON response of the captcha submission.
        :return: a one-element list with the sign-in request.
        :raises CloseSpider: if the response is not a JSON object or does not
            report ``success``.
        """
        post_data = response.meta.get('post_data')
        if not self._parse_json(response).get('success'):
            raise CloseSpider('驗證碼不正確')
        return [self._sign_in_request(response, post_data)]

    def check_login(self, response):
        """Check the sign-in response; HTTP 201 is treated as success.

        :param response: response of the sign-in request.
        :raises CloseSpider: if the status code is not 201.
        """
        self.logger.debug('登錄響應 status=%s body=%s',
                          response.status, response.text)
        if response.status == 201:
            self.logger.info("登錄成功!")
        else:
            raise CloseSpider('登錄信息有誤!')

    def _sign_in_request(self, response, post_data):
        """Build the sign-in POST, reusing the cookie jar of ``response``."""
        return scrapy.FormRequest(
            url=self._SIGN_IN_URL,
            formdata=post_data,
            method="POST",
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.check_login,
            dont_filter=True,
        )

    def _parse_json(self, response):
        """Decode the response body as a JSON object or stop the spider."""
        try:
            payload = json.loads(response.text)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass.
            raise CloseSpider('響應不是合法的JSON:{error}'.format(error=e)) from e
        if not isinstance(payload, dict):
            raise CloseSpider('響應不是JSON對象:{url}'.format(url=response.url))
        return payload

  其中,其它參數如client_id、oauth等都是固定的;signature與timestamp則隨時間戳變化,signature是用於驗證合法用戶的token,實質上是由一段客戶端的js運行生成的。這裏爲了方便,直接通過抓包獲取某個固定時間戳對應的signature。

 

 

 先在pc端輸入錯誤帳戶信息,抓包獲取timestamp與signature,再替換代碼中對應的值即可。

相關文章
相關標籤/搜索