requests_模擬登陸知乎

時間 2019-11-14

原文原文鏈接

如何登陸知乎？

　　首先要分析：進行登陸驗證時，知乎服務器需要我們提交哪些數據，以及提交到哪個地址。先進行幾次登陸嘗試，通過瀏覽器開發者工具的 Network 面板查看數據流可知，模擬登陸知乎需要提供5個數據，分別是 _xsrf、password、captcha_type、captcha、phone_num（這裏使用手機號碼進行驗證登陸），提交地址爲 https://www.zhihu.com/login/phone_num 。接下來開始獲取這些數據，其中圖片驗證碼最難處理，通過第三方插件 zheye 對驗證碼進行解析，就可以解決這個問題。

如何做？

　　1. 請求知乎登陸頁面，並解析得到_xsrf值

　　2. 請求驗證碼url得到驗證碼圖片，交給zheye解析出結果，並拼接出知乎想要的captcha值

　　3. 從知乎驗證提交表單中獲取captcha_type的值，並獲取表單提交地址

　　4. phone_num、password 對應用戶名和密碼

　　5. 提交數據，獲取返回的response，通過判斷response數據得知是否登陸成功

#!/usr/bin/python3

__author__ = 'beimenchuixue'
__blog__ = 'http://www.cnblogs.com/2bjiujiu/'

import requests
import re
import json
from time import sleep
from random import choice, randint
from zhihu_yanzheng.zheye import zheye

# Request headers sent with every call made by this script.
headers = {
    'pragma': 'no-cache',
    'Referer': 'https://www.zhihu.com/signin',
    # Adjacent literals are concatenated into a single user-agent string.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.115 Safari/537.36'
    ),
}

# One module-wide session shared by every request below.
session = requests.session()


def random_sleep():
    """Pause for a random amount of time to imitate a human visitor.

    A coin flip decides whether to pause at all. After every pause the coin
    is flipped again, so the total delay is a random number of naps, each
    lasting between 0.1 and 0.8 seconds.
    """
    nap_lengths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    while True:
        keep_waiting = choice([0, 1])
        if not keep_waiting:
            return
        sleep(choice(nap_lengths))


def get_xsrf(max_attempts=5):
    """Fetch the Zhihu sign-in page and return its ``_xsrf`` token.

    The page is requested through the module-level ``session`` with the
    module-level ``headers``. If the page does not contain a non-empty
    ``_xsrf`` form field, the request is repeated.

    :param max_attempts: how many times the page is requested before giving up.
    :return: the non-empty ``_xsrf`` value extracted from the page.
    :raises requests.HTTPError: if the page responds with an error status.
    :raises ValueError: if no non-empty ``_xsrf`` value was found after
        ``max_attempts`` requests.
    """
    zhihu_url = 'https://www.zhihu.com/signin'
    for _ in range(max_attempts):
        random_sleep()

        resp = session.get(zhihu_url, headers=headers)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        resp.raise_for_status()
        # Decode the body using the encoding detected from its content.
        resp.encoding = resp.apparent_encoding
        # Line breaks and tabs are stripped so that "." can span the whole page.
        page = re.sub(r'[\r\t\n]', '', resp.text)
        found = re.match(r'.*name="_xsrf" value="(.*?)"', page)
        # re.match returns None when the field is absent; treat that like an
        # empty token and retry instead of crashing on ``None.group``.
        if found and found.group(1):
            return found.group(1)
    raise ValueError(
        'no _xsrf token found on %s after %d attempt(s)' % (zhihu_url, max_attempts)
    )


def get_img_data():
    """Download a login captcha and return the recognised click positions.

    The captcha is fetched through the module-level ``session``, written to
    ``zhihu_img.jpg`` and passed to ``zheye`` for recognition. The download
    is repeated until the recogniser reports at least one position.

    :return: a non-empty list of ``(x, y)`` tuples, ordered by ``x`` and
        scaled to the 200*44 image size Zhihu expects.
    :raises requests.HTTPError: if the captcha request responds with an
        error status.
    """
    # Build the recogniser once; it does not depend on the individual image.
    recognizer = zheye()
    while True:
        random_int = str(randint(1, pow(10, 13)))
        img_url = 'https://www.zhihu.com/captcha.gif?r=%s&type=login&lang=cn' % random_int

        random_sleep()
        img_resp = session.get(img_url, headers=headers)
        # Do not save an error page to disk as if it were the captcha image.
        img_resp.raise_for_status()
        with open('zhihu_img.jpg', 'wb') as f:
            f.write(img_resp.content)

        # The recogniser reports each position as (y, x), e.g.
        # [(48.647850377664284, 315.97586850515023), (49.944977855563351, 146.27730894630022)]
        positions = recognizer.Recognize('zhihu_img.jpg')
        # Order the positions by their x coordinate (second element).
        positions.sort(key=lambda pos: pos[1])
        # Zhihu wants (x, y) order, and zheye works on a 400*88 image while
        # Zhihu expects 200*44 coordinates, so every value is halved.
        img_data = [(x / 2, y / 2) for y, x in positions]
        # An empty result means recognition failed: fetch a new captcha.
        if img_data:
            return img_data


def get_captcha_data(img_data):
    """Format click positions into the ``captcha`` value Zhihu expects.

    The result looks like
    ``{"img_size":[200,44],"input_points":[[120.38,34.00],[160.38,36.00]]}``.
    Any number of points is supported; an empty sequence produces an empty
    ``input_points`` list.

    :param img_data: sequence of ``(x, y)`` coordinate pairs.
    :return: the captcha string with every coordinate rendered to two decimals.
    """
    # One "[x,y]" fragment per point, joined with commas, so the output does
    # not depend on a hard-coded number of points.
    points = ','.join('[%.2f,%.2f]' % (point[0], point[1]) for point in img_data)
    return '{"img_size":[200,44],"input_points":[%s]}' % points


def get_form_data(phone_num='phone_num', password='password'):
    """Build the form fields submitted to the Zhihu phone-number login.

    Fetches a fresh ``_xsrf`` token, then a recognised captcha, and combines
    them with the credentials.

    :param phone_num: value sent as the ``phone_num`` field; the default is
        the original placeholder string.
    :param password: value sent as the ``password`` field; the default is
        the original placeholder string.
    :return: dict with the keys ``_xsrf``, ``password``, ``captcha_type``,
        ``captcha`` and ``phone_num``.
    """
    _xsrf = get_xsrf()
    img_data = get_img_data()
    captcha = get_captcha_data(img_data)
    return {
        '_xsrf': _xsrf,
        'password': password,
        'captcha_type': 'cn',
        'captcha': captcha,
        'phone_num': phone_num,
    }


def go_login(post_url):
    """Submit the login form and return the decoded JSON reply.

    :param post_url: address the login form is posted to.
    :return: the response body parsed with ``json.loads``.
    """
    payload = get_form_data()

    random_sleep()
    # Redirects are not followed, so the reply parsed is the login endpoint's own.
    login_resp = session.post(post_url, headers=headers, data=payload, allow_redirects=False)
    return json.loads(login_resp.text)


if __name__ == '__main__':
    # Script entry point: attempt a phone-number login and print the reply.
    post_url = 'https://www.zhihu.com/login/phone_num'
    print(go_login(post_url))

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。