Simulating a Zhihu login (requests and scrapy)

1. requests

  Logging in to Zhihu requires submitting the following information to the server:

    ① headers

    ② _xsrf

    ③ captcha

  The _xsrf token and the captcha (verification code) have to be obtained by parsing the page.

  The captcha must be requested through the same session, so that the _xsrf token and the captcha stay consistent (a session preserves cookies, which guarantees that the submitted data match); a short sketch of this idea follows.
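  As a minimal illustration (not part of the script below, and the URLs are only indicative): a requests.Session sends every request with one shared cookie jar, whereas bare requests.get() calls do not, so a captcha fetched outside the session would belong to a different server-side session than the _xsrf token.

import requests

s = requests.Session()
s.get("https://www.zhihu.com/")             # sets cookies; the page contains the _xsrf token
s.get("https://www.zhihu.com/captcha.gif")  # sent with the same cookies, so it matches the _xsrf

# Two bare calls share no cookies, so the captcha would not match the token:
requests.get("https://www.zhihu.com/")
requests.get("https://www.zhihu.com/captcha.gif")

  The full login script is below: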

import re
import time
import os.path
import requests

try:
    import cookielib
except:
    import http.cookiejar as cookielib

from PIL import Image

session = requests.session()
# After a successful login the cookies are saved to this file, so later runs
# can load them directly instead of submitting the account and password again.
session.cookies = cookielib.LWPCookieJar(filename="cookies")
try:
    session.cookies.load(ignore_discard=True)
except:
    print("cookies could not be loaded")

agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
# agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'
# agent = "Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/57.0"
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": agent,
}


def get_xsrf():
    # Parse the _xsrf token out of the home page.
    response = session.get("https://www.zhihu.com/", headers=headers)
    match_obj = re.search('name="_xsrf" value="(.*)"', response.text)
    print(response.text)  # debug output: dump the fetched page
    if match_obj:
        return match_obj.group(1)
    else:
        print("error: _xsrf not found")


def get_captcha():
    # Fetch the captcha image through the same session so it matches the _xsrf token.
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except:
        print('Please open {} and enter the captcha manually'.format(os.path.abspath('captcha.jpg')))
    captcha = input("please input the captcha\n")
    return captcha


def is_login():
    # Verify the login status by requesting the personal settings page.
    check_url = "https://www.zhihu.com/settings/profile"
    response = session.get(check_url, headers=headers, allow_redirects=False)
    if response.status_code != 200:
        return False
    else:
        return True


def login(account, password):
    # Log in to Zhihu.
    _xsrf = get_xsrf()
    if '@' in account:
        print("logging in with an email address")
        post_url = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": _xsrf,
            "password": password,
            "email": account,
        }
    else:
        if re.match(r'^1\d{10}', account):
            print("logging in with a phone number")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": _xsrf,
            "password": password,
            "phone_num": account,
        }
    # First try to log in without a captcha.
    response = session.post(post_url, data=post_data, headers=headers)
    login_code = response.json()

    if login_code['r'] == 1:
        print("login without a captcha failed")
        # When the captcha-less attempt fails, fetch a captcha and log in again.
        post_data["captcha"] = get_captcha()
        response = session.post(post_url, data=post_data, headers=headers)
        login_code = response.json()
        print(login_code['msg'])

    session.cookies.save()


if __name__ == '__main__':
    if is_login():
        print("already logged in!")
    else:
        # fill in your own account (email or phone number) and password here
        login("account", "password")
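  Once the login succeeds (or the saved cookies load on a later run), the same session can request pages that require authentication; a minimal example reusing the names defined above:

# Assumes the script above has run and `session` holds valid cookies.
response = session.get("https://www.zhihu.com/settings/profile", headers=headers)
print(response.status_code)  # 200 while the cookies are still valid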

2. scrapy

  If you call the get_captcha() function shown above directly from scrapy to obtain the captcha and then submit it, the login will not succeed. The reason is data inconsistency: the _xsrf token and the captcha submitted to the server do not match, because they come from different cookie sessions.

  scrapy keeps cookies by default, so the required information can be carried across two chained requests and stored in the default cookie jar. The code is as follows:

  

# -*- coding: utf-8 -*-
import re
import json
import datetime

try:
    import urlparse as parse
except:
    from urllib import parse

import scrapy


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def start_requests(self):
        # Request the sign-in page first; its response is handled by login().
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]

    def login(self, response):
        # Extract the _xsrf token from the sign-in page.
        response_text = response.text
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
        xsrf = ''
        if match_obj:
            xsrf = match_obj.group(1)

        if xsrf:
            post_url = "https://www.zhihu.com/login/phone_num"
            post_data = {
                "_xsrf": xsrf,
                "phone_num": "",  # fill in your phone number
                "password": "",   # fill in your password
                "captcha": ""
            }

            import time
            t = str(int(time.time() * 1000))
            captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
            # Fetch the captcha within the same (default) cookie session and
            # carry the form data along in meta.
            yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data},
                                 callback=self.login_after_captcha)

    def login_after_captcha(self, response):
        # Save and display the captcha image, then read it from manual input.
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)

        from PIL import Image
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except:
            pass

        captcha = input("Enter the captcha\n>")

        post_data = response.meta.get("post_data", {})
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data["captcha"] = captcha
        return [scrapy.FormRequest(
            url=post_url,
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]

    def check_login(self, response):
        # Inspect the JSON returned by the server to see whether the login succeeded.
        text_json = json.loads(response.text)
        # "登录成功" is the success message returned by the login endpoint.
        if "msg" in text_json and text_json["msg"] == "登录成功":
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
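  The requests yielded in check_login carry no explicit callback, so scrapy delivers their responses to the spider's default parse method, which the snippet above does not show. A minimal placeholder (the link extraction here is only a hypothetical example) could look like this:

    def parse(self, response):
        # Default callback for the requests yielded in check_login; these
        # responses are already fetched with the logged-in cookies.
        # Hypothetical example: follow the links found on the start page.
        for href in response.css("a::attr(href)").extract():
            yield scrapy.Request(parse.urljoin(response.url, href), headers=self.headers)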