1. requests
登錄知乎需要向服務器提交的信息有:
①headers
②_xsrf
③captcha
需要通過解析頁面得到_xsrf和captcha(驗證碼)。
而captcha的獲取則必須用session的方式得到, 目的是使_xsrf和驗證碼信息一致
(由於session中能夠保存cookie, 保證數據的一致性)。代碼如下:
"""Log in to zhihu.com with ``requests``, keeping cookies in a local file.

A single module-level ``requests`` session is shared by every function so
that the ``_xsrf`` token, the captcha image and the login POST all carry
the same cookies.
"""
import getpass
import os.path
import re
import time

import requests

try:
    import cookielib  # Python 2 module name
except ImportError:
    import http.cookiejar as cookielib  # Python 3 module name

from PIL import Image

session = requests.session()
# Cookies are written to the file "cookies" after login() so that a later run
# can load them instead of submitting the account and password again.
session.cookies = cookielib.LWPCookieJar(filename="cookies")
try:
    session.cookies.load(ignore_discard=True)
except (IOError, OSError):
    # Missing or unreadable cookie file: continue with an empty jar.
    print("cookies未能加載")

agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": agent,
}


def get_xsrf():
    """Fetch the home page and return the value of its ``_xsrf`` form field.

    Returns:
        The token string, or ``None`` (after printing "error") when the
        field is not present in the page.
    """
    response = session.get("https://www.zhihu.com/", headers=headers)
    # Non-greedy group: stop at the first closing quote instead of running on
    # to the last double quote on the same line.
    match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text)
    if match_obj:
        return match_obj.group(1)
    print("error")
    return None


def get_captcha():
    """Download the login captcha, show it if possible, and ask for its text.

    The image is requested through the shared session and written to
    ``captcha.jpg`` in the current directory.

    Returns:
        The text typed by the user.
    """
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        # Displaying the image is best-effort; tell the user where the file
        # is so it can be opened by hand.
        print('請到 {0} 找到captcha.jpg手動輸入'.format(os.path.abspath('captcha.jpg')))
    captcha = input("please input the captcha\n")
    return captcha


def is_login():
    """Return True when the profile settings page answers with HTTP 200.

    Redirects are not followed, so any non-200 status (a redirect included)
    is reported as "not logged in".
    """
    check_url = "https://www.zhihu.com/settings/profile"
    response = session.get(check_url, headers=headers, allow_redirects=False)
    return response.status_code == 200


def login(account, password):
    """Log in with an e-mail address or a phone number and save the cookies.

    The first POST is sent without a captcha; if the JSON reply has
    ``r == 1`` a captcha is fetched and the POST is repeated with it.

    Args:
        account: An e-mail address (contains ``@``) or a phone number
            (starts with ``1`` followed by ten digits).
        password: The account password.

    Raises:
        ValueError: If ``account`` is neither an e-mail address nor a
            phone number.
    """
    _xsrf = get_xsrf()
    if '@' in account:
        print("郵箱登錄")
        post_url = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": _xsrf,
            "password": password,
            "email": account,
        }
    elif re.match(r'^1\d{10}', account):
        print("手機登錄")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": _xsrf,
            "password": password,
            "phone_num": account,
        }
    else:
        # Previously this fell through and failed later with an unbound
        # post_url; fail early with a clear message instead.
        raise ValueError("account must be an e-mail address or a phone number")

    # First attempt: no captcha.
    response = session.post(post_url, data=post_data, headers=headers)
    login_code = response.json()

    if login_code['r'] == 1:
        print("不輸入驗證碼登錄失敗")
        # Retry once with a captcha obtained through the same session.
        post_data["captcha"] = get_captcha()
        response = session.post(post_url, data=post_data, headers=headers)
        login_code = response.json()
        print(login_code['msg'])

    session.cookies.save()


if __name__ == '__main__':
    if is_login():
        print("已經登錄!")
    else:
        # Ask for credentials interactively rather than hard-coding them.
        account = input("please input the account (email or phone number)\n")
        password = getpass.getpass("please input the password\n")
        login(account, password)
2. scrapy
如果在scrapy中直接調用上文中的get_captcha()函數來獲得驗證碼, 然後提交, 是無法登錄成功的, 原因是數據不一致, 也就是說獲取的_xsrf和驗證碼一起提交到服務器是不匹配的。
scrapy機制是默認保存cookie的, 所以可以通過兩個request請求來將得到的信息保存在默認的cookie中, 代碼如下:
# -*- coding: utf-8 -*-
import re
import json
import datetime
import time

try:
    import urlparse as parse  # Python 2 module name
except ImportError:
    from urllib import parse  # Python 3 module name

import scrapy


class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to zhihu.com before requesting ``start_urls``.

    The login is a chain of requests: the sign-in page (for the ``_xsrf``
    token), the captcha image, then the login form POST. Each step is
    issued as a Scrapy request whose callback performs the next step.
    """

    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def start_requests(self):
        """Start the crawl at the sign-in page; ``login`` handles the reply."""
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]

    def login(self, response):
        """Extract ``_xsrf`` from the sign-in page and request the captcha.

        The partially filled form data travels in ``meta["post_data"]`` to
        ``login_after_captcha``. Nothing is yielded when no token is found.
        """
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        xsrf = match_obj.group(1) if match_obj else ''

        if not xsrf:
            self.logger.warning("_xsrf token not found; login aborted")
            return

        post_data = {
            "_xsrf": xsrf,
            # Placeholders: fill in the real phone number and password.
            "phone_num": "",
            "password": "",
            "captcha": ""
        }

        t = str(int(time.time() * 1000))
        captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
        yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data}, callback=self.login_after_captcha)

    def login_after_captcha(self, response):
        """Save and show the captcha, read the answer, and POST the form.

        Returns:
            A one-element list holding the login ``FormRequest``; its reply
            is handled by ``check_login``.
        """
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)

        try:
            # Imported here so a missing or broken Pillow only disables the
            # preview instead of stopping the login.
            from PIL import Image
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except Exception:
            self.logger.warning("could not display captcha.jpg; open the file manually")

        captcha = input("輸入驗證碼\n>")

        post_data = response.meta.get("post_data", {})
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data["captcha"] = captcha
        return [scrapy.FormRequest(
            url=post_url,
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]

    def check_login(self, response):
        """Request every start URL once the login reply reports success."""
        try:
            text_json = json.loads(response.text)
        except ValueError:
            self.logger.error("login response is not valid JSON")
            return
        # NOTE(review): the expected success message is compared verbatim;
        # confirm it matches the exact text the server sends.
        if "msg" in text_json and text_json["msg"] == "登陸成功":
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)