以前介紹過用requests的session會話模擬登錄;必須用session,因為涉及驗證碼和xsrf寫入cookie的驗證問題;在scrapy中不需擔憂此問題,由於Request會保證這是一個會話,而且自動傳遞cookies
原理相同;由於驗證碼識別的問題,這裏先使用cookie模擬登錄
# -*- coding: utf-8 -*-

import scrapy
import json
import re


class ZhihuSpider(scrapy.Spider):
    """Zhihu spider that logs in by replaying cookies copied from an
    already-logged-in browser, plus the `_xsrf` token scraped from the
    sign-in page.

    Flow: start_requests -> login (extract _xsrf, POST credentials)
    -> check_login (verify the JSON response) -> parse.
    """

    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = ['http://www.zhihu.com/']

    # Browser-like headers; Zhihu rejects requests with the default UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
    }

    # Cookies copied from an already-logged-in browser session
    # (values intentionally blank here; fill in before running).
    cookies = {
        "d_c0": "",
        "l_cap_id": "",
        "r_cap_id": "",
        "cap_id": "",
        "_zap": "",
        "__utmc": "",
        "__utmb": "",
        "__utmv": "",
        "__utma": "",
        "__utmz": "5",
        "q_c1": "",
    }

    def start_requests(self):
        """First request, called automatically by Scrapy.

        Must carry the browser cookies. Returns a plain list (an iterable
        is required, and we only need to fetch the sign-in page once);
        the response — which contains the _xsrf token — is handed to
        ``login``.
        """
        return [scrapy.Request(
            url="https://www.zhihu.com/#signin",
            cookies=self.cookies,
            headers=self.headers,
            callback=self.login,
        )]

    def login(self, response):
        """Extract the `_xsrf` token and POST the login form.

        FormRequest stays within the same session (like a requests
        ``session``), so the cookies set so far are carried along.
        Calls back into ``check_login``.
        """
        match_obj = re.match(r'.*name="_xsrf" value="(.*?)"',
                             response.text, re.DOTALL)
        # Bug fix: the original referenced `xsrf` unconditionally, raising
        # NameError whenever the regex failed to match. Bail out instead.
        if not match_obj:
            return []
        xsrf = match_obj.group(1)

        url = "https://www.zhihu.com/login/phone_num"
        data = {
            "_xsrf": xsrf,
            'remember_me': 'true',
            "password": "",
            "phone_num": "",
        }

        # Also store the freshly obtained token in our cookie jar.
        self.cookies["_xsrf"] = xsrf
        return [scrapy.FormRequest(
            url=url,
            headers=self.headers,
            formdata=data,
            callback=self.check_login,
        )]

    def check_login(self, response):
        """Check the login JSON; on success, re-crawl start_urls via parse.

        On success the default callback (``parse``) handles the pages.
        """
        # Bug fix: json.load() expects a file-like object; response.text
        # is a str, so json.loads() is required here.
        text_json = json.loads(response.text)
        # "\u767b\u5f55\u6210\u529f" is the escaped form of 登录成功
        # ("login successful").
        if "msg" in text_json and text_json["msg"] == "\u767b\u5f55\u6210\u529f":
            for urls in self.start_urls:
                # dont_filter: start_urls was already visited once by the
                # scheduler's dedup filter, so force the re-request.
                yield scrapy.Request(url=urls, dont_filter=True, headers=self.headers)

    def parse(self, response):
        """Default callback after a successful login; not implemented yet."""
        pass