scrapy startproject zhihu_login
scrapy genspider zhihu www.zhihu.com
http://www.zhihu.com/#signin
爲了方便，重寫 start_requests
# -*- coding: utf-8 -*-
import scrapy


class ZhihuSpider(scrapy.Spider):
    """First cut of the spider: just request the Zhihu sign-in page."""

    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]

    def start_requests(self):
        # The return value must be an iterable of Requests.
        return [scrapy.Request('http://www.zhihu.com/#signin')]

    def parse(self, response):
        # Parenthesized print is valid on both Python 2 and Python 3
        # for a single argument; the original `print response` is py2-only.
        print(response)
測試能不能正確返回, 返回結果是html
[scrapy] DEBUG: Retrying <GET http://www.zhihu.com/robots.txt> (failed 1 times): 500 Internal Server Error
在settings中加入USER_AGENT再進行測試, 返回200, 說明是知乎驗證瀏覽器的問題, 到此能夠成功請求到頁面
DEBUG: Crawled (200) <GET http://www.zhihu.com/robots.txt> (referer: None)
_xsrf (在html中能夠找到) email password remember_me
# -*- coding: utf-8 -*-
import scrapy


class ZhihuSpider(scrapy.Spider):
    """Second cut: route the sign-in page to a `login` callback and pull
    the CSRF token (`_xsrf`) out of the login form."""

    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]

    def start_requests(self):
        # The return value must be an iterable of Requests.
        return [scrapy.Request('http://www.zhihu.com/#signin',
                               callback=self.login)]

    def login(self, response):
        print('-------')  # quick marker to confirm the callback fires
        # NOTE(review): positional XPath is brittle — it depends on the
        # form's exact input order; selecting by @name would be sturdier.
        _xsrf = response.xpath(
            ".//*[@id='sign-form-1']/input[2]/@value").extract()[0]
        print(_xsrf)
使用FormRequest登陸
def login(self, response):
    """Extract the CSRF token from the sign-in page, then POST the
    credentials to Zhihu's real login endpoint."""
    print('-------')  # quick marker to confirm the callback fires
    # NOTE(review): positional XPath is brittle; selecting the input
    # by @name='_xsrf' would survive form reordering.
    _xsrf = response.xpath(
        ".//*[@id='sign-form-1']/input[2]/@value").extract()[0]
    print(_xsrf)
    return [scrapy.FormRequest(
        url='http://www.zhihu.com/login/email',  # the real POST target
        formdata={
            '_xsrf': _xsrf,
            'email': 'xxxxxxxx',     # your account email
            'password': 'xxxxxxxx',  # your password
            'remember_me': 'true',
        },
        headers=self.headers,
        callback=self.check_login,
    )]
def check_login(self, response):
    """Inspect the JSON login response; on success (r == 0), fetch the
    home page again."""
    result = json.loads(response.body)
    if result['r'] == 0:
        # This is a repeat request to the same URL, so the duplicate
        # filter must be disabled (dont_filter defaults to False and
        # would otherwise drop the request).
        home_request = scrapy.Request(
            'http://www.zhihu.com',
            headers=self.headers,
            callback=self.page_content,
            dont_filter=True,
        )
        yield home_request
# -*- coding: utf-8 -*-
"""Scrapy spider that logs into zhihu.com through the email login
endpoint and saves the first post-login page to disk."""
import json

import scrapy


class ZhihuSpider(scrapy.Spider):
    """Log in to Zhihu with email + password + CSRF token."""

    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    # Zhihu returns 500 for requests without a browser-like User-Agent,
    # so every request carries these headers.
    headers = {
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
    }

    def start_requests(self):
        # The return value must be an iterable of Requests.
        return [scrapy.Request('http://www.zhihu.com/#signin',
                               callback=self.login)]

    def login(self, response):
        """Extract the CSRF token and POST the login form."""
        print('-------')  # quick marker to confirm the callback fires
        # NOTE(review): positional XPath is brittle; selecting the input
        # by @name='_xsrf' would survive form reordering.
        _xsrf = response.xpath(
            ".//*[@id='sign-form-1']/input[2]/@value").extract()[0]
        print(_xsrf)
        return [scrapy.FormRequest(
            url='http://www.zhihu.com/login/email',  # the real POST target
            formdata={
                '_xsrf': _xsrf,
                'email': 'xxxxxxxx',     # your account email
                'password': 'xxxxxxxx',  # your password
                'remember_me': 'true',
            },
            headers=self.headers,
            callback=self.check_login,
        )]

    def check_login(self, response):
        """On successful login (r == 0 in the JSON reply), request the
        home page; dont_filter bypasses the duplicate-request filter."""
        if json.loads(response.body)['r'] == 0:
            yield scrapy.Request(
                'http://www.zhihu.com',
                headers=self.headers,
                callback=self.page_content,
                dont_filter=True,
            )

    def page_content(self, response):
        """Dump the fetched page body to a local file."""
        with open('first_page.html', 'wb') as f:
            f.write(response.body)
        print('done')
注: 也是剛學scrapy, 暫時不知道怎麼處理驗證碼的狀況, 還望大牛指教