scrapy genspider zhihu www.zhihu.com
# coding: utf-8
"""Launcher script: starts the ``zhihu`` spider from Python.

Running this file has the same effect as typing ``scrapy crawl zhihu`` on
the command line.
"""
import os
import sys

from scrapy.cmdline import execute  # runs a scrapy command line from Python

# Add the directory that contains this file to the import path
# (presumably so the Scrapy project package resolves when the script is
# started from another working directory -- confirm against project layout).
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Hand the command over to Scrapy.
execute(["scrapy", "crawl", "zhihu"])
# Scrapy setting: when False, Scrapy does not consult the site's robots.txt
# before issuing requests.
ROBOTSTXT_OBEY = False
# Capture the value attribute that follows name="_xsrf" in the response body.
# re.DOTALL lets the leading ".*" span newlines, so the field may sit on any
# line of the page; group(1) of a successful match is the token.
match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
# -*- coding: utf-8 -*-
import json
import re

import scrapy


class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to www.zhihu.com before crawling ``start_urls``.

    Flow: request the sign-up page, extract the ``_xsrf`` token from its
    HTML, POST the credentials together with that token, and only schedule
    the ``start_urls`` once the server's JSON reply reports success.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    # Headers attached to every request this spider builds itself.
    # NOTE(review): the "…" inside User-Agent looks like an elided part of
    # the original string -- confirm and restore the full value.
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0"
    }

    def parse(self, response):
        """Default callback for the post-login requests; not implemented yet."""
        pass

    def start_requests(self):
        """Begin with the sign-up page instead of ``start_urls``.

        Returns:
            A one-element list with the request whose response is handed
            to :meth:`login`.
        """
        return [scrapy.Request('https://www.zhihu.com/signup?next=%2F', callback=self.login, headers=self.headers)]

    def login(self, response):
        """Extract the ``_xsrf`` token and submit the login form.

        Args:
            response: response for the sign-up page.

        Returns:
            A one-element list with the login ``FormRequest``, or an empty
            list when no non-empty token can be found in the page.
        """
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        # The token has to be taken from the match; leaving it empty means
        # the login request below would never be sent.
        xsrf = match_obj.group(1) if match_obj else ''
        if not xsrf:
            self.logger.warning("_xsrf token not found in %s; login skipped", response.url)
            return []

        post_url = "https://www.zhihu.com/signup?next=%2F"
        # NOTE(review): credentials are hard-coded in source; move them to
        # configuration or spider arguments and rotate this password.
        post_data = {
            "_xsrf": xsrf,
            "phone_num": "15603367590",
            "password": "0019wan,.WEI3618"
        }

        return [scrapy.FormRequest(
            url=post_url,
            formdata=post_data,
            headers=self.headers,
            # Pass the bound method itself; adding () would call it here.
            callback=self.check_login
        )]

    def check_login(self, response):
        """Inspect the login reply and, on success, start the real crawl.

        Args:
            response: response to the login POST; expected to carry JSON.

        Yields:
            One unfiltered request per entry in ``start_urls`` when the
            reply's ``msg`` field equals the success message.
        """
        try:
            text_json = json.loads(response.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            self.logger.error("Login reply from %s is not valid JSON", response.url)
            return

        # NOTE(review): the literal below must match the server's message
        # exactly -- confirm the expected wording/character set.
        if isinstance(text_json, dict) and text_json.get("msg") == "登錄成功":
            for url in self.start_urls:
                # Build the request directly: make_requests_from_url takes
                # only the URL, so it cannot carry the custom headers.
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)