The code is as follows:

```python
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request


class ZhihuuserSpider(scrapy.Spider):
    name = 'z'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Seed account to start crawling from.
    start_url = 'excited-vczh'
    # Follower-list endpoint; {include} selects which fields the API returns.
    allow_url = ('https://www.zhihu.com/api/v4/members/{user}/followers'
                 '?include={include}&offset={off}&limit={lim}')
    # `include` parameter for the follower list.
    exit_url = ('data[*].answer_count,articles_count,gender,follower_count,'
                'is_followed,is_following,badge[?(type=best_answerer)].topics')
    # `include` parameter for a single user profile.
    qwe_url = ('allow_message,is_followed,is_following,is_org,is_blocking,'
               'employments,answer_count,follower_count,articles_count,gender,'
               'badge[?(type=best_answerer)].topics')
    # Single-profile endpoint.
    one_url = 'https://www.zhihu.com/api/v4/members/{user}?include={two_url}'

    def start_requests(self):
        # Fetch the seed user's profile first.
        yield Request(self.one_url.format(user=self.start_url, two_url=self.qwe_url),
                      self.parse)

    def parse(self, response):
        # Parse a profile, then request that user's follower list
        # (off=0 starts at the first page of followers).
        result = json.loads(response.text)
        yield Request(self.allow_url.format(user=result.get('url_token'),
                                            include=self.exit_url, off=0, lim=20),
                      self.parse_xx)

    def parse_xx(self, response):
        # Parse one page of followers: schedule each follower's profile,
        # then follow the pagination link until the API says we are done.
        result = json.loads(response.text)
        for cc in result.get('data', []):
            yield Request(self.one_url.format(user=cc.get('url_token'),
                                              two_url=self.qwe_url),
                          self.parse)
        paging = result.get('paging')
        if paging and not paging.get('is_end'):
            yield Request(paging.get('next'), self.parse_xx)

    # You could also add a method to crawl the followees list (the users
    # someone is following); it would basically be a copy of parse_xx.
```

Personally, I find crawling Zhihu fairly easy, though there are parts I don't fully understand. This spider follows Cui Qingcai's tutorial, and it seems to repeat a lot of requests and waste a lot of time; I'd appreciate pointers from anyone more experienced on how to improve it. I also only half understand pipelines, mainly because I don't understand decorators yet, so tackling pipelines is my next goal.
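
On the duplicate-request worry: Scrapy's scheduler already drops requests with identical fingerprints by default (the built-in `RFPDupeFilter`), so profile URLs that get yielded repeatedly are filtered rather than re-fetched, unless a request is created with `dont_filter=True`. If you want to avoid even constructing those requests, you can track scheduled `url_token`s yourself. A minimal sketch of a reworked `parse_xx` (the `seen_tokens` set is my own addition, not from the tutorial; the rest of the class stays as above):

```python
class ZhihuuserSpider(scrapy.Spider):
    # ... same attributes, start_requests and parse as above ...

    seen_tokens = set()  # url_tokens whose profiles we have already scheduled

    def parse_xx(self, response):
        result = json.loads(response.text)
        for cc in result.get('data', []):
            token = cc.get('url_token')
            if token in self.seen_tokens:
                continue  # already requested this user's profile, skip it
            self.seen_tokens.add(token)
            yield Request(self.one_url.format(user=token, two_url=self.qwe_url),
                          self.parse)
        paging = result.get('paging')
        if paging and not paging.get('is_end'):
            yield Request(paging.get('next'), self.parse_xx)
```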
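
On pipelines: the basic form needs no decorators at all. An item pipeline is just a class with a `process_item(self, item, spider)` method, and the decorator you have probably seen in examples is `@classmethod` on the optional `from_crawler` constructor, which only exists to read settings. Note also that the spider above only yields new requests, never items, so a pipeline would have nothing to process; you would first add something like `yield result` in `parse` (plain dicts count as items in Scrapy). A minimal sketch of a pipeline that drops duplicate users, assuming that change (my own example, not from the tutorial):

```python
# pipelines.py -- a minimal sketch, assuming parse() also does `yield result`
# so that each user-profile dict reaches the pipeline as an item.
from scrapy.exceptions import DropItem


class DedupUserPipeline(object):
    """Drop user items we have already seen, keyed on url_token."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        token = item.get('url_token')
        if token in self.seen:
            raise DropItem('duplicate user: {}'.format(token))
        self.seen.add(token)
        return item  # returned items continue on to the next pipeline
```

Enable it in settings.py with `ITEM_PIPELINES = {'zhihuuser.pipelines.DedupUserPipeline': 300}`, where `zhihuuser` is just a placeholder for your project's module name.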