The previous article mainly analyzed the crawling process; this one covers the code implementation. The complete code is at:
https://github.com/pythonsite/spiderpython
import scrapy
from scrapy import Field


class UserItem(scrapy.Item):
    id = Field()
    name = Field()
    account_status = Field()
    allow_message = Field()
    answer_count = Field()
    articles_count = Field()
    avatar_hue = Field()
    avatar_url = Field()
    avatar_url_template = Field()
    badge = Field()
    business = Field()
    employments = Field()
    columns_count = Field()
    commercial_question_count = Field()
    cover_url = Field()
    description = Field()
    educations = Field()
    favorite_count = Field()
    favorited_count = Field()
    follower_count = Field()
    following_columns_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    gender = Field()
    headline = Field()
    hosted_live_count = Field()
    is_active = Field()
    is_bind_sina = Field()
    is_blocked = Field()
    is_advertiser = Field()
    is_blocking = Field()
    is_followed = Field()
    is_following = Field()
    is_force_renamed = Field()
    is_privacy_protected = Field()
    locations = Field()
    is_org = Field()
    type = Field()
    url = Field()
    url_token = Field()
    user_type = Field()
    logs_count = Field()
    marked_answers_count = Field()
    marked_answers_text = Field()
    message_thread_token = Field()
    mutual_followees_count = Field()
    participated_live_count = Field()
    pins_count = Field()
    question_count = Field()
    show_sina_weibo = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()
These fields come from the user detail information, as shown in the figure below. There are 58 fields in total, and it is worth studying what each one means.
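If you want to inspect these fields yourself before writing the spider, a minimal sketch like the one below fetches one user's detail JSON and prints its keys. It reuses the member API endpoint and starting account ("excited-vczh") from the spider code further down, with a shortened include parameter for illustration; whether a plain request with only a browser-like User-Agent still gets a response depends on Zhihu's anti-crawling measures at the time, so treat this as an exploratory sketch rather than a guaranteed call.

import json

import requests

# Same member-detail endpoint as the spider below; the include list here is only a
# subset of the full one used in the spider, just to keep the sketch short.
user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
include = "locations,employments,gender,educations,business,voteup_count,follower_count,answer_count,articles_count"

headers = {
    # A browser-like User-Agent; Zhihu may require more headers or cookies than this sketch assumes.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}

resp = requests.get(user_url.format(user="excited-vczh", include=include), headers=headers)
data = json.loads(resp.text)

# Print every returned field name, to compare against the UserItem definition above.
for key in sorted(data.keys()):
    print(key)
print("total fields:", len(data))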
The following code is the most important part: the main processing logic all lives here.
import json

import scrapy
from scrapy import Request

from zhihuuser.items import UserItem  # adjust the module path to match your project layout


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['http://www.zhihu.com/']

    # start_user stores the "big V" account we start the crawl from
    start_user = "excited-vczh"

    # user_url is the URL for querying a user's detail information;
    # the query parameters are stored separately in user_query
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    user_query = "locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics"

    # follows_url is the URL of the followee list, follows_query stores its query parameters.
    # offset and limit control paging: offset=0, limit=20 means the first page
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    # followers_url is the URL of the follower list, followers_query stores its query parameters
    followers_url = "https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    def start_requests(self):
        '''
        start_requests is overridden to issue three requests for the start user:
        the detail query, the followee-list query and the follower-list query
        :return:
        '''
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.parse_user)
        yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, offset=0, limit=20),
                      callback=self.parse_follows)
        yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, offset=0, limit=20),
                      callback=self.parse_followers)

    def parse_user(self, response):
        '''
        The response body is JSON, so json.loads parses it directly
        :param response:
        :return:
        '''
        result = json.loads(response.text)
        item = UserItem()
        # Copy every field that appears both in the item definition and in the response
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        # Yield the item, and also yield new requests so that this user's own
        # followee and follower lists are fetched recursively
        yield item
        yield Request(self.follows_url.format(user=result.get("url_token"), include=self.follows_query, offset=0, limit=20),
                      callback=self.parse_follows)
        yield Request(self.followers_url.format(user=result.get("url_token"), include=self.followers_query, offset=0, limit=20),
                      callback=self.parse_followers)

    def parse_follows(self, response):
        '''
        Parses the followee list, which is also JSON.
        The response has two top-level keys, data and paging; paging holds the pagination info
        :param response:
        :return:
        '''
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get("url_token"), include=self.user_query),
                              callback=self.parse_user)
        # Check that paging exists and that its is_end flag is False;
        # False means this is not the last page, otherwise it is the last page
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get("next")
            # Take the next-page URL and yield another request so this method
            # is called again for the next page
            yield Request(next_page, callback=self.parse_follows)

    def parse_followers(self, response):
        '''
        Handled exactly like the followee list: parses the follower list, which is also JSON.
        The response has two top-level keys, data and paging; paging holds the pagination info
        :param response:
        :return:
        '''
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get("url_token"), include=self.user_query),
                              callback=self.parse_user)
        # Check that paging exists and that its is_end flag is False;
        # False means this is not the last page, otherwise it is the last page
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get("next")
            # Take the next-page URL and yield another request so this method
            # is called again for the next page
            yield Request(next_page, callback=self.parse_followers)
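Before starting the spider, the project settings usually need a couple of adjustments. Below is a minimal sketch of the relevant settings.py entries; the values are placeholders, and depending on Zhihu's anti-crawling measures you may also need an authorization header or cookies, which this sketch does not cover.

# settings.py (fragment) -- minimal sketch, values are placeholders

# The Zhihu API endpoints are disallowed by robots.txt, so the spider
# cannot run with robots.txt obeyed.
ROBOTSTXT_OBEY = False

# Send a browser-like User-Agent with every request.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}

# A small delay between requests reduces the chance of being blocked.
DOWNLOAD_DELAY = 0.5

With these in place the spider is started as usual with scrapy crawl zhihu.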
The main logic of the code above is illustrated in the figure below:
A brief description of the figure:
1. start_requests is overridden to yield three requests, with callbacks parse_user, parse_follows and parse_followers; this first round fetches the chosen big V's detail information, followee list and follower list.
2. parse_user in turn yields requests calling back parse_follows and parse_followers, so each crawled user's followee list and follower list are fetched recursively.
3. parse_follows calls back parse_user for every user in the followee list, and calls back itself (parse_follows) to page through the list.
4. parse_followers calls back parse_user for every user in the follower list, and calls back itself (parse_followers) to page through the list (the paging structure both methods rely on is sketched right after this list).
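To make the pagination logic concrete, a followee/follower API response looks roughly like the structure below. This is an illustrative sketch with made-up values, not an exact dump of Zhihu's response: data holds the users on the current page, and paging carries the is_end flag and the next URL that the spider follows until is_end becomes true.

# Approximate shape of one followees/followers page (illustrative only; the real
# response contains many more per-user fields)
response_json = {
    "data": [
        {"url_token": "some-user", "answer_count": 123, "follower_count": 4567},
        {"url_token": "another-user", "answer_count": 8, "follower_count": 90},
    ],
    "paging": {
        # False means there are more pages; the spider keeps following "next"
        # until is_end becomes True
        "is_end": False,
        "next": "https://www.zhihu.com/api/v4/members/excited-vczh/followees?offset=20&limit=20",
    },
}

# This is the same check parse_follows / parse_followers performs:
if "paging" in response_json and response_json["paging"].get("is_end") == False:
    print("next page:", response_json["paging"]["next"])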
The steps above crawl the information of every reachable user. The last piece is data storage.
Here the item data is stored in a MongoDB database; the key detail is that a de-duplication check is performed on every insert.
import pymongo


class MongoPipeline(object):

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # De-duplication via MongoDB: update() with upsert=True (the third argument)
        # looks up the document by url_token before writing; if it already exists it is
        # updated, otherwise a new document is inserted, so the same user is never stored twice.
        # (update() is the legacy pymongo API; on pymongo 4+ the equivalent is
        # update_one({'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True).)
        self.db['user'].update({'url_token': item["url_token"]}, {'$set': item}, True)
        return item
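For the pipeline to run, it has to be enabled in settings.py, and the two settings it reads in from_crawler (MONGO_URI and MONGO_DATABASE) have to be provided. A minimal sketch follows; the module path zhihuuser.pipelines is a placeholder for wherever MongoPipeline lives in your project.

# settings.py (fragment) -- enable the MongoDB pipeline; the module path is a placeholder
ITEM_PIPELINES = {
    'zhihuuser.pipelines.MongoPipeline': 300,
}

# Read by MongoPipeline.from_crawler above
MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'

Because process_item upserts on url_token, re-running the spider updates existing user documents instead of inserting duplicates.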