Python Crawler Series (21): Crawling All Zhihu User Information with Scrapy (Part 2)

The previous article analyzed the crawling process; this one covers the implementation. The full code is at:
https://github.com/pythonsite/spider

The code in items.py mainly defines the fields we want to scrape:

import scrapy
from scrapy import Field


class UserItem(scrapy.Item):
    id = Field()
    name = Field()
    account_status = Field()
    allow_message = Field()
    answer_count = Field()
    articles_count = Field()
    avatar_hue = Field()
    avatar_url = Field()
    avatar_url_template = Field()
    badge = Field()
    business = Field()
    employments = Field()
    columns_count = Field()
    commercial_question_count = Field()
    cover_url = Field()
    description = Field()
    educations = Field()
    favorite_count = Field()
    favorited_count = Field()
    follower_count = Field()
    following_columns_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    gender = Field()
    headline = Field()
    hosted_live_count = Field()
    is_active = Field()
    is_bind_sina = Field()
    is_blocked = Field()
    is_advertiser = Field()
    is_blocking = Field()
    is_followed = Field()
    is_following = Field()
    is_force_renamed = Field()
    is_privacy_protected = Field()
    locations = Field()
    is_org = Field()
    type = Field()
    url = Field()
    url_token = Field()
    user_type = Field()
    logs_count = Field()
    marked_answers_count = Field()
    marked_answers_text = Field()
    message_thread_token = Field()
    mutual_followees_count = Field()
    participated_live_count = Field()
    pins_count = Field()
    question_count = Field()
    show_sina_weibo = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()

These fields are taken from the user detail response, as shown in the figure below; there are 58 fields in total, and each one is worth studying to understand what it represents:
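
If you want to inspect these fields yourself, the user detail API can be queried directly. Below is a minimal sketch built from the user_url template and a shortened version of the user_query that the spider further down uses; it assumes the endpoint answers a plain request with only a browser-like User-Agent, which may no longer hold, since Zhihu may require login cookies or extra headers.

import json
import requests

# Same URL template the spider uses; the include parameter is shortened here for readability.
user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
user_query = "locations,employments,gender,educations,business,voteup_count,follower_count,answer_count"

# A browser-like User-Agent; Zhihu may additionally require login cookies or an
# authorization header, so treat this purely as an illustrative sketch.
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(user_url.format(user="excited-vczh", include=user_query), headers=headers)
result = json.loads(resp.text)
print(sorted(result.keys()))   # the field names available in the user detail JSON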

The main code of the spider file zhihu.py under spiders

This code is the most important part; essentially all of the processing logic lives here.

import json

import scrapy
from scrapy import Request

from zhihu_user.items import UserItem  # replace zhihu_user with your project's package name


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['http://www.zhihu.com/']
    # start_user stores the seed account: a well-followed ("big V") user to start crawling from
    start_user = "excited-vczh"

    # The query parameters are kept separately in user_query; user_url is the url for querying a user's detail information
    user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
    user_query = "locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics"

    # follows_url is the url of the followees list and follows_query holds its query parameters. offset and limit control paging; 0, 20 means the first page
    follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"
    follows_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"

    # followers_url is the url of the followers list and followers_query holds its query parameters.
    followers_url = "https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}"
    followers_query = "data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"


    def start_requests(self):
        '''
        start_requests is overridden here; it requests the user detail url, the followees list and the followers list respectively
        :return:
        '''
        yield Request(self.user_url.format(user=self.start_user,include=self.user_query),callback=self.parse_user)
        yield Request(self.follows_url.format(user=self.start_user,include=self.follows_query,offset=0,limit=20),callback=self.parse_follows)
        yield Request(self.followers_url.format(user=self.start_user,include=self.followers_query,offset=0,limit=20),callback=self.parse_followers)

    def parse_user(self, response):
        '''
        The response is JSON, so json.loads is used to parse it directly
        :param response:
        :return:
        '''
        result = json.loads(response.text)
        item = UserItem()
        # Loop over the item's declared fields and assign each one that is present in the returned result
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)

        # Yield the item, and also yield Requests to recurse into this user's followees and followers lists
        yield item
        yield Request(self.follows_url.format(user = result.get("url_token"),include=self.follows_query,offset=0,limit=20),callback=self.parse_follows)
        yield Request(self.followers_url.format(user = result.get("url_token"),include=self.followers_query,offset=0,limit=20),callback=self.parse_followers)




    def parse_follows(self, response):
        '''
        Parse the followees list. The response is JSON with two fields, data and paging, where paging holds the pagination info
        :param response:
        :return:
        '''
        results = json.loads(response.text)

        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user = result.get("url_token"),include=self.user_query),callback=self.parse_user)

        # Check whether paging exists and whether its is_end field is False; False means this is not the last page, otherwise it is
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get("next")
            # Take the next page's url and yield another Request so this method is called again for the following page
            yield Request(next_page, self.parse_follows)

    def parse_followers(self, response):
        '''
        This is handled in the same way as the followees list.
        Parse the followers list. The response is JSON with two fields, data and paging, where paging holds the pagination info
        :param response:
        :return:
        '''
        results = json.loads(response.text)

        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user = result.get("url_token"),include=self.user_query),callback=self.parse_user)

        # Check whether paging exists and whether its is_end field is False; False means this is not the last page, otherwise it is
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get("next")
            # Take the next page's url and yield another Request so this method is called again for the following page
            yield Request(next_page, self.parse_followers)
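
To make the URL templates above more concrete, this is how the first round of requests looks once .format() fills in the placeholders (the include values are shortened here for readability):

# How start_requests expands the templates for the seed user (include values shortened):
user_url = "https://www.zhihu.com/api/v4/members/{user}?include={include}"
follows_url = "https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}"

print(user_url.format(user="excited-vczh", include="locations,employments,gender"))
# https://www.zhihu.com/api/v4/members/excited-vczh?include=locations,employments,gender

print(follows_url.format(user="excited-vczh", include="data[*].answer_count", offset=0, limit=20))
# https://www.zhihu.com/api/v4/members/excited-vczh/followees?include=data[*].answer_count&offset=0&limit=20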

The main logic of the code above is illustrated in the figure below:

 

A brief description of the figure above:
1. The overridden start_requests yields three requests whose callbacks are parse_user, parse_follows and parse_followers; this first round fetches the seed user's detail info, followees list and followers list.
2. parse_user in turn yields requests that call back into parse_follows and parse_followers, recursively fetching each user's followees list and followers list.
3. parse_follows requests the detail info of every user in the followees list (callback parse_user) and follows the pagination by calling back into itself (see the sketch of the paging JSON after this list).
4. parse_followers requests the detail info of every user in the followers list (callback parse_user) and follows the pagination by calling back into itself.
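
As mentioned in step 3, pagination relies on the paging object returned alongside data in the followees/followers responses. The shape the spider expects looks roughly like this (the concrete values are made up for illustration):

# Illustrative shape of a followees/followers response (values are made up):
example_response = {
    "data": [
        {"url_token": "some-user", "answer_count": 12, "follower_count": 3400},
        # ... up to `limit` entries per page
    ],
    "paging": {
        "is_end": False,   # True on the last page
        "next": "https://www.zhihu.com/api/v4/members/excited-vczh/followees?offset=20&limit=20",
    },
}

# parse_follows/parse_followers keep requesting paging["next"] until is_end is True.
if "paging" in example_response and example_response["paging"]["is_end"] is False:
    next_page = example_response["paging"]["next"]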

These steps crawl the information of all users; the last piece is storing the data.

Storing the data in MongoDB

This part stores the item data in a MongoDB database; the key point is a de-duplication check performed on insert.

import pymongo


class MongoPipeline(object):

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # De-duplicate via MongoDB: upsert on url_token, so a user that already exists is updated instead of inserted again
        self.db['user'].update_one({'url_token': item["url_token"]}, {'$set': dict(item)}, upsert=True)
        return item
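
To wire this pipeline into the project it has to be enabled in settings.py, together with the two settings that from_crawler reads. A minimal sketch follows; the package name zhihu_user, the database name and the priority 300 are assumptions, adjust them to your project.

# settings.py (sketch; replace zhihu_user with your project's package name)
ITEM_PIPELINES = {
    'zhihu_user.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'localhost'        # read by MongoPipeline.from_crawler
MONGO_DATABASE = 'zhihu'       # falls back to 'items' if not set

After that, running scrapy crawl zhihu starts the crawl, and the de-duplicated user documents end up in the user collection of that database.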