Scrapy (Part 1): Crawling All Zhihu User Information

1. Installing Scrapy (Windows)

  1. Download the Twisted wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/ and install it manually.
  2. pip install pywin32
  3. pip install scrapy

  Run the following commands in CMD to verify that the installation works:

  1. scrapy startproject myjob
  2. cd myjob
  3. scrapy genspider baidu www.baidu.com
  4. scrapy crawl baidu

  If no errors are reported, the installation succeeded.

2. Starting a Project

  scrapy startproject zhihuapi

  Check the generated project directory and open it in PyCharm.

  settings.py

# Defaults to True; obeying robots.txt can block the crawl, so set it to False
ROBOTSTXT_OBEY = False
# Enable the default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

  items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field

class UsersItem(Item):
    answer_count = Field()
    articles_count = Field()
    avatar_url = Field()
    avatar_url_template = Field()
    badge = Field()
    follower_count = Field()
    gender = Field()
    headline = Field()
    id = Field()
    is_advertiser = Field()
    is_followed = Field()
    is_following = Field()
    is_org = Field()
    name = Field()
    type = Field()
    url = Field()
    url_token = Field()
    user_type = Field()
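
  For reference, a UsersItem behaves like a dict, and item.fields exposes every declared Field name, which is exactly what the spider's user_parse loop iterates over below. A tiny sketch with made-up sample values:

# assuming the UsersItem class defined above is importable
item = UsersItem(name='example user', url_token='example_token')
print(dict(item))               # {'name': 'example user', 'url_token': 'example_token'}
print(sorted(item.fields)[:3])  # ['answer_count', 'articles_count', 'avatar_url']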

 

  zhihu.py

# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem
f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')

class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    start_user = 'kaifulee'
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # Seed user profile page and followee list page
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile fields
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Pull each url_token from the followee list and feed it into user_parse for recursion
        if 'data' in results.keys():
            # As a sanity check, save the user names to a txt file
            for result in results.get('data'):
                print(result['name'])
                f.write(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Zhihu seems to guard against crawlers: the 'next' URL returned here is not directly accessible, so use a regex to pull out the url_token and offset and rebuild the URL for the next page
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)
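
As a quick sanity check of the pagination regex, here is what it extracts from a 'next' URL shaped like the follow_url template above (the concrete URL below is constructed only for illustration):

import re

# Example 'next' URL built from the follow_url template used by the spider
next_page = ('https://www.zhihu.com/api/v4/members/kaifulee/followees'
             '?include=data[*].answer_count&offset=20&limit=20')
m = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
print(m.group(1), m.group(2))  # -> kaifulee 20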

Run the crawl directly from PyCharm's terminal (scrapy crawl zhihu).

Opening zhihu.txt shows that plenty of users were indeed captured, and a half-hour test run without any proxy never triggered Zhihu's captcha.

 

3. Crawling the Follower Pages as Well and Saving to MongoDB

  The code above crawls url_token values from the followees pages (people the user follows). We can also crawl url_token values from the followers pages (people who follow the user) at the same time: just add a followers URL alongside the followees URL (the include query stays the same) and add a follower_parse method.

# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem

f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')


class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    start_user = 'kaifulee'
    # User profile URL
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    # URL for the people this user follows (followees)
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    # URL for the people who follow this user (followers)
    follower_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'


    def start_requests(self):
        # Seed user profile page, followee page, and follower page
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follower_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile fields
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follower_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Pull each url_token from the followee list and feed it into user_parse for recursion
        if 'data' in results.keys():
            # As a sanity check, print the user names
            for result in results.get('data'):
                print(result['name'])
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Zhihu seems to guard against crawlers: the 'next' URL returned here is not directly accessible, so use a regex to pull out the url_token and offset and rebuild the URL for the next page
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)

    def follower_parse(self, response):
        results = json.loads(response.text)
        # Pull each url_token from the follower list and feed it into user_parse for recursion
        if 'data' in results.keys():
            # As a sanity check, save the user names to a txt file
            for result in results.get('data'):
                print(result['name'])
                f.write(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the follower list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Zhihu seems to guard against crawlers: the 'next' URL returned here is not directly accessible, so use a regex to pull out the url_token and offset and rebuild the URL for the next page
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follower_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follower_parse)

  Save the results to MongoDB. The MongoDB pipeline demo can be copied straight from the official Scrapy documentation; only the database write needs a small change so that records are deduplicated before they are stored.

  Official documentation: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

  

  pipelines.py

import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on url_token so the same user is never stored twice
        self.db['users'].update_one({'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
        return item

  In settings.py, enable the pipeline and add the MongoDB configuration:

ITEM_PIPELINES = {
   'zhihuapi.pipelines.MongoPipeline': 300,
}


MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'

  Why these two variable names for MongoDB? Because MongoPipeline reads exactly these two settings, so the names must match; if you want to change them, change them in pipelines.py as well.

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

  The results in MongoDB:
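
  A quick way to inspect the stored data from a Python shell, using the MONGO_URI and MONGO_DATABASE values configured above (count_documents requires pymongo 3.7+):

import pymongo

# Connect with the same settings the pipeline uses ('localhost' / 'zhihu')
client = pymongo.MongoClient('localhost')
db = client['zhihu']
print(db['users'].count_documents({}))                                  # number of users stored
print(db['users'].find_one({}, {'_id': 0, 'name': 1, 'url_token': 1}))  # one sample record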

 

  I ran the crawler from one location without a proxy for half an hour and collected more than 20,000 records without ever seeing a captcha; from another location, using a proxy, the captcha showed up after about 4,000 records. The captcha problem can be handled with a dynamic proxy pool maintained with Flask.
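
  The proxy pool itself is not part of this post. The sketch below only shows how such a pool could be plugged into Scrapy as a downloader middleware; the Flask service address (http://127.0.0.1:5000/get) and its plain-text response (a proxy such as http://1.2.3.4:8080) are assumptions for illustration.

# middlewares.py -- minimal sketch, assuming a Flask proxy-pool service is running locally
import requests


class RandomProxyMiddleware(object):
    # Hypothetical endpoint of the Flask proxy pool; adjust to your own service
    proxy_pool_url = 'http://127.0.0.1:5000/get'

    def fetch_proxy(self):
        # Ask the pool for one proxy address; return None on any failure
        try:
            resp = requests.get(self.proxy_pool_url, timeout=5)
            if resp.status_code == 200:
                return resp.text.strip()
        except requests.RequestException:
            return None

    def process_request(self, request, spider):
        proxy = self.fetch_proxy()
        if proxy:
            # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
            request.meta['proxy'] = proxy

  It would then be enabled in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'zhihuapi.middlewares.RandomProxyMiddleware': 543} (the module path assumes the middleware lives in this project's middlewares.py).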
