Scrapy (Part 1): Crawling All Zhihu User Information

1. Installing Scrapy (Windows)

  1. Download the Twisted wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/ and install it manually.
  2. pip install pywin32
  3. pip install scrapy

  Run the following commands in CMD to verify that the installation works:

  1. scrapy startproject myjob
  2. cd myjob
  3. scrapy genspider baidu www.baidu.com
  4. scrapy crawl baidu

  If no errors are reported, the installation succeeded.

2. Starting a Project

  scrapy startproject zhihuapi

  Check the generated project directory and open it in PyCharm.

  settings.py

# Defaults to True; obeying robots.txt can block the crawl, so set it to False
ROBOTSTXT_OBEY = False
# Enable the default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

  items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field

class UsersItem(Item):
    answer_count = Field()
    articles_count = Field()
    avatar_url = Field()
    avatar_url_template = Field()
    badge = Field()
    follower_count = Field()
    gender = Field()
    headline = Field()
    id = Field()
    is_advertiser = Field()
    is_followed = Field()
    is_following = Field()
    is_org = Field()
    name = Field()
    type = Field()
    url = Field()
    url_token = Field()
    user_type = Field()
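
  For reference, a UsersItem behaves like a dict, and item.fields exposes every declared Field name, which is exactly what the spider's user_parse loop iterates over below. A tiny sketch with made-up sample values:

# assuming the UsersItem class defined above is importable
item = UsersItem(name='example user', url_token='example_token')
print(dict(item))               # {'name': 'example user', 'url_token': 'example_token'}
print(sorted(item.fields)[:3])  # ['answer_count', 'articles_count', 'avatar_url']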

 

  zhihu.py

# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem
f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')

class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    start_user = 'kaifulee'
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # Seed user profile page and followee list page
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile fields
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Pull each url_token from the followee list and feed it into user_parse for recursion
        if 'data' in results.keys():
            # As a sanity check, save the user names to a txt file
            for result in results.get('data'):
                print(result['name'])
                f.write(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Zhihu seems to guard against crawlers: the 'next' URL returned here is not directly accessible, so use a regex to pull out the url_token and offset and rebuild the URL for the next page
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)
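
As a quick sanity check of the pagination regex, here is what it extracts from a 'next' URL shaped like the follow_url template above (the concrete URL below is constructed only for illustration):

import re

# Example 'next' URL built from the follow_url template used by the spider
next_page = ('https://www.zhihu.com/api/v4/members/kaifulee/followees'
             '?include=data[*].answer_count&offset=20&limit=20')
m = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
print(m.group(1), m.group(2))  # -> kaifulee 20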

Run the crawl directly from PyCharm's terminal (scrapy crawl zhihu).

Opening zhihu.txt shows that plenty of users were indeed captured, and a half-hour test run without any proxy never triggered Zhihu's captcha.

 

3. Crawling the Follower Pages as Well and Saving to MongoDB

  The code above crawls url_token values from the followees pages (people the user follows). We can also crawl url_token values from the followers pages (people who follow the user) at the same time: just add a followers URL alongside the followees URL (the include query stays the same) and add a follower_parse method.

# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem

f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')


class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    start_user = 'kaifulee'
    # User profile URL
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    # URL for the people this user follows (followees)
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    # URL for the people who follow this user (followers)
    follower_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'


    def start_requests(self):
        # Seed user profile page, followee page, and follower page
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follower_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile fields
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follower_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Pull each url_token from the followee list and feed it into user_parse for recursion
        if 'data' in results.keys():
            # As a sanity check, print the user names
            for result in results.get('data'):
                print(result['name'])
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Zhihu seems to guard against crawlers: the 'next' URL returned here is not directly accessible, so use a regex to pull out the url_token and offset and rebuild the URL for the next page
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)

    def follower_parse(self, response):
        results = json.loads(response.text)
        # Pull each url_token from the follower list and feed it into user_parse for recursion
        if 'data' in results.keys():
            # As a sanity check, save the user names to a txt file
            for result in results.get('data'):
                print(result['name'])
                f.write(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the follower list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Zhihu seems to guard against crawlers: the 'next' URL returned here is not directly accessible, so use a regex to pull out the url_token and offset and rebuild the URL for the next page
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follower_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follower_parse)

  Save the results to MongoDB. The MongoDB pipeline demo can be copied straight from the official Scrapy documentation; only the database write needs a small change so that records are deduplicated before they are stored.

  Official documentation: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

  

  pipelines.py

import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on url_token so the same user is never stored twice
        self.db['users'].update_one({'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
        return item

  In settings.py, enable the pipeline and add the MongoDB configuration:

ITEM_PIPELINES = {
   'zhihuapi.pipelines.MongoPipeline': 300,
}


MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'

  Why these two variable names for MongoDB? Because MongoPipeline reads exactly these two settings, so the names must match; if you want to change them, change them in pipelines.py as well.

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

  The results in MongoDB:
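
  A quick way to inspect the stored data from a Python shell, using the MONGO_URI and MONGO_DATABASE values configured above (count_documents requires pymongo 3.7+):

import pymongo

# Connect with the same settings the pipeline uses ('localhost' / 'zhihu')
client = pymongo.MongoClient('localhost')
db = client['zhihu']
print(db['users'].count_documents({}))                                  # number of users stored
print(db['users'].find_one({}, {'_id': 0, 'name': 1, 'url_token': 1}))  # one sample record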

 

  I ran the crawler from one location without a proxy for half an hour and collected more than 20,000 records without ever seeing a captcha; from another location, using a proxy, the captcha showed up after about 4,000 records. The captcha problem can be handled with a dynamic proxy pool maintained with Flask.
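
  The proxy pool itself is not part of this post. The sketch below only shows how such a pool could be plugged into Scrapy as a downloader middleware; the Flask service address (http://127.0.0.1:5000/get) and its plain-text response (a proxy such as http://1.2.3.4:8080) are assumptions for illustration.

# middlewares.py -- minimal sketch, assuming a Flask proxy-pool service is running locally
import requests


class RandomProxyMiddleware(object):
    # Hypothetical endpoint of the Flask proxy pool; adjust to your own service
    proxy_pool_url = 'http://127.0.0.1:5000/get'

    def fetch_proxy(self):
        # Ask the pool for one proxy address; return None on any failure
        try:
            resp = requests.get(self.proxy_pool_url, timeout=5)
            if resp.status_code == 200:
                return resp.text.strip()
        except requests.RequestException:
            return None

    def process_request(self, request, spider):
        proxy = self.fetch_proxy()
        if proxy:
            # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
            request.meta['proxy'] = proxy

  It would then be enabled in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'zhihuapi.middlewares.RandomProxyMiddleware': 543} (the module path assumes the middleware lives in this project's middlewares.py).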
