2016-04-10
How to install Python and the Scrapy framework is not covered here; please search for it online.
Once Scrapy is installed, run scrapy startproject myspider
You will then see a myspider folder with the following directory structure:
scrapy.cfg
myspider
    items.py
    pipelines.py
    settings.py
    __init__.py
    spiders
        __init__.py
Create users.py in the spiders directory:
# -*- coding: utf-8 -*-
import scrapy
import os
import time
from myspider.items import UserItem
from myspider.myconfig import UsersConfig  # crawler configuration


class UsersSpider(scrapy.Spider):
    name = 'users'
    domain = 'https://www.zhihu.com'
    login_url = 'https://www.zhihu.com/login/email'
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36"
    }

    def __init__(self, url = None):
        self.user_url = url

    def start_requests(self):
        yield scrapy.Request(
            url = self.domain,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': 1
            },
            callback = self.request_captcha
        )

    def request_captcha(self, response):
        # grab the _xsrf token from the login page
        _xsrf = response.css('input[name="_xsrf"]::attr(value)').extract()[0]
        # build the captcha URL
        captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(time.time() * 1000)
        # request the captcha image
        yield scrapy.Request(
            url = captcha_url,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                '_xsrf': _xsrf
            },
            callback = self.download_captcha
        )

    def download_captcha(self, response):
        # save the captcha image to disk
        with open('captcha.gif', 'wb') as fp:
            fp.write(response.body)
        # open the captcha image with the default viewer
        os.system('start captcha.gif')
        # read the captcha from the terminal
        print 'Please enter captcha: '
        captcha = raw_input()
        # submit the login form
        yield scrapy.FormRequest(
            url = self.login_url,
            headers = self.headers,
            formdata = {
                'email': UsersConfig['email'],
                'password': UsersConfig['password'],
                '_xsrf': response.meta['_xsrf'],
                'remember_me': 'true',
                'captcha': captcha
            },
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar']
            },
            callback = self.request_zhihu
        )

    def request_zhihu(self, response):
        # start from the first user: profile, followees and followers
        yield scrapy.Request(
            url = self.user_url + '/about',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_item,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followees',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followers',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )

    def user_start(self, response):
        sel_root = response.xpath('//h2[@class="zm-list-content-title"]')
        # skip empty follow lists
        if len(sel_root):
            for sel in sel_root:
                people_url = sel.xpath('a/@href').extract()[0]
                yield scrapy.Request(
                    url = people_url + '/about',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_item,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followees',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followers',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )

    def user_item(self, response):
        def value(list):
            return list[0] if len(list) else ''

        sel = response.xpath('//div[@class="zm-profile-header ProfileCard"]')
        item = UserItem()
        item['url'] = response.url[:-6]
        item['name'] = sel.xpath('//a[@class="name"]/text()').extract()[0].encode('utf-8')
        item['bio'] = value(sel.xpath('//span[@class="bio"]/@title').extract()).encode('utf-8')
        item['location'] = value(sel.xpath('//span[contains(@class, "location")]/@title').extract()).encode('utf-8')
        item['business'] = value(sel.xpath('//span[contains(@class, "business")]/@title').extract()).encode('utf-8')
        item['gender'] = 0 if sel.xpath('//i[contains(@class, "icon-profile-female")]') else 1
        item['avatar'] = value(sel.xpath('//img[@class="Avatar Avatar--l"]/@src').extract())
        item['education'] = value(sel.xpath('//span[contains(@class, "education")]/@title').extract()).encode('utf-8')
        item['major'] = value(sel.xpath('//span[contains(@class, "education-extra")]/@title').extract()).encode('utf-8')
        item['employment'] = value(sel.xpath('//span[contains(@class, "employment")]/@title').extract()).encode('utf-8')
        item['position'] = value(sel.xpath('//span[contains(@class, "position")]/@title').extract()).encode('utf-8')
        item['content'] = value(sel.xpath('//span[@class="content"]/text()').extract()).strip().encode('utf-8')
        item['ask'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[2]/span[@class="num"]/text()').extract()[0])
        item['answer'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[3]/span[@class="num"]/text()').extract()[0])
        item['agree'] = int(sel.xpath('//span[@class="zm-profile-header-user-agree"]/strong/text()').extract()[0])
        item['thanks'] = int(sel.xpath('//span[@class="zm-profile-header-user-thanks"]/strong/text()').extract()[0])
        yield item
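One note on the url argument: Scrapy forwards -a command-line arguments to the spider's __init__, and the crawl command shown further below relies on this. If the argument is omitted, user_url stays None and the requests in request_zhihu fail with a TypeError. A possible, more defensive variant of __init__ inside UsersSpider (an adjustment, not part of the original code):

# Hypothetical tweak (not in the original code): fail fast if the spider is
# started without -a url=..., instead of crashing later in request_zhihu.
def __init__(self, url = None, *args, **kwargs):
    super(UsersSpider, self).__init__(*args, **kwargs)
    if url is None:
        raise ValueError('usage: scrapy crawl users -a url=https://www.zhihu.com/people/<user>')
    self.user_url = url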
Create myconfig.py in the myspider directory and add the following, filling in your own configuration:
# -*- coding: utf-8 -*-

UsersConfig = {
    # proxy
    'proxy': '',
    # Zhihu account email and password
    'email': 'your email',
    'password': 'your password',
}

DbConfig = {
    # db config
    'user': 'db user',
    'passwd': 'db password',
    'db': 'db name',
    'host': 'db host',
}
Edit items.py and define a field for each piece of user data to collect:

# -*- coding: utf-8 -*-
import scrapy


class UserItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    name = scrapy.Field()
    bio = scrapy.Field()
    location = scrapy.Field()
    business = scrapy.Field()
    gender = scrapy.Field()
    avatar = scrapy.Field()
    education = scrapy.Field()
    major = scrapy.Field()
    employment = scrapy.Field()
    position = scrapy.Field()
    content = scrapy.Field()
    ask = scrapy.Field()
    answer = scrapy.Field()
    agree = scrapy.Field()
    thanks = scrapy.Field()
Edit pipelines.py:
# -*- coding: utf-8 -*-
import MySQLdb
import datetime
from myspider.myconfig import DbConfig


class UserPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'],
                                    db = DbConfig['db'], host = DbConfig['host'],
                                    charset = 'utf8', use_unicode = True)
        self.cursor = self.conn.cursor()
        # empty the table first if needed
        # self.cursor.execute('truncate table weather;')
        # self.conn.commit()

    def process_item(self, item, spider):
        curTime = datetime.datetime.now()
        try:
            self.cursor.execute(
                """INSERT IGNORE INTO users
                   (url, name, bio, location, business, gender, avatar, education, major,
                    employment, position, content, ask, answer, agree, thanks, create_at)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (
                    item['url'],
                    item['name'],
                    item['bio'],
                    item['location'],
                    item['business'],
                    item['gender'],
                    item['avatar'],
                    item['education'],
                    item['major'],
                    item['employment'],
                    item['position'],
                    item['content'],
                    item['ask'],
                    item['answer'],
                    item['agree'],
                    item['thanks'],
                    curTime
                )
            )
            self.conn.commit()
        except MySQLdb.Error, e:
            print 'Error %d %s' % (e.args[0], e.args[1])
        return item
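The pipeline assumes a users table already exists in the configured database; the article does not show its schema. Below is a minimal, hypothetical helper script that creates a compatible table. The column names follow the INSERT statement above, while the column types and the unique key on url are assumptions:

# -*- coding: utf-8 -*-
# Hypothetical one-off script (not part of the original project): creates the
# `users` table the pipeline writes to. Column types are assumptions.
import MySQLdb
from myspider.myconfig import DbConfig

CREATE_USERS_TABLE = """
CREATE TABLE IF NOT EXISTS users (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    url VARCHAR(255) NOT NULL UNIQUE,
    name VARCHAR(255),
    bio VARCHAR(255),
    location VARCHAR(255),
    business VARCHAR(255),
    gender TINYINT,
    avatar VARCHAR(255),
    education VARCHAR(255),
    major VARCHAR(255),
    employment VARCHAR(255),
    position VARCHAR(255),
    content TEXT,
    ask INT,
    answer INT,
    agree INT,
    thanks INT,
    create_at DATETIME
) DEFAULT CHARSET = utf8
"""

def create_table():
    conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'],
                           db = DbConfig['db'], host = DbConfig['host'],
                           charset = 'utf8', use_unicode = True)
    conn.cursor().execute(CREATE_USERS_TABLE)
    conn.commit()
    conn.close()

if __name__ == '__main__':
    create_table()

The unique key on url is what lets the pipeline's INSERT IGNORE silently skip users that have already been stored.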
Open settings.py, find ITEM_PIPELINES, and change it to:
ITEM_PIPELINES = {
    'myspider.pipelines.UserPipeline': 300,
}
At the end of the file, add the following to limit the crawl depth:
DEPTH_LIMIT=10
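Optionally, a couple of other standard Scrapy settings can be added here as well; these are suggestions, not part of the original write-up:

# Optional additions to settings.py (suggestions, not from the original article)
DOWNLOAD_DELAY = 1       # pause between requests to go easy on the site
COOKIES_ENABLED = True   # the login flow depends on cookies (True is already the default)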
Make sure MySQL is running, open a terminal in the project root, and run scrapy crawl users -a url=https://www.zhihu.com/people/<user>,
where <user> is the first user to crawl; from there the spider follows that user's followees and followers to collect more data.
Next, the captcha image is downloaded. If it does not open automatically, open captcha.gif in the project root yourself, then type the captcha into the terminal.
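Note that os.system('start captcha.gif') only works on Windows, which is why the image may not open by itself on macOS or Linux. A small cross-platform replacement for that line (a sketch, not part of the original code):

import os
import subprocess
import sys

def open_captcha(path = 'captcha.gif'):
    # best effort: if none of these work, just open the file manually
    if sys.platform.startswith('win'):
        os.system('start ' + path)
    elif sys.platform == 'darwin':
        subprocess.call(['open', path])       # macOS
    else:
        subprocess.call(['xdg-open', path])   # most Linux desktops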
The data crawl then begins.
The source code can be found on GitHub.