scrapy 知乎的模擬登錄及抓取用戶數據

最近看了python的scrapy 框架並用其抓取了部分知乎用戶數據,代碼主要是集中在知乎登錄和抓取時候的邏輯處理上。

一、 首先進入知乎登錄頁面zhihu.com/#signin上, 用xpath提取_xsrf參數, 獲取驗證碼的部分url,完整的url是由當前的時間戳和type參數構成,利用獲得的url構造request取得response, 在函數handle_captcha對驗證碼提取並提示在終端輸入驗證碼,最後再將登錄的url、cookie、用戶帳號、密碼什麼的form進去就能夠登錄成功了。下面是代碼:

# _*_coding:utf-8_*_

from scrapy.spider import CrawlSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import Selector
from zhihu2 import config
from PIL import Image
import time
import json
import re
from zhihu2 import items


class ZhiHu_spider(CrawlSpider):
    """Spider that logs into zhihu.com and crawls followee profile data."""

    name = 'zhihu2'
    # Fix: scrapy reads ``allowed_domains`` (plural) and expects bare domain
    # names, not full URLs. The original ``allowed_domain = ['https://...']``
    # was silently ignored by the offsite middleware.
    allowed_domains = ['zhihu.com']

    def __init__(self, *args, **kwargs):
        super(ZhiHu_spider, self).__init__(*args, **kwargs)
        # _xsrf CSRF token scraped from the signin page; set in post_login().
        self.xsrf = ''
        # Shared request headers defined in the project's config module.
        self.headers = config.headers

    def start_requests(self):
        # Entry point: fetch the signin page with a fresh cookie jar and
        # hand the response to post_login for token/captcha handling.
        signin_request = Request(
            'http://www.zhihu.com/#signin',
            meta={'cookiejar': 1},
            callback=self.post_login,
        )
        yield signin_request

    def post_login(self, response):
        """Parse the signin page: grab the _xsrf token, then fetch a captcha.

        The captcha URL needs a millisecond timestamp. The original code
        stringified a raw float, which Python 2 renders in scientific
        notation (e.g. '1.47850912812e+12') and breaks the URL, so the
        value is cast to int before formatting.
        """
        print('parper login in ')
        sel = Selector(response)
        self.xsrf = sel.xpath('//input[@name="_xsrf"]/@value').extract()[0]

        # Captcha OCR was too unreliable, so the image is shown to the user
        # and typed in manually (see handle_captcha).
        str_time = str(int(time.time() * 1000))
        cap_url = 'https://www.zhihu.com/captcha.gif?r=' + str_time + '&type=login'
        print(cap_url)
        yield Request(
            cap_url,
            meta={
                'cookiejar': response.meta['cookiejar'],
                '_xsrf': self.xsrf,
            },
            headers=self.headers,
            callback=self.handle_captcha,
        )

    def handle_captcha(self, response):
        """Save the captcha image, show it, and submit the login form.

        NOTE(review): the captcha path is hard-coded to a Windows location;
        confirm it exists before running.
        """
        # ``with`` already closes the file; the original had a redundant
        # explicit close() after the block.
        with open('E:\\myscrapy\\captcha.gif', 'wb') as gif:
            gif.write(response.body)

        im = Image.open('E:\\myscrapy\\captcha.gif')
        im.show()
        captcha = raw_input('enter your captcha:')

        yield FormRequest(
            # Phone-number login endpoint; swap for the email endpoint if needed.
            'http://www.zhihu.com/login/phone_num',
            method='POST',
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                '_xsrf': self.xsrf,
                'password': '密碼',
                'remember_me': 'true',
                'phone_num': '帳號',
                'captcha': captcha,
            },
            callback=self.after_login,
        )

二、下面是登錄以後獲取關注人的信息,因爲知乎第一次只會顯示20個關注人,剩下的要post數據到www.zhihu.com/node/ProfileFolloweesListV2

才能又獲取20個,因此在這要獲取每一個人的關注人數並與20作對比。web

 # 獲取我的主頁
    def after_login(self, response):
        """Confirm login succeeded, then fetch the crawler's own profile page."""
        print(response.body)
        print('login success')
        # NOTE(review): the profile URL below is a placeholder — fill in your
        # own profile address. The original line had an unterminated string
        # literal (the closing quote was missing), which was a syntax error.
        yield Request(
            'https://www.zhihu.com/people/你的id須要填寫',
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_people,
        )
  #獲取關注人url
    def parse_people(self, response):
        """Pull the followees-list link off a profile page and follow it."""
        selector = Selector(response)
        href = selector.xpath('//a[@class="item"]/@href').extract_first()
        if not href:
            return
        yield Request(
            'https://www.zhihu.com' + href,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.person_info,
        )

    # Parse one followee's profile page: emit an item holding the profile
    # fields, then schedule requests for that person's own followees.
    def person_info(self, response):
        item = items.Zhihu2Item()
        sel = Selector(response)

        def first(xp):
            # Small helper: run an xpath and return its first hit (or None).
            return sel.xpath(xp).extract_first()

        nikname = first('//div[@class="title-section"]/a[@class="name"]/text()')
        location = first('//span[@class="location item"]/@title')
        business = first('//span[@class="business item"]/@title')
        education = first('//span[@class="education item"]/@title')
        education_extra = first('//span[@class="education-extra item"]/@title')
        # Fix: the original called .split() on extract_first() unconditionally,
        # which raises AttributeError when the gender element is absent.
        sex_class = first('//span[@class="item gender"]/i/@class')
        sex = sex_class.split('-')[-1] if sex_class else None
        agree = first('//span[@class="zm-profile-header-user-agree"]/strong/text()')
        thanks = first('//span[@class="zm-profile-header-user-thanks"]/strong/text()')

        # Fix: try_none() *returns* the defaulted value; the original called
        # it and discarded the result, so the defaults were silently lost.
        item['nikname'] = config.try_none(nikname)
        item['business'] = config.try_none(business)
        item['education_extra'] = config.try_none(education_extra)
        item['location'] = config.try_none(location)
        item['education'] = config.try_none(education)
        item['sex'] = config.try_none(sex)
        item['agree'] = config.try_none(agree)
        item['thanks'] = config.try_none(thanks)
        # Fix: the original filled the item but never yielded it, so no data
        # ever reached the item pipelines.
        yield item

        # Number of people this user follows (profile sidebar counter).
        peo_num = sel.xpath('/html/body/div[3]/div[2]/div[1]/a[1]/strong/text()').extract_first()
        try:
            peo_num = int(peo_num)
        except (TypeError, ValueError):
            # Counter missing or not numeric — nothing more to paginate.
            return

        # The first 20 followees are rendered inline; crawl their profiles too.
        for people_url in sel.xpath('//a[@class="zg-link author-link"]/@href').extract():
            yield Request(
                people_url,
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                callback=self.person_info,
            )

        peo_params = sel.xpath('//div[@class="zh-general-list clearfix"]/@data-init').extract_first()
        if not peo_params:
            return
        try:
            values = json.loads(str(peo_params))
        except ValueError as e:
            # Fix: the original printed the error but then read the undefined
            # ``values`` anyway, raising NameError; bail out instead.
            print(e)
            return

        # Fix: the original compared an int against the *string* peo_num and
        # duplicated the FormRequest with inconsistent offsets. Page through
        # the remaining followees 20 at a time instead.
        for offset in range(20, peo_num, 20):
            params = {
                'offset': offset,
                'order_by': 'created',
                'hash_id': values['params']['hash_id'],
            }
            yield FormRequest(
                'https://www.zhihu.com/node/ProfileFolloweesListV2',
                method='POST',
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                formdata={
                    'method': 'next',
                    'params': json.dumps(params),
                    '_xsrf': self.xsrf,
                },
                callback=self.foolows_V2,
            )
View Code

三、從上面url的response獲取關注人的url,獲得的url 交由parse_people函數處理,parse_people函數的response交由person_info函數處理,因此就造成了一個循環,不斷的有url被提取,也不斷的有數據被提取出來,下面是foolows_V2函數的代碼:

    def foolows_V2(self, response):
        """Parse the JSON followees-list response and crawl each profile.

        ``msg`` in the response body is a list of HTML snippets, one per
        followee; the profile slug is pulled out of each snippet's link.
        """
        p = re.compile(r'href="https://www\.zhihu\.com/people/(.*?)"')

        for snippet in json.loads(response.body)['msg']:
            m = p.search(snippet)
            # Fix: the original called .group(1) unconditionally and crashed
            # with AttributeError on snippets without a profile link.
            if m is None:
                continue
            followes_url = 'https://www.zhihu.com/people/' + str(m.group(1))
            yield Request(
                followes_url,
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                callback=self.parse_people,
            )
View Code

下面是一些配置信息:

config.py

#_*_coding:utf-8_*_

from settings import USER_AGENT

# Default HTTP headers attached to every request the spider sends.
# NOTE(review): USER_AGENT is imported from the project's scrapy settings
# module; the remaining fields mimic a regular browser session on zhihu.com.
headers = {

    'Host': 'www.zhihu.com',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Origin': 'https://www.zhihu.com',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': USER_AGENT,
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.zhihu.com/',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}


def try_none(tag):
    """Return *tag*, or the string 'none' when *tag* is None.

    The original body only evaluated ``tag`` inside a try/except. A bound
    parameter can never raise on evaluation, so the function was a no-op
    identity and missing profile fields were never defaulted.
    """
    return 'none' if tag is None else tag
View Code

items.py:

from scrapy import Item, Field


class Zhihu2Item(Item):
    """Container for one zhihu user's profile fields, filled by person_info()."""

    nikname = Field()          # display name ('nikname' [sic] — the spider keys on this spelling)
    location = Field()         # profile location
    business = Field()         # industry / business field
    education = Field()        # school
    education_extra = Field()  # major / extra education info
    sex = Field()              # gender, parsed from the profile icon class
    thanks = Field()           # "thanks" counter
    agree = Field()            # upvote counter
View Code

代碼沒有維護已爬取的url和待爬取的url的去重,可能會致使重複抓取,代碼的優化也挺爛的。但願大神們多給點意見,若是代碼有錯誤,但願提出,以避免給別人誤導。

相關文章
相關標籤/搜索