2016-04-10
How to install Python and the Scrapy framework is not covered here; please search for it online.
Once Scrapy is installed, run scrapy startproject myspider
You will then see a myspider folder with the following directory structure:
scrapy.cfg
myspider
    items.py
    pipelines.py
    settings.py
    __init__.py
    spiders
        __init__.py
Create users.py in the spiders directory:
# -*- coding: utf-8 -*-
import scrapy
import os
import time
from myspider.items import UserItem
from myspider.myconfig import UsersConfig  # crawler configuration


class UsersSpider(scrapy.Spider):
    name = 'users'
    domain = 'https://www.zhihu.com'
    login_url = 'https://www.zhihu.com/login/email'
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36"
    }

    def __init__(self, url = None):
        self.user_url = url

    def start_requests(self):
        yield scrapy.Request(
            url = self.domain,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': 1
            },
            callback = self.request_captcha
        )

    def request_captcha(self, response):
        # grab the _xsrf token from the login page
        _xsrf = response.css('input[name="_xsrf"]::attr(value)').extract()[0]
        # build the captcha URL
        captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(time.time() * 1000)
        # request the captcha image
        yield scrapy.Request(
            url = captcha_url,
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                '_xsrf': _xsrf
            },
            callback = self.download_captcha
        )

    def download_captcha(self, response):
        # save the captcha image to disk
        with open('captcha.gif', 'wb') as fp:
            fp.write(response.body)
        # open the captcha image with the default viewer
        os.system('start captcha.gif')
        # read the captcha from the terminal
        print 'Please enter captcha: '
        captcha = raw_input()
        # submit the login form
        yield scrapy.FormRequest(
            url = self.login_url,
            headers = self.headers,
            formdata = {
                'email': UsersConfig['email'],
                'password': UsersConfig['password'],
                '_xsrf': response.meta['_xsrf'],
                'remember_me': 'true',
                'captcha': captcha
            },
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar']
            },
            callback = self.request_zhihu
        )

    def request_zhihu(self, response):
        # start from the first user: profile, followees and followers
        yield scrapy.Request(
            url = self.user_url + '/about',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_item,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followees',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )
        yield scrapy.Request(
            url = self.user_url + '/followers',
            headers = self.headers,
            meta = {
                'proxy': UsersConfig['proxy'],
                'cookiejar': response.meta['cookiejar'],
                'from': {
                    'sign': 'else',
                    'data': {}
                }
            },
            callback = self.user_start,
            dont_filter = True
        )

    def user_start(self, response):
        sel_root = response.xpath('//h2[@class="zm-list-content-title"]')
        # skip empty follow lists
        if len(sel_root):
            for sel in sel_root:
                people_url = sel.xpath('a/@href').extract()[0]
                yield scrapy.Request(
                    url = people_url + '/about',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_item,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followees',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )
                yield scrapy.Request(
                    url = people_url + '/followers',
                    headers = self.headers,
                    meta = {
                        'proxy': UsersConfig['proxy'],
                        'cookiejar': response.meta['cookiejar'],
                        'from': {
                            'sign': 'else',
                            'data': {}
                        }
                    },
                    callback = self.user_start,
                    dont_filter = True
                )

    def user_item(self, response):
        def value(list):
            return list[0] if len(list) else ''

        sel = response.xpath('//div[@class="zm-profile-header ProfileCard"]')
        item = UserItem()
        item['url'] = response.url[:-6]
        item['name'] = sel.xpath('//a[@class="name"]/text()').extract()[0].encode('utf-8')
        item['bio'] = value(sel.xpath('//span[@class="bio"]/@title').extract()).encode('utf-8')
        item['location'] = value(sel.xpath('//span[contains(@class, "location")]/@title').extract()).encode('utf-8')
        item['business'] = value(sel.xpath('//span[contains(@class, "business")]/@title').extract()).encode('utf-8')
        item['gender'] = 0 if sel.xpath('//i[contains(@class, "icon-profile-female")]') else 1
        item['avatar'] = value(sel.xpath('//img[@class="Avatar Avatar--l"]/@src').extract())
        item['education'] = value(sel.xpath('//span[contains(@class, "education")]/@title').extract()).encode('utf-8')
        item['major'] = value(sel.xpath('//span[contains(@class, "education-extra")]/@title').extract()).encode('utf-8')
        item['employment'] = value(sel.xpath('//span[contains(@class, "employment")]/@title').extract()).encode('utf-8')
        item['position'] = value(sel.xpath('//span[contains(@class, "position")]/@title').extract()).encode('utf-8')
        item['content'] = value(sel.xpath('//span[@class="content"]/text()').extract()).strip().encode('utf-8')
        item['ask'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[2]/span[@class="num"]/text()').extract()[0])
        item['answer'] = int(sel.xpath('//div[contains(@class, "profile-navbar")]/a[3]/span[@class="num"]/text()').extract()[0])
        item['agree'] = int(sel.xpath('//span[@class="zm-profile-header-user-agree"]/strong/text()').extract()[0])
        item['thanks'] = int(sel.xpath('//span[@class="zm-profile-header-user-thanks"]/strong/text()').extract()[0])
        yield item
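One note on the url argument: Scrapy forwards -a command-line arguments to the spider's __init__, and the crawl command shown further below relies on this. If the argument is omitted, user_url stays None and the requests in request_zhihu fail with a TypeError. A possible, more defensive variant of __init__ inside UsersSpider (an adjustment, not part of the original code):

# Hypothetical tweak (not in the original code): fail fast if the spider is
# started without -a url=..., instead of crashing later in request_zhihu.
def __init__(self, url = None, *args, **kwargs):
    super(UsersSpider, self).__init__(*args, **kwargs)
    if url is None:
        raise ValueError('usage: scrapy crawl users -a url=https://www.zhihu.com/people/<user>')
    self.user_url = url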
Create myconfig.py in the myspider directory and add the following, filling in your own configuration:
# -*- coding: utf-8 -*-

UsersConfig = {
    # proxy
    'proxy': '',
    # Zhihu account email and password
    'email': 'your email',
    'password': 'your password',
}

DbConfig = {
    # db config
    'user': 'db user',
    'passwd': 'db password',
    'db': 'db name',
    'host': 'db host',
}
Edit items.py and define a field for each piece of user data to collect:

# -*- coding: utf-8 -*-
import scrapy


class UserItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    name = scrapy.Field()
    bio = scrapy.Field()
    location = scrapy.Field()
    business = scrapy.Field()
    gender = scrapy.Field()
    avatar = scrapy.Field()
    education = scrapy.Field()
    major = scrapy.Field()
    employment = scrapy.Field()
    position = scrapy.Field()
    content = scrapy.Field()
    ask = scrapy.Field()
    answer = scrapy.Field()
    agree = scrapy.Field()
    thanks = scrapy.Field()
Edit pipelines.py:
# -*- coding: utf-8 -*-
import MySQLdb
import datetime
from myspider.myconfig import DbConfig


class UserPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'],
                                    db = DbConfig['db'], host = DbConfig['host'],
                                    charset = 'utf8', use_unicode = True)
        self.cursor = self.conn.cursor()
        # empty the table first if needed
        # self.cursor.execute('truncate table weather;')
        # self.conn.commit()

    def process_item(self, item, spider):
        curTime = datetime.datetime.now()
        try:
            self.cursor.execute(
                """INSERT IGNORE INTO users
                   (url, name, bio, location, business, gender, avatar, education, major,
                    employment, position, content, ask, answer, agree, thanks, create_at)
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (
                    item['url'],
                    item['name'],
                    item['bio'],
                    item['location'],
                    item['business'],
                    item['gender'],
                    item['avatar'],
                    item['education'],
                    item['major'],
                    item['employment'],
                    item['position'],
                    item['content'],
                    item['ask'],
                    item['answer'],
                    item['agree'],
                    item['thanks'],
                    curTime
                )
            )
            self.conn.commit()
        except MySQLdb.Error, e:
            print 'Error %d %s' % (e.args[0], e.args[1])
        return item
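The pipeline assumes a users table already exists in the configured database; the article does not show its schema. Below is a minimal, hypothetical helper script that creates a compatible table. The column names follow the INSERT statement above, while the column types and the unique key on url are assumptions:

# -*- coding: utf-8 -*-
# Hypothetical one-off script (not part of the original project): creates the
# `users` table the pipeline writes to. Column types are assumptions.
import MySQLdb
from myspider.myconfig import DbConfig

CREATE_USERS_TABLE = """
CREATE TABLE IF NOT EXISTS users (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    url VARCHAR(255) NOT NULL UNIQUE,
    name VARCHAR(255),
    bio VARCHAR(255),
    location VARCHAR(255),
    business VARCHAR(255),
    gender TINYINT,
    avatar VARCHAR(255),
    education VARCHAR(255),
    major VARCHAR(255),
    employment VARCHAR(255),
    position VARCHAR(255),
    content TEXT,
    ask INT,
    answer INT,
    agree INT,
    thanks INT,
    create_at DATETIME
) DEFAULT CHARSET = utf8
"""

def create_table():
    conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'],
                           db = DbConfig['db'], host = DbConfig['host'],
                           charset = 'utf8', use_unicode = True)
    conn.cursor().execute(CREATE_USERS_TABLE)
    conn.commit()
    conn.close()

if __name__ == '__main__':
    create_table()

The unique key on url is what lets the pipeline's INSERT IGNORE silently skip users that have already been stored.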
Open settings.py, find ITEM_PIPELINES, and change it to:
ITEM_PIPELINES = {
    'myspider.pipelines.UserPipeline': 300,
}
At the end of the file, add the following to limit the crawl depth:
DEPTH_LIMIT=10
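Optionally, a couple of other standard Scrapy settings can be added here as well; these are suggestions, not part of the original write-up:

# Optional additions to settings.py (suggestions, not from the original article)
DOWNLOAD_DELAY = 1       # pause between requests to go easy on the site
COOKIES_ENABLED = True   # the login flow depends on cookies (True is already the default)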
Make sure MySQL is running, open a terminal in the project root, and run scrapy crawl users -a url=https://www.zhihu.com/people/<user>,
where <user> is the first user to crawl; from there the spider follows that user's followees and followers to collect more data.
Next, the captcha image is downloaded. If it does not open automatically, open captcha.gif in the project root yourself, then type the captcha into the terminal.
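Note that os.system('start captcha.gif') only works on Windows, which is why the image may not open by itself on macOS or Linux. A small cross-platform replacement for that line (a sketch, not part of the original code):

import os
import subprocess
import sys

def open_captcha(path = 'captcha.gif'):
    # best effort: if none of these work, just open the file manually
    if sys.platform.startswith('win'):
        os.system('start ' + path)
    elif sys.platform == 'darwin':
        subprocess.call(['open', path])       # macOS
    else:
        subprocess.call(['xdg-open', path])   # most Linux desktops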
The data crawl then begins.
The source code can be found on GitHub.