淘女郎相冊爬蟲(Python編寫)

# *-* coding:utf-8 *-*

__author__ = 'YS'

import urllib2
import urllib
import re
import json
import os
import time

# Crawls model profiles, avatars and album photos from Taobao "Tao Girl" (mm.taobao.com).
# Listing page: https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.22495f9f1lYEAb
class MMSpider:
    """Download profile text, avatar and album photos for each listed model.

    The class targets Python 2 (it relies on ``urllib2`` / ``urllib.urlencode``).
    All network access goes through ``__getContents``, which reports failures
    on stdout and returns ``None`` instead of raising.

    Public attributes (all set from the constructor arguments):
        timeout        -- per-request timeout passed to ``urllib2.urlopen``.
        albumLimit     -- upper bound applied to the number of album-list
                          *pages* fetched per model.
                          NOTE(review): the original comment called this an
                          album-count limit, but the code has always capped
                          pages; behaviour is kept, confirm which is intended.
        picLimit       -- maximum number of pictures saved per album.
        sleepPicCount  -- pause one second after every this-many pictures;
                          a falsy value disables the pause.
        savePath       -- root directory for everything that is saved.
    """

    def __init__(self, timeout=3, albumLimit=200, picLimit=500, sleepPicCount=100, savePath='pythonspider/'):
        # The value must be the bare UA string; the original repeated the
        # header name inside the value ("User-Agent:Mozilla/...").
        self.__headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        }
        self.timeout = timeout
        self.albumLimit = albumLimit
        # Endpoint returning the paged model list (POSTed to, JSON reply).
        self.__mmListUrl = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
        # Endpoint returning one HTML page of a model's album list.
        # NOTE(review): the "%20" after user_id is kept exactly as in the
        # original string -- confirm the server really expects it.
        self.__albumListUrl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=:userId&page=:page'
        # Endpoint returning one JSON page of the photos in an album.
        self.__albumDetailUrl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=:userId&album_id=:albumId&page=:page'
        # Model detail page; only written into the saved info file.
        self.__personUrl = 'https://mm.taobao.com/self/aiShow.htm?userId=:userId'
        self.savePath = savePath
        self.picLimit = picLimit
        self.sleepPicCount = sleepPicCount

        # os.makedirs('') raises, so an empty root is simply not created.
        if self.savePath:
            self.__mkdir(self.savePath)

    def __toStr(self, value):
        """Return ``value`` as a native ``str`` that is safe to concatenate
        with the byte-string log messages used throughout this class."""
        if isinstance(value, str):
            return value
        if isinstance(value, bytes):
            return value.decode('utf-8', 'replace')
        if hasattr(value, 'encode'):
            # Python 2 ``unicode`` objects end up here.
            return value.encode('utf-8')
        return str(value)

    def __safeName(self, value):
        """Return ``value`` as text usable as a single path component.

        Names come from the remote server, so path separators and other
        characters that are illegal in file names are replaced to keep every
        write inside ``savePath``.
        """
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'replace')
        else:
            value = u'%s' % (value,)
        cleaned = re.sub(u'[\\\\/:*?"<>|\x00]', u'_', value).strip(u'. ')
        return cleaned or u'_'

    def __mmDir(self, mm, sub):
        """Return ``<savePath>/<model name>/<sub>``; used by every saver so
        that text, avatar and photos of one model share the same folder."""
        return os.path.join(self.savePath, self.__safeName(mm['realName']), sub)

    # Exception hierarchy reference:
    # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
    def __getContents(self, url, data=None, encoding=None, isjson=None):
        """Fetch ``url`` and return its body, or ``None`` on any failure.

        data     -- optional urlencoded body; when given the request is a POST.
        encoding -- if set, the body is decoded from it and re-encoded as UTF-8.
        isjson   -- if truthy, the body is parsed and the JSON object returned.
        """
        try:
            request = urllib2.Request(url, data, self.__headers)
            response = urllib2.urlopen(request, timeout=self.timeout)
            try:
                contents = response.read()
            finally:
                # Always release the connection, even if read() times out.
                response.close()
            if encoding:
                contents = contents.decode(encoding).encode('utf-8')
            return json.loads(contents) if isjson else contents
        except urllib2.URLError as e:
            # ``reason`` may be a socket error object rather than a string.
            print('出錯了' + self.__toStr(getattr(e, 'reason', e)))
            return None
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C still stops the crawl.
            print('其餘錯誤')
            print(e.args)
            return None

    def __getMMList(self, pageIndex):
        """Return the list of model dicts on page ``pageIndex``, or ``None``
        when the request fails or the reply does not report ``status == 1``."""
        data = urllib.urlencode({
            'currentPage': pageIndex,
            'pageSize': 50,
        })
        result = self.__getContents(self.__mmListUrl, data, encoding='gbk', isjson=True)
        if not isinstance(result, dict) or result.get('status') != 1:
            return None
        try:
            return result['data']['searchDOList']
        except (KeyError, TypeError):
            return None

    def __getAlbumList(self, mm):
        """Return the album ids found on the model's album-list pages.

        Returns an empty list when the page count cannot be determined.
        """
        albumList = []
        baseUrl = self.__albumListUrl.replace(':userId', str(mm['userId']))
        indexUrl = baseUrl.replace(':page', '1')
        try:
            pageCount = int(self.__getAlbumListPage(indexUrl))
        except (TypeError, ValueError):
            # The index page failed to load or carried no usable page count.
            print('獲取相冊列表頁數失敗,地址:' + indexUrl)
            return albumList
        pageCount = min(pageCount, self.albumLimit)

        # Compiled once; the pattern does not change between pages.
        pattern = re.compile('<h4><a href=".*?album_id=(.*?)&album_flag', re.S)
        for i in range(1, pageCount + 1):
            contents = self.__getContents(baseUrl.replace(':page', str(i)))
            if contents is None:
                continue
            albumList.extend(pattern.findall(contents))

        return albumList

    def __getPicList(self, album, mm):
        """Return the picture URLs of ``album`` across all of its pages, or
        ``None`` when the page count cannot be fetched."""
        baseUrl = self.__albumDetailUrl.replace(':userId', str(mm['userId'])).replace(':albumId', str(album))
        totalPage = self.__getPicPage(baseUrl.replace(':page', '1'))
        if totalPage is None:
            return None

        lists = []
        for page in range(1, totalPage + 1):
            # Request the current page; the original always re-requested page 1.
            url = baseUrl.replace(':page', str(page))
            # Same endpoint as __getPicPage, so decode it the same way.
            res = self.__getContents(url, encoding='gbk', isjson=True)

            if isinstance(res, dict) and res.get('isError') == '0':
                for pic in res.get('picList') or []:
                    # picUrl values are protocol-relative ("//host/...").
                    lists.append('http:' + pic['picUrl'])
            else:
                print("獲取結果失敗,地址:" + url)

        return lists

    def __getPicPage(self, indexUrl):
        """Return the album's total page count as an int, or ``None`` when the
        request fails, the reply flags an error, or the count is unusable."""
        albuminfo = self.__getContents(indexUrl, encoding='gbk', isjson=True)
        if not isinstance(albuminfo, dict):
            print('獲取相冊照片失敗0,照片地址:' + indexUrl)
            return None

        if albuminfo.get('isError') != '0':
            print('獲取相冊照片失敗1,照片地址:' + indexUrl)
            return None

        try:
            return int(albuminfo['totalPage'])
        except (KeyError, TypeError, ValueError):
            print('獲取相冊照片失敗1,照片地址:' + indexUrl)
            return None

    def __savePics(self, album, mm):
        """Download up to ``picLimit`` pictures of ``album`` (an album id)
        into the model's ``img`` folder as ``<album>_<n>.jpg``."""
        name = self.__toStr(mm['realName'])
        print("正在保存" + name + '的相冊,相冊id爲:' + self.__toStr(album))
        pics = self.__getPicList(album, mm)
        if not pics:
            return

        saveDir = self.__mmDir(mm, 'img')
        self.__mkdir(saveDir)
        # The album id is part of the file name; with a bare counter every
        # album of the same model overwrote the files of the previous one.
        prefix = self.__safeName(album)

        for index, pic in enumerate(pics, 1):
            # ">" so that exactly picLimit pictures are saved (">=" stopped
            # one short while reporting the full amount).
            if index > self.picLimit:
                print(name + ":已經保存" + str(self.picLimit) + "張辣")
                return
            print("正在保存" + name + '的相冊,相片地址爲:' + self.__toStr(pic))
            if self.sleepPicCount and index % self.sleepPicCount == 0:
                print("休息一秒")
                time.sleep(1)

            fileName = os.path.join(saveDir, prefix + u'_' + str(index) + u'.jpg')
            self.__saveImg(pic, fileName)

    def __getAlbumListPage(self, url):
        """Return the album-list page count as the raw string found in the
        page, or ``None`` when the page cannot be fetched or has no marker."""
        contents = self.__getContents(url)
        if not contents:
            return None
        match = re.search('id="J_Totalpage" value="(.*?)"', contents, re.S)
        return match.group(1) if match else None

    def __saveMM(self, mm):
        """Write the model's basic information to ``text/info.txt``."""
        print('正在保存' + self.__toStr(mm['realName']) + '的信息')
        saveDir = self.__mmDir(mm, 'text')
        self.__mkdir(saveDir)
        personUrl = self.__personUrl.replace(':userId', str(mm['userId']))
        fields = (
            mm['realName'],
            mm.get('city', ''),
            mm.get('weight', ''),
            mm.get('height', ''),
            mm.get('totalFavorNum', ''),
            personUrl,
        )
        contents = "姓名:%s\n城市:%s\n體重:%s\n身高:%s\n喜歡:%s\n我的主頁:%s\n" % tuple(
            self.__toStr(field) for field in fields)
        self.__saveTxtFile(contents, os.path.join(saveDir, 'info.txt'))

    def __saveMMAvatar(self, mm):
        """Download the model's avatar thumbnail to ``img/avatar.jpg``."""
        print('正在保存' + self.__toStr(mm['realName']) + '的頭像')
        saveDir = self.__mmDir(mm, 'img')
        self.__mkdir(saveDir)
        # The size suffix selects the small rendition of the avatar.
        imgUrl = 'http:' + mm['avatarUrl'] + '_240x240xz.jpg'
        self.__saveImg(imgUrl, os.path.join(saveDir, 'avatar.jpg'))

    def __saveTxtFile(self, contents, fileName):
        """Write ``contents`` to ``fileName``, replacing any existing file."""
        with open(fileName, 'w') as handler:
            handler.write(contents)

    def __saveImg(self, imgUrl, fileName):
        """Download ``imgUrl`` and store the raw bytes in ``fileName``; a
        failed download is reported and nothing is written."""
        contents = self.__getContents(imgUrl)
        if contents:
            with open(fileName, 'wb') as handler:
                handler.write(contents)
        else:
            print('獲取圖片失敗,圖片地址:' + self.__toStr(imgUrl))

    def __mkdir(self, saveDir):
        """Create ``saveDir`` (with parents) if needed.

        Returns True when the directory was created, False when it already
        existed. Raises OSError when it cannot be created.
        """
        if os.path.isdir(saveDir):
            return False
        try:
            os.makedirs(saveDir)
        except OSError:
            # Lost a race with another creator: fine as long as it is there.
            if not os.path.isdir(saveDir):
                raise
            return False
        return True

    def start(self, startPage, endPage):
        """Crawl model-list pages ``startPage``..``endPage`` inclusive.

        For every model: save the info text, the avatar, then every album.
        Crawling stops at the first page that yields no models.
        """
        for i in range(startPage, endPage + 1):
            mmlist = self.__getMMList(i)

            if not mmlist:
                print("第%s頁無數據\n" % (str(i)))
                break
            for mm in mmlist:
                self.__saveMM(mm)
                self.__saveMMAvatar(mm)
                for album in self.__getAlbumList(mm):
                    self.__savePics(album, mm)

if __name__ == '__main__':
    # Script entry point: crawl model-list pages 2 through 3 using the
    # spider's default settings.
    MMSpider().start(2, 3)

效果:

保存的圖片:

保存的文本內容:

 

源碼鏈接:https://github.com/yunshu2009/pythonspiders/blob/master/Taobaomm/Taobaomm.py

相關文章
相關標籤/搜索