# *-* coding:utf-8 *-* __author__ = 'YS' import urllib2 import urllib import re import json import os import time #抓取淘女郎的圖片,淘女郎地址:https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.22495f9f1lYEAb class MMSpider: def __init__(self, timeout=3, albumLimit=200, picLimit=500, sleepPicCount=100, savePath='pythonspider/'): self.__headers = { 'User-Agent':'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36' } #抓取時間超時設置 self.timeout = timeout #抓取的相冊個數限制 self.albumLimit = albumLimit #獲取MM列表的地址 self.__mmListUrl = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8' #獲取相冊列表的地址 self.__albumListUrl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=:userId&page=:page' #獲取相冊具體相片的地址 self.__albumDetailUrl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=:userId&album_id=:albumId&page=:page' #MM詳情頁面地址 self.__personUrl = 'https://mm.taobao.com/self/aiShow.htm?userId=:userId' #抓取的文件存放路徑 self.savePath = savePath #每一個MM的照片最多抓多少張 self.picLimit = picLimit #抓取多少張圖片時休息1秒 self.sleepPicCount = sleepPicCount self.__mkdir(self.savePath) #獲取頁面內容,python中的異常繼承關係: https://docs.python.org/3/library/exceptions.html#exception-hierarchy def __getContents(self, url, data=None, encoding=None, isjson=None): try: request = urllib2.Request(url, data, self.__headers) response = urllib2.urlopen(request, timeout=self.timeout) if encoding: contents = response.read().decode(encoding).encode('utf-8') else: contents = response.read() return json.loads(contents,encoding='utf-8') if isjson else contents except urllib2.URLError,e: print '出錯了' + e.reason return None except BaseException,e: print '其餘錯誤' print e.args return None #獲取MM列表 def __getMMList(self, pageIndex): url = self.__mmListUrl data = urllib.urlencode({ 'currentPage':pageIndex, 'pageSize':50 }) list = self.__getContents(url, data, encoding='gbk', isjson=True) if list is None: return None elif list['status'] != 1: return None return list['data']['searchDOList'] #獲取相冊列表 def __getAlbumList(self, mm): albumList = [] baseUrl = self.__albumListUrl.replace(':userId',str(mm['userId'])) indexUrl = baseUrl.replace(':page','1') pageCount = int(self.__getAlbumListPage(indexUrl)) pageCount = pageCount if pageCount<=self.albumLimit else self.albumLimit for i in range(1, pageCount+1): listUrl = baseUrl.replace(':page', str(i)) contents = self.__getContents(listUrl) if (contents is None): continue pattern = re.compile('<h4><a href=".*?album_id=(.*?)&album_flag', re.S) items = re.findall(pattern, contents) for item in items: albumList.append(item) return albumList #獲取單個相冊的相片 def __getPicList(self, album, mm): lists = [] baseUrl = self.__albumDetailUrl.replace(':userId', str(mm['userId'])).replace(':albumId',str(album)) indexUrl = baseUrl.replace(':page','1') totalPage = self.__getPicPage(indexUrl) if totalPage is None: return None pages = range(1, int(totalPage)+1) for page in pages: url = baseUrl.replace(':page', '1') res = self.__getContents(url, isjson=True) if res is not None and res['isError']=='0': for pic in res['picList']: lists.append('http:'+pic['picUrl']) else: print "獲取結果失敗,地址:"+url return lists #獲取單個相冊照片列表的總頁數 def __getPicPage(self, indexUrl): albuminfo = self.__getContents(indexUrl, encoding='gbk', isjson=True) if albuminfo is None: print '獲取相冊照片失敗0,照片地址:'+indexUrl return None if albuminfo['isError'] != '0': print '獲取相冊照片失敗1,照片地址:'+indexUrl return None totalPage = int(albuminfo['totalPage']) return totalPage #下載保存單個相冊的照片,album表示相冊id def __savePics(self, album, mm): print "正在保存"+mm['realName'].encode('utf-8')+'的相冊,相冊id爲:'+album.encode('utf-8') pics = self.__getPicList(album, mm) if pics is None: return index = 1 for pic in pics: print "正在保存"+mm['realName'].encode('utf-8')+'的相冊,相片地址爲:'+pic.encode('utf-8') if index % self.sleepPicCount == 0: print "休息一秒" time.sleep(1) if index >= self.picLimit: print mm["realName"].encode('utf-8') + ":已經保存"+str(self.picLimit)+"張辣" return saveDir = self.savePath + mm['realName'].encode('utf-8') + '/img' self.__mkdir(saveDir) fileName = saveDir + '/'+str(index)+'.jpg' self.__saveImg(pic, fileName) index +=1 #獲取相冊的總頁數 def __getAlbumListPage(self, url): contents = self.__getContents(url) if contents: pattern = re.compile('id="J_Totalpage" value="(.*?)"', re.S) return re.search(pattern, contents).group(1) else: return None #保存MM的基本信息至本地的text文件夾 def __saveMM(self, mm): print '正在保存'+mm['realName'].encode('utf-8')+'的信息' saveDir = self.savePath + mm['realName'] + '/text' self.__mkdir(saveDir) fileName = saveDir + '/info.txt' personUrl = self.__personUrl.replace(':userId', str(mm['userId'])) contents = "姓名:%s\n城市:%s\n體重:%s\n身高:%s\n喜歡:%s\n我的主頁:%s\n"%(mm['realName'].encode('utf-8'),mm['city'].encode('utf-8'),str(mm['weight']).encode('utf-8'),str(mm['height']).encode('utf-8'),str(mm['totalFavorNum']).encode('utf-8'),personUrl.encode('utf-8')) self.__saveTxtFile(contents, fileName) ##保存MM的頭像到本地img文件夾 def __saveMMAvatar(self, mm): print '正在保存'+mm['realName'].encode('utf-8')+'的頭像' saveDir = self.savePath + mm['realName'] + '/img' self.__mkdir(saveDir) fileName = saveDir + '/avatar.jpg' imgUrl = 'http:'+mm['avatarUrl']+'_240x240xz.jpg' #獲取小圖 self.__saveImg(imgUrl, fileName) #寫入文本文件 def __saveTxtFile(self, contents, fileName): handler = open(fileName, 'w') handler.write(contents) handler.close() #寫入圖片 def __saveImg(self, imgUrl, fileName): contents = self.__getContents(imgUrl) if contents: handler = open(fileName, 'wb') handler.write(contents) handler.close() else: print '獲取圖片失敗,圖片地址:'+imgUrl.encode('utf-8') #建立存放圖片或者文本文件的文件夾 def __mkdir(self, saveDir): if os.path.exists(saveDir): return False else: os.makedirs(saveDir) return True #主入口方法 def start(self, startPage, endPage): pages = range(startPage, endPage+1) for i in pages: mmlist = self.__getMMList(i) if not mmlist: print "第%s頁無數據\n"%(str(i)) break for mm in mmlist: self.__saveMM(mm) self.__saveMMAvatar(mm) albumList = self.__getAlbumList(mm) for album in albumList: self.__savePics(album, mm) if __name__ == '__main__': mmspider = MMSpider() mmspider.start(2, 3)
效果:html
保存的圖片:python
保存的文本內容:git
源碼連接:https://github.com/yunshu2009/pythonspiders/blob/master/Taobaomm/Taobaomm.py github