抓取QQ空間相冊

某天,想下載某人的相冊,發現一張一張下載,工作量巨大,因此寫了這個工具。

使用到的工具

  • Fiddler(抓包工具)
  • python(腳本語言)
  • intellij

步驟

分析包

  • 獲取相冊分類鏈接信息

打開某人空間 - 打開fiddler抓取 - 訪問相冊
mark

打開fiddler,搜索相冊名稱 ‘侄子’

mark

獲取相冊鏈接,將其拖到右邊的 composer

mark

拷貝鏈接地址和 cookie 信息

  • 獲取相冊裏面照片的信息

同上面的步驟,打開某個相冊,在fiddler裏面搜索某張照片的名稱

mark

獲取相冊列表信息鏈接

編程

貼出主要代碼

# coding=utf-8
import os
import threading

from Queue import Empty, Queue
from py2.http_utils import Http
from py2 import soup, cookie_utils, json_utils, regex_utils, DOWNLOAD_PATH

# Album-list request URL; fetched by getAlbumListModeSort().
# All query parameters (g_tk, hostUin, uin, ...) are fixed values from one captured request.
photo_url = 'http://h5.qzone.qq.com/proxy/domain/alist.photo.qq.com/fcgi-bin/fcg_list_album_v3?g_tk=1210057952&callback=shine0_Callback&t=419513462&hostUin=773817625&uin=77086540&appid=4&inCharset=utf-8&outCharset=utf-8&source=qzone&plat=qzone&format=jsonp&notice=0&filter=1&handset=4&pageNumModeSort=40&pageNumModeClass=15&needUserInfo=1&idcNum=0&callbackFun=shine0&_=1484721771570'
# Photo-list request URL for one album; getDetail() uses it as a template and
# substitutes the topicId value with the id of the album being fetched.
detail_url = 'http://h5.qzone.qq.com/proxy/domain/shplist.photo.qzone.qq.com/fcgi-bin/cgi_list_photo?g_tk=1210057952&callback=shine3_Callback&t=160656776&mode=0&idcNum=0&hostUin=773817625&topicId=5ceb20e9-d727-4285-b177-dc91df68e67b&noTopic=0&uin=77086540&pageStart=0&pageNum=30&skipCmtCount=0&singleurl=1&batchId=&notice=0&appid=4&inCharset=utf-8&outCharset=utf-8&source=qzone&plat=qzone&outstyle=json&format=jsonp&json_esc=1&question=&answer=&callbackFun=shine3&_=1484721812872'
# Raw cookie string that is parsed by cookie_utils.parse_fiddler_str_cookie below.
# NOTE(review): this looks like a live session cookie (skey, p_skey, pt4_token);
# consider loading it from a file or environment variable instead of hard-coding it.
cookie1 = " pac_uid=1_77086540; tvfe_boss_uuid=2c4e1161f404d6b1; QZ_FE_WEBP_SUPPORT=1; cpu_performance_v8=3; pgv_pvid=9548679930; o_cookie=77086540; __Q_w_s__QZN_TodoMsgCnt=1; __Q_w_s_hat_seed=1; RK=mBGHAsurXZ; randomSeed=23363; qq_photo_key=71b85dd259bd647f18a963405c43e803; ptisp=ctc; ptcz=eae4b39cd8587e96c54d1585575c875c51beb348f479e89c599533f6576cde1e; pt2gguin=o0077086540; uin=o0077086540; skey=@D4MK2Tg8t; p_uin=o0077086540; p_skey=wqJMnZAj7nEKuDrTMmATE7Cd0vBccQ*KloG*3aWgtF8_; pt4_token=VjVJc1wZjNTzVlXNOzKdDTXd*BdE2v7BEqs3ymF5G6E_; rv2=8066000527FC1F71AD31043B7CE2B89E62CC79CD9D7FEF3E87; property20=C112B65F9A5C0A4A00E4FE7DCC637E839780CB4B4072589CF28B5A90AD807919FE353ECC066D4A54; pgv_info=ssid=s7926413392; qzspeedup=sdch"

# Root directory for this run; getDetail() joins the album name onto it.
download_path = os.path.join(DOWNLOAD_PATH, 'csy')

# The topicId value embedded in the captured detail_url; getDetail() replaces
# this exact substring with each album's id.
topicId = regex_utils.find(r'topicId=(.*?)&', detail_url)
# Raise pageNum from 30 to 500 (presumably the number of photos returned per
# request - TODO confirm). getDetail() issues a single request per album.
detail_url = detail_url.replace("&pageNum=30&", "&pageNum=500&")

# One shared HTTP client carrying the parsed cookie; used for the listing
# requests and by the download worker threads.
http = Http()
cookie2 = cookie_utils.parse_fiddler_str_cookie(cookie1)
http.set_cookie(cookie2)


def getAlbumListModeSort():
    """Fetch the album index and queue every album that has no question.

    Requests ``photo_url``, strips the JSONP callback wrapper and reads the
    album entries from the first of the keys ``albumListModeSort``,
    ``albumList`` or ``albumListModeClass`` that is present under ``data``.
    For each album the name, id, ``priv`` value and question are printed, and
    albums whose ``question`` is absent or ``None`` are passed to
    ``getDetail``.

    An empty response, an unparseable payload or a payload without a ``data``
    object is reported and skipped instead of raising.
    """
    html1 = http.get(photo_url)
    if not html1:
        return

    # The body is JSONP: keep only what sits between the outermost parentheses.
    t1 = html1[html1.find("(") + 1:html1.rfind(")")]
    j1 = json_utils.loads(t1)
    data = j1.get('data') if isinstance(j1, dict) else None
    if not isinstance(data, dict):
        print('album list response contains no data')
        return

    albumList = []
    if 'albumListModeSort' in data:
        albumList = data['albumListModeSort']
    elif 'albumList' in data:
        albumList = data['albumList']
    elif 'albumListModeClass' in data:
        # Albums are grouped by class here; flatten the groups into one list.
        for k in data['albumListModeClass'] or []:
            if 'albumList' in k and k['albumList'] is not None:
                albumList.extend(k['albumList'])

    # A key may be present with a null value; treat that as "no albums".
    for p in albumList or []:
        print(p['name'])
        print(p['id'])
        print(p['priv'])
        question = p.get('question')
        print(question)
        if question is None:
            getDetail(p['name'], p['id'])
        print('---------------------------')


# Pending downloads as (url, directory, file_name) tuples: filled by
# getDetail() and drained by downloadFileThread().
downloadQueue = Queue()
# Set to True by downloadFile() when it starts the worker threads, so that
# later calls do not start them again.
downLoadIsStart = False


def getDetail(name, id):
    """Queue every photo of one album for download.

    Builds the album's photo-list URL by replacing the template ``topicId``
    in ``detail_url`` with ``id``, fetches it, strips the JSONP wrapper and
    reads the photos from ``data['photoList']`` or, when that key is absent,
    from the ``photoList`` of every entry in ``data['rangeList']``. Each photo
    is put on ``downloadQueue`` as ``(url, directory, file_name)`` where the
    file name is ``<modifytime>_<name>``.

    Args:
        name: Album name; used as the sub-directory under ``download_path``.
        id: Album id substituted into the request URL.

    An empty response, or a payload that cannot be parsed or has no ``data``
    object, is reported and the album is skipped instead of raising.
    """
    print('get photo: %s, %s' % (name, id))
    detail_url1 = detail_url.replace(topicId, id)
    html1 = http.get(detail_url1)
    path1 = os.path.join(download_path, name)
    if not html1:
        return

    # The body is JSONP: keep only what sits between the outermost parentheses.
    t1 = html1[html1.find("(") + 1:html1.rfind(")")]
    j1 = json_utils.loads(t1)
    if j1 is None:
        # Retry with the text decoded as GBK, dropping undecodable bytes.
        j1 = json_utils.loads(t1.decode("gbk", "ignore"))
    data = j1.get('data') if isinstance(j1, dict) else None
    if not isinstance(data, dict):
        print('no photo data for album: %s' % name)
        return

    photoList = []
    if 'photoList' in data:
        photoList = data['photoList']
    elif data.get('rangeList') is not None:
        for k in data['rangeList']:
            if 'photoList' in k and k['photoList'] is not None:
                photoList.extend(k['photoList'])
    if not photoList:
        return

    # Start the workers once per album rather than once per photo; they wait
    # on the queue, so starting them before the items are queued is safe.
    downloadFile()
    for p in photoList:
        print(p['url'])
        downloadQueue.put((p['url'], path1, str(p['modifytime']) + "_" + p['name']))


def downloadFile():
    """Start the five download worker threads on the first call only.

    The module-level flag ``downLoadIsStart`` records that the workers were
    started; every later call returns without doing anything.
    """
    global downLoadIsStart
    if downLoadIsStart:
        return
    downLoadIsStart = True

    # Create all workers first, then start them.
    workers = [threading.Thread(target=downloadFileThread) for _ in range(5)]
    for worker in workers:
        worker.start()
    print('start....')


def downloadFileThread():
    """Worker loop: download queued photos until the queue stays empty.

    Repeatedly takes a ``(url, directory, file_name)`` tuple from
    ``downloadQueue`` and hands it to ``http.download_file``. The worker
    returns when no item arrives within 5 seconds or when a ``None`` item is
    received. A failed download is reported and skipped, so one bad photo
    does not stop the worker.
    """
    while True:
        try:
            val = downloadQueue.get(block=True, timeout=5)
        except Empty:
            # Nothing queued for 5 seconds: treat the work as finished.
            return
        if val is None:
            return
        try:
            http.download_file(val[0], path=val[1], file_name=val[2])
        except Exception as e:
            # Best effort: report the failure and move on to the next item.
            # %r keeps the message ASCII-safe for non-ASCII URLs and errors.
            print('download failed: %r (%r)' % (val[0], e))


if __name__ == "__main__":
    # Script entry point: list the albums and queue their photos for download.
    getAlbumListModeSort()
    # Disabled alternative: fetch one album directly by name and id.
    # getDetail(u'葉落而知秋', u'V13N7D3U4c7iMA')

運行結果

mark

相關文章
相關標籤/搜索