# 抓取蝦米歌曲信息 (Fetch Xiami song information)

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 01 18:09:21 2014

@author: omom
"""
import urllib2
from bs4 import BeautifulSoup

# Embedded-player widget URL. Split on "_", its second field is the
# comma-separated list of song ids (with a trailing comma).
src = (
    "http://www.xiami.com/widget/40537093_"
    "376239,185689,1769863609,3351090,1769863610,3562321,183942,1769736296,3418502,1770127750,1770201852,3351083,3351088,3351082,1769496545,1769496547,1769496546,3418497,"
    "_235_346_FF8719_494949_0/multiPlayer.swf"
)
# Alternative widget URLs, kept for reference:
#src="http://www.xiami.com/widget/40537093_1771331004,1771331002,55553,3478385,1769187978,380807,3478389,1770464110,55552,1771331001,380865,3478386,380834,380869,55670,55823,1772165872,55549,1769187987,380818,_235_346_FF8719_494949_0/multiPlayer.swf"
#src="http://www.xiami.com/widget/40537093_380863,380832,55550,380830,380837,380861,380799,380866,380808,1770464109,55559,380860,1771512727,3478391,1771331023,55711,55556,380797,1769074612,380788,380810,3478387,380852,55705,55865,1769187981,380787,380862,1770464107,1771360882,55700,1770464108,55869,55867,3478388,380835,1769187983,3364419,1769115993,1771331005,_235_346_FF8719_494949_0/multiPlayer.swf"

# Per-song playlist endpoint; "%s" is filled with the song id.
music_base = "http://www.xiami.com/song/playlist/id/%s/object_name/default/object_id"

# a: everything before the first "_"; b: the id list; c: the remainder.
a, b, c = src.split("_", 2)
# Drop the trailing comma so the split below yields no empty id.
b = b.rstrip(",")
ids = b.split(",")

def decrypt_url(s):
    """Decode a scrambled ``location`` string into a URL.

    The first character of *s* is a digit giving the number of rows. The
    remaining characters are the rows of a grid laid end to end, where the
    first ``len(rest) % rows`` rows hold one character more than the others.
    Reading that grid column by column yields a percent-encoded string,
    which is unquoted; every ``^`` is replaced with ``0``.

    Args:
        s: The scrambled string.

    Returns:
        The decoded URL, or an empty string when *s* is empty.

    Raises:
        ValueError: If the leading character is not a digit, or is zero
            (including a leading ``^``, which is read as ``0``).
    """
    # Resolved locally so the helper works with either urllib layout; on
    # Python 2 ``urllib.unquote`` is the function ``urllib2`` re-exports.
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote

    if not s:
        return ''

    s = s.replace('^', '0')
    rows_count = int(s[0])
    if rows_count <= 0:
        raise ValueError("row count must be a non-zero digit, got %r" % s[0])

    body = s[1:]
    row_len, remainder = divmod(len(body), rows_count)

    # Cut the body into rows; the first `remainder` rows get one extra char.
    rows = []
    start = 0
    for index in range(rows_count):
        stop = start + row_len + (1 if index < remainder else 0)
        rows.append(body[start:stop])
        start = stop

    # Read column by column. Longer rows come first, so a row too short for
    # the last column simply contributes nothing.
    chars = [
        row[col]
        for col in range(row_len + 1)
        for row in rows
        if col < len(row)
    ]
    return unquote(''.join(chars)).replace('^', '0')
    

import time
def collect(mid=376239, high_quality=False):
    """Fetch the playlist XML for one song and return its metadata.

    Requests ``music_base % mid`` with browser-like headers, retrying on
    ``urllib2.HTTPError`` up to five attempts, then parses the response as
    XML and reads the song, album and artist fields from it.

    Args:
        mid: Song id substituted into ``music_base``.
        high_quality: When true, the decoded URL is cut at ``?auth_key`` and
            every ``_l`` in it is rewritten to ``_h`` (presumably selecting
            a higher-quality file -- confirm against the service).

    Returns:
        A dict with the keys ``title``, ``song_id``, ``url``, ``lyric``,
        ``background``, ``album_id``, ``album_pic_s``, ``album_pic``,
        ``album_name``, ``artist_id``, ``artist`` and ``xiami`` (always
        ``True``). An empty dict is returned when the request fails five
        times or an expected element is missing from the response.
    """
    req = urllib2.Request(music_base % str(mid))
    req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    req.add_header("Accept-Language", "zh-CN,zh;q=0.8")
    req.add_header("Cache-Control", "no-cache")
    req.add_header("Connection", "close")
    req.add_header("Pragma", "no-cache")
    req.add_header("Referer", "http://www.baidu.com/")
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36")
    # NOTE(security): hard-coded session cookie with account tokens; it
    # should be supplied from configuration rather than committed in source.
    req.add_header("cookie","_unsign_token=d0e87c7230b44e116e3b8e96c48c9b62; __gads=ID=3e9c72b9e0b3e7ba:T=1407824092:S=ALNI_MYxedT1iMAiA-IXbcgEu4Ss_XiRaw; box_opened=1; bdshare_firstime=1409207591670; __utma=251084815.350459004.1409209135.1409209135.1409209135.1; __utmz=251084815.1409209135.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); member_auth=2WydGIYavmhn16fDTt9ldyYb5%2BHTT2eFyY9Yjb4ovwQnIooIY9H%2Bx6uVQg5L3yCaq2HKtwNJXYSZg3aFgGLx8Kg; user=40534293%22%E5%93%8E%E5%B0%8F%E7%AC%A8%E8%9B%8Ba%22%220%221%22%3Ca+href%3D%27%2Fwebsitehelp%23help9_3%27+%3Edo%3C%2Fa%3E%220%220%220%22ee71485989%221409217771; ahtena_is_show=false; recent_tags=%E6%BF%80%E6%83%85+%E7%97%9B%E8%8B%A6+%E4%BC%A4%E5%BF%83+%E5%BF%A7%E4%BC%A4+; user_from=1; t_sign_auth=0; __guestplay=MTc3MjEzMDMyMywxOzE3NzE0MTkwNTQsMjsxNzY5OTI0MjQ0LDE%3D; pnm_cku822=187n%2BqZ9mgNqgJnCG0WtMC8x7vAtsC0zrXQcNA%3D%7CnOiH84T3i%2FOL%2F4zwi%2FyG9VU%3D%7CneiHGXz6UeRW5k4rRCFXLkskQdt3xmHTad%2B6Gro%3D%7Cmu6b9JHlkuGd5Z3pmuad6pDjnu2c65%2Fkneef5JjhluyX7JjhmuCFJQ%3D%3D%7Cm%2B%2BT%2FGIXeAx4D2AUbwBl1mcbhfZW1n3Fv8F03GvTZte00XHR%7CmO6BH2wDdg11Gm4bbht0B2gcYBVmCX0OdQZpHWEUZwh8D3gDowM%3D%7Cme6d7oHyneiH84Twn%2BmR64TzUw%3D%3D; CNZZDATA921624=cnzz_eid%3D6125411959-1407824089-%26ntime%3D1409550640; CNZZDATA2629111=cnzz_eid%3D1781743730-1407814089-%26ntime%3D1409450640; _xiamitoken=7cec7fe673a3672812c4b714a31d6687; isg=67257CF91c74F3297A603C00A816D262; sec=5401410089735bee8e0075e0b6825e0ba6a0a485")

    failures = 0
    while 1:
        try:
            page = urllib2.urlopen(req)
            # Pause after each successful request to space out the calls.
            time.sleep(0.5)
            break
        except urllib2.HTTPError:
            failures += 1
            if failures == 5:
                # Single-argument print: same output under the print
                # statement and the print function.
                print('id is: %s' % (mid,))
                return {}

    # Always release the connection, even if reading or parsing fails.
    try:
        dom = BeautifulSoup(page.read(), features="xml")
    finally:
        page.close()

    # (result key, XML element name), in the order the fields are read.
    fields = (
        ("title", "title"),
        ("song_id", "song_id"),
        ("url", "location"),
        ("lyric", "lyric"),
        ("background", "background"),
        ("album_id", "album_id"),
        ("album_pic_s", "pic"),
        ("album_pic", "album_pic"),
        ("album_name", "album_name"),
        ("artist_id", "artist_id"),
        ("artist", "artist"),
    )
    info = {}
    try:
        for key, tag in fields:
            info[key] = dom.find(tag).text
    except AttributeError:
        # dom.find() returned None: an expected element is missing. Report
        # the song id and give up on this song, mirroring the HTTP-failure
        # path, instead of continuing with unbound values.
        print('id is: %s' % (mid,))
        return {}

    info["url"] = decrypt_url(info["url"])
    print(info["title"])

    if high_quality:
        # Reversing turns the "l_" -> "h_" replacement into "_l" -> "_h"
        # on the original string.
        info["url"] = info["url"].split("?auth_key")[0][::-1].replace("l_", "h_")[::-1]

    info["xiami"] = True
    return info
    
    
    
#from  pprint import pprint 
#pprint(collect())

def split_var(s):
    """Print boilerplate generated from a block of ``name=value`` lines.

    Every non-blank line of *s* contributes the text before its first ``=``
    (with surrounding whitespace removed) as a variable name. Two snippets
    are printed, separated by blank lines:

    * a dict literal mapping each name to the variable of the same name,
      e.g. ``{"a":a,"b":b,}``;
    * one ``name=i["name"]`` assignment per line.

    Args:
        s: Multi-line text of assignment statements.

    Returns:
        None. The generated text is written to standard output.
    """
    names = []
    for raw_line in s.split("\n"):
        line = raw_line.strip()
        if line == "":
            continue
        # Strip so that "a = 1" yields the name "a", not "a ".
        names.append(line.split("=")[0].strip())

    dict_literal = "{" + "".join('"%s":%s,' % (name, name) for name in names) + "}"
    assignments = "".join('%s=i["%s"]\n' % (name, name) for name in names)

    # Single-argument print calls emit the same text under both the print
    # statement and the print function.
    print('')
    print(dict_literal)
    print('')
    print(assignments)
    print('')
    print('')
    
    
# Sample input for split_var(): the field-extraction assignments used in
# collect(). Only the text before each "=" is used by split_var().
s='''
    title=dom.find("title").text
    song_id=dom.find("song_id").text
    url=decrypt_url(url)
    lyric=dom.find("lyric").text
    background=dom.find("background").text
    
    album_id=dom.find("album_id").text
    album_pic_s=dom.find("pic").text
    album_pic=dom.find("album_pic").text
    album_name=dom.find("album_name").text    
    
    artist_id=dom.find("artist_id").text
    artist=dom.find("artist").text
      
    '''


## No longer valid; must be fetched in real time.
## NOTE(review): the original note does not say what has expired -- confirm.
# Runs at import time and prints the generated snippets to stdout.
split_var(s)
from pprint import pprint

# Gather metadata for every song id, skipping ids whose lookup came back
# empty.
dst = []
for i in ids:
    data = collect(i)
    if not data:
        continue
    dst.append(data)

import json
import urllib

# JSON-encode the collected songs, then wrap the JSON text under the "data"
# key; `dst` is rebound from the list to that wrapper dict.
data = json.dumps(dst)
dst = dict(data=data)

#print urllib.urlopen("http://localhost/music/upload",data=urllib.urlencode(dst)).read()


    
    
# 相關文章 (Related articles)
# 相關標籤/搜索 (Related tags / search)