Python 網絡爬蟲源碼(抓取源視頻)

樣本

網絡來源

作者: hehao

原文抓取linux520網站的***測試視頻,無意侵犯linux520網站權益。

 

源碼分享學習學習

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#version 0.1
#author:hehao
#python version:2.7.2
#需要安裝psutil庫
#

from os.path import basename

from urlparse import urlsplit

import os

import urllib2

import sys

# psutil is a third-party package used by the process helpers in this script;
# stop immediately with an installation hint when it is not available.
try:
    import psutil
except ImportError:
    # Only a missing package is handled here; any other failure raised while
    # importing psutil propagates with its full traceback.
    print("please install psutil ex:pypm install psutil")
    # Non-zero status: the script cannot run without this dependency.
    sys.exit(1)

import re

def url2name(url):
    """Return the last path component of *url*.

    Only the path part of the URL is considered, so the query string and
    fragment never end up in the result. A URL whose path ends with ``/``
    yields an empty string.
    """
    # Index 2 of the urlsplit() result is the path component.
    return basename(urlsplit(url)[2])

# Download the specified file
def download(url, localFileName = None):
    """Fetch *url* and write the response body to a local file.

    The target file name is chosen in this order:

    1. ``localFileName`` when given (always wins);
    2. the ``filename=`` value of a ``Content-Disposition`` response header;
    3. the last path component of the final URL if the request was
       redirected;
    4. the last path component of *url*.

    Args:
        url: Address of the resource to download.
        localFileName: Optional name (or path) to save the file as.

    Raises:
        Whatever ``urllib2.urlopen`` or ``open`` raise (for example on a
        network error, or when no usable file name could be derived).
    """
    localName = url2name(url)
    r = urllib2.urlopen(urllib2.Request(url))
    try:
        headers = r.info()
        if 'Content-Disposition' in headers:
            # If the response has Content-Disposition, we take file name from it
            pieces = headers['Content-Disposition'].split('filename=')
            if len(pieces) > 1:
                # Drop any parameters that follow the file name, then the
                # surrounding whitespace and quotes.
                suggested = pieces[1].split(';')[0].strip().strip('"\'')
                # The header is controlled by the server: keep only the final
                # path component so it cannot direct the write outside the
                # current directory.
                suggested = basename(suggested.replace('\\', '/'))
                if suggested:
                    localName = suggested
        elif r.url != url:
            # if we were redirected, the real file name we take from the final URL
            localName = url2name(r.url)
        if localFileName:
            # we can force to save the file as specified name
            localName = localFileName
        with open(localName, 'wb') as f:
            # Copy in chunks so a large video is never held in memory whole.
            while True:
                chunk = r.read(64 * 1024)
                if not chunk:
                    break
                f.write(chunk)
    finally:
        # Release the connection even when reading or writing fails.
        r.close()

# Get a process ID from a process name
def getpid(process_name):
    """Return the PID of the first process whose description mentions *process_name*.

    Every process reported by psutil is examined; the match is a substring
    test against ``str(process)``.

    Args:
        process_name: Text to look for, e.g. an executable name.

    Returns:
        The PID of the first matching process, or 0 when none matches.
    """
    # process_iter() replaces get_process_list(), which later psutil releases
    # no longer provide.
    for proc in psutil.process_iter():
        if process_name in str(proc):
            return proc.pid
    # Reached only after every process has been checked.
    return 0

# Kill the process with the given process ID
def killpid(pid):
    """Kill the process identified by *pid*, ignoring failures.

    Args:
        pid: Process ID to kill. A falsy value (0 or None) means "no
            process" and is ignored; 0 is what ``getpid`` returns when
            nothing was found.

    Returns:
        None when the kill request was issued, 0 when there was nothing to
        kill or the attempt failed.
    """
    if not pid:
        return 0
    try:
        # Looking the process up can fail too (it may already have exited),
        # so it belongs inside the guarded section.
        psutil.Process(pid).kill()
    except Exception:
        # Best effort: failing to kill the process is not fatal to the caller.
        return 0

# Analyse the flash file with swfdump and extract the real file name
def analy_swf(swf_path):
    """Run ``swfdump.exe -a`` on *swf_path* and return the first ``<uri>`` value.

    The tool's output is scanned line by line; as soon as a line contains a
    ``<uri>...</uri>`` element its content is returned and the tool is
    stopped.

    Args:
        swf_path: Path of the SWF file to inspect.

    Returns:
        The text of the first ``<uri>`` element found, or None when the
        output contains none.

    Raises:
        OSError: If ``swfdump.exe`` cannot be started.
    """
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    proc = subprocess.Popen(
        ["swfdump.exe", "-a", swf_path],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    try:
        for line in proc.stdout:
            found = re.findall(r"""<uri>([\S\s]*?)</uri>""", line)
            if found:
                return found[0]
        return None
    finally:
        # Stop exactly the process started above (rather than looking one up
        # by name), including on the early return once the name was found.
        if proc.poll() is None:
            try:
                proc.kill()
            except OSError:
                # The process exited between poll() and kill().
                pass
        proc.stdout.close()
        proc.wait()

# Download the real video file
def download_realvideo(swf_url,url,id):
    """Download the video that the SWF at *swf_url* refers to.

    The SWF is saved as ``tmp.swf`` in the current directory, analysed with
    ``analy_swf`` to obtain the real file name, and the file at
    ``url + name`` is then saved as ``"<id>_<name>"``. The temporary SWF is
    removed afterwards, whether or not the download succeeded.

    Args:
        swf_url: Address of the SWF file.
        url: Base address the extracted file name is appended to.
        id: Value used as prefix of the saved file name.

    Raises:
        ValueError: If no file name could be extracted from the SWF.
        Any exception raised by ``download`` or ``analy_swf``.
    """
    tmp_path = 'tmp.swf'
    try:
        download(swf_url, tmp_path)
        r_name = analy_swf(tmp_path)
        if not r_name:
            # Fail with a clear message instead of a TypeError from
            # concatenating None below.
            raise ValueError("no <uri> entry found in " + swf_url)
        download(url + r_name, str(id) + "_" + r_name)
    finally:
        # The first download may have failed before the file was created.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

#url="#該url不公開(視頻地址)"

#u="該url不公開(地址)"

#download_realvideo(url,u,138)

# Base address of the video pages; the page number and "/" are appended to it.
url="該url不公開(地址)"

# Patterns are compiled once, outside the crawl loop.
# Value of the player's "src" parameter on a video page.
_SRC_PARAM_RE = re.compile(r"""<param name\=\"src\"\svalue\=\"(.*?)\"\/>""")
# Real file name inside a "*_config.xml" document.
_URI_RE = re.compile(r"""<uri>([\S\s]*?)</uri>""")

# Crawl pages 1..199: find the player source on each page and download the
# video it points to. A page that cannot be processed is skipped.
for x in range(1,200):
    u=url+str(x)+"/"
    print(u)

    # Fetch the page; pages that cannot be retrieved are skipped silently.
    try:
        page = urllib2.urlopen(urllib2.Request(u))
        try:
            a = page.read()
        finally:
            page.close()
    except Exception:
        continue

    # Extract the player source with a regular expression.
    m = _SRC_PARAM_RE.search(a)
    if m is None:
        continue
    s = m.group(1)

    try:
        if '#' in s:
            # The real file name is listed in "<prefix>_config.xml", where
            # <prefix> is the part of the source before the first "_".
            cfg = urllib2.urlopen(urllib2.Request(u+s.split('_')[0]+'_config.xml'))
            try:
                real_name = _URI_RE.findall(cfg.read())
            finally:
                cfg.close()
            download(u+real_name[0],str(x)+'_'+real_name[0])
        elif 'swf' in s:
            # The real file name has to be extracted from the SWF itself.
            download_realvideo(u+s,u,x)
        else:
            # The source already is the file to download.
            download(u+s,str(x)+'_'+s)
    except Exception as err:
        # Best effort: report the failure and move on to the next page.
        # Catching Exception (not everything) keeps Ctrl-C working.
        sys.stderr.write("skipped %s: %r\n" % (u, err))
        continue

相關文章
相關標籤/搜索