這是一個純 Python 新手寫的下載百度空間文章的 Python 源碼,代碼寫得很差,能用不能看。大家看看效果就行,不要求代碼的精簡程度。大牛請飄過。
下載百度空間文章 Python 源碼的使用方法:
在 cmd 中輸入:> python "F:\Walkbox\Python\mywork\baidu\getArticleId - r1.py" bspeng922 6
命令格式:python 文件存放路徑 [用戶名] [下載頁數]
下載頁數可以不填,不填則爲全部下載。如果大於實際總頁數,則會重複下載第一頁的內容。
這段代碼只適用於新版的百度空間,只測試了「低調優雅」模板,生成的是 HTML 文件。
在這裏還是要推薦下我自己建的 Python 開發學習羣:483546416,羣裏都是學 Python 開發的,如果你正在學習 Python,小編歡迎你加入,大家都是軟件開發黨,不定期分享乾貨(只有 Python 軟件開發相關的),包括我自己整理的一份 2018 最新的 Python 進階資料和高級開發教程,歡迎進階中和想深入 Python 的小夥伴。
同時我忽然發現一個奇特的功能,這段代碼居然可以用來刷百度空間的訪問量,不錯哦。
下載百度空間文章的 Python 源碼如下:
# -*- coding: utf8 -*-
import urllib
import re,os,sys,time
# Matches every <img ...> tag and captures the value of its src attribute.
_IMG_TAG_RE = re.compile(r'<img.*?src="(.*?)".*?>')

# The <img> tag layouts that get rewritten to point at the downloaded copy.
# "%s" is filled with the regex-escaped image file name.  The URL part is
# limited to [^"] so a match can never run past the end of the src attribute
# into a neighbouring tag on the same line.
_REMOTE_IMG_TEMPLATES = (
    r'<img width="\d{1,4}" height="\d{1,4}" src="https?://[^"]*?/%s" />',
    r'<img src="https?://[^"]*?/%s" small="0" />',
    r'<img src="https?://[^"]*?/%s" />',
    r'<img small="0" src="https?://[^"]*?/%s" />',
)


def _ensure_dir(path):
    """Create directory *path* (and missing parents) unless it exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _quoted_int(line, default):
    """Return the integer between the first and last "'" of *line*.

    Falls back to *default* when the line has no quotes or the quoted text
    is not a number.
    """
    try:
        return int(line[line.index("'") + 1:line.rindex("'")])
    except ValueError:
        return default


def _count_pages(username):
    """Return the number of article-list pages of *username*'s blog.

    Reads the blog front page and picks up the quoted integers on the lines
    mentioning ``allCount`` and ``pageSize``.  Returns 0 when either value
    is missing or zero.
    """
    totalRecord = pagesize = 0
    frontPage = urllib.urlopen("http://hi.baidu.com/new/%s" % username)
    try:
        for line in frontPage:
            if "allCount" in line:
                totalRecord = _quoted_int(line, totalRecord)
            if "pageSize" in line:
                pagesize = _quoted_int(line, pagesize)
    finally:
        frontPage.close()
    if totalRecord <= 0 or pagesize <= 0:
        return 0
    # Ceiling division: a partially filled last page still counts.
    return (totalRecord + pagesize - 1) // pagesize


def _localize_images(html, imgPath, imgDir):
    """Download the remote images used in *html* and relink them locally.

    Every ``<img>`` whose src contains ``://`` is fetched into *imgPath*.
    When the download succeeds, the tags in the known layouts are replaced
    by ``<img src="imgDir/name" />``.  When it fails, a message is printed
    and the tag keeps its remote address.  Returns the rewritten text.
    """
    for imgLink in _IMG_TAG_RE.findall(html):
        if "://" not in imgLink:
            continue  # not an absolute URL, nothing to download
        imageName = imgLink[imgLink.rindex("/") + 1:]
        if not imageName:
            continue  # URL ends with "/": no file name to save under
        try:
            urllib.urlretrieve(imgLink, os.path.join(imgPath, imageName))
        except Exception:
            # Best effort: one broken image must not abort the article.
            print("cannot download this image: " + imageName)
            continue
        localTag = '<img src="%s/%s" />' % (imgDir, imageName)
        escapedName = re.escape(imageName)  # "." etc. must match literally
        remoteTag = re.compile(
            "|".join(tpl % escapedName for tpl in _REMOTE_IMG_TEMPLATES))
        # A callable replacement keeps backslashes in the name literal.
        html = remoteTag.sub(lambda match: localTag, html)
    return html


def _save_article(username, articleId, destPath, imgPath, imgDir):
    """Fetch one article and write its body to *destPath*.

    Only lines containing ``content-head clearfix`` are kept.  Their images
    are localized and everything from the ``mod-post-info clearfix`` element
    onwards is cut off.
    """
    article = urllib.urlopen(
        "http://hi.baidu.com/%s/item/%s" % (username, articleId))
    try:
        with open(destPath, "w") as out:
            for line in article:
                if "content-head clearfix" not in line:
                    continue
                line = _localize_images(line, imgPath, imgDir)
                pos = line.find("mod-post-info clearfix")
                if pos > 0:
                    # Cut 12 characters before the marker -- presumably the
                    # opening '<div class="' of that element (TODO confirm).
                    # max() guards against a negative slice index.
                    line = line[:max(pos - 12, 0)]
                out.write(line.strip())
    finally:
        article.close()


def articleDownload(username, pageCount, saveDrive=r"E:\test"):
    """Download the articles of a Baidu Space blog as local HTML files.

    Uses ``urllib.urlopen`` / ``urllib.urlretrieve`` (the Python 2 urllib
    API).  Output layout below *saveDrive*::

        <username>.html           index with one link per article
        <username>/<id>.html      body of each article
        <username>/img/<name>     images referenced by the articles

    Args:
        username: Blog owner; an empty string selects "bspeng922".
        pageCount: Number of list pages to download, as an int or numeric
            string.  An empty string, None, 0 or a negative value means
            "all pages" (the count is then read from the blog front page).
        saveDrive: Root directory for the output.

    Raises:
        ValueError: If *pageCount* is a non-empty, non-numeric string.
    """
    if username == "":
        username = "bspeng922"
    if pageCount == "" or pageCount is None:
        requestedPages = 0
    else:
        requestedPages = int(pageCount)

    print("Blog: http://hi.baidu.com/new/%s" % username)

    mydrive = os.path.join(saveDrive, username)
    imgDir = "img"
    imgPath = os.path.join(mydrive, imgDir)
    _ensure_dir(imgPath)  # also creates saveDrive and mydrive

    if requestedPages > 0:
        lastPage = requestedPages
    else:
        lastPage = _count_pages(username)
    print("Page Count:  %d" % lastPage)

    # Captures (article id, title) from each article link of a list page.
    aTagCmp = re.compile(
        (r'<a href="/%s/item/(\w*?)" class="a-incontent a-title '
         r'cs-contentblock-hoverlink" target=_blank>(.*?)</a>')
        % re.escape(username))

    articleCount = 0
    sumHtmlPath = os.path.join(saveDrive, "%s.html" % username)
    with open(sumHtmlPath, "w") as sumfile:
        for page in range(1, lastPage + 1):
            print("Page:  %d" % page)
            listing = urllib.urlopen(
                "http://hi.baidu.com/new/%s?page=%d" % (username, page))
            try:
                for line in listing:
                    if "a-incontent a-title" not in line:
                        continue
                    for articleId, title in aTagCmp.findall(line):
                        articleCount += 1
                        # "/" is the URL separator; it works on every OS.
                        sumfile.write(
                            "<a href='%s/%s.html' target='blank'>%s</a><br>"
                            % (username, articleId, title))
                        _save_article(
                            username, articleId,
                            os.path.join(mydrive, "%s.html" % articleId),
                            imgPath, imgDir)
            finally:
                listing.close()
    print("Article Count:  %d" % articleCount)
if __name__ == "__main__":
    # Usage: python <script> [username] [pages]
    # Without arguments both values are asked for interactively.
    st = time.time()
    if len(sys.argv) >= 2:
        uname = sys.argv[1]
        # Hand the page count over exactly as typed: articleDownload() does
        # its own int() conversion and range arithmetic, so adding 1 here
        # would fetch one page too many.  An empty string means "download
        # every page", the same as pressing Enter at the prompt below.
        pages = sys.argv[2] if len(sys.argv) > 2 else ""
    else:
        uname = raw_input("Username -> ")
        pages = raw_input("Page -> ")
    articleDownload(uname, pages)
    et = time.time()
    print("Time used: %0.2fs" % (et - st))