import os
import re
import urllib.request as ur

import easygui as e
from bs4 import BeautifulSoup

# Pretend to be an ordinary browser so the site does not turn us away.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 '
                         '(KHTML, like Gecko) Chrome/26.0.1410.43 '
                         'BIDUBrowser/6.x Safari/537.31'}


def find_download(h_soup, where, count):
    # The branches below classify the post title and pull out the ID used in
    # the filename. Every site is laid out differently; you have to find the
    # pattern yourself.
    H = h_soup.find("h1", class_="article-title").a.get_text()
    if H[0] != "【":
        try:  # some posts carry no ID at all
            str1 = h_soup.find("h2").span.span.get_text()
        except Exception:
            str1 = "404notfound"
    elif "【廣告招租】" in H:  # skip ad posts entirely
        return count
    else:
        str1 = ''
        for ch in H:  # copy the title up to and including the closing 】
            str1 += ch
            if ch == "】":
                break
    timeout = 10  # maximum time allowed for a single download
    for l in h_soup.find_all("img", class_=re.compile("align.+"),
                             src=re.compile(r"\.[a-zA-Z]{3,4}$")):
        # The if below filters out four junk images that recur on every page.
        if l["src"] in ["http://ww1.sinaimg.cn/large/e75a115bgw1ezy20gtzgpj20m80b4gol.jpg",
                        "http://ww2.sinaimg.cn/mw690/e75a115bgw1f8ambv7odog20a00327h9.gif",
                        "http://ww3.sinaimg.cn/mw690/e75a115bgw1f76bno2v7kj20a0032wew.jpg",
                        "http://ww2.sinaimg.cn/mw690/e75a115bgw1ezm9k3vracj20by0by0tk.jpg"]:
            continue
        url_fin = l["src"]
        for _ in range(3):  # at most 3 attempts per image on network/resource errors
            try:
                request_fin = ur.Request(url_fin, headers=HEADERS)  # stay anonymous
                fin_img = ur.urlopen(request_fin, timeout=timeout).read()
                break
            except Exception:
                pass
        else:
            continue  # every attempt failed; move on to the next image
        name = "%s_%d.gif" % (str1, count)
        with open(os.path.join(where, name), 'wb') as file:
            file.write(fin_img)
        print("Downloaded:\n" + name)
        count += 1
    return count


def tryopen(req):  # retry up to 10 times on network errors
    errorTimes = 0
    while errorTimes != 10:
        try:
            errorTimes += 1
            return ur.urlopen(req, timeout=10).read().decode("utf-8")
        except Exception:
            pass
    return None


def main():
    if e.buttonbox("Are you ready?", "黃蟲",
                   choices=("of course!", "i'm Gay.")) == "of course!":
        while 1:
            have = e.multenterbox("Enter the pages you want; for a single page, "
                                  "enter the same number twice:", "黃蟲",
                                  fields=("start page", "end page"))
            if have and have[0] != '' and have[1] != '':
                nice = int(have[0])
                day = int(have[1])
                if nice > 1000 or day > 1000:
                    e.msgbox("Gentlemen, mind your health!")
                    continue
                break
            e.msgbox("serious?")
        where = e.diropenbox("Where do you want to save the files?")
        i = nice
        while 1:  # the site is three levels deep
            url1 = "http://www.gifjia.com/neihan/page/%d/" % i
            request1 = ur.Request(url1, headers=HEADERS)  # stay anonymous
            html1 = tryopen(request1)
            if html1 is None:  # index page unreachable even after retries
                break
            h1_soup = BeautifulSoup(html1, "html.parser")  # the index page
            text = '&&!@#$#@'  # dummy value so the first comparison can run
            word = 0  # number of entries handled
            for j in h1_soup.find_all("a", href=re.compile("[0-9]+/$")):
                if text in j["href"]:
                    # Avoid crawling the same page twice: some of the site's
                    # URLs only have extra bits appended but still mean the
                    # current page.
                    continue
                word += 1
                if word > 11:
                    # Unknown links follow the entries, so stop before drifting
                    # off. 11 because one junk link precedes the 10 real
                    # entries. Not very smart, admittedly.
                    break
                url2 = j["href"]
                text = url2
                count = 0  # numbers the images inside each index entry
                request2 = ur.Request(url2, headers=HEADERS)  # stay anonymous
                html2 = tryopen(request2)
                try:
                    h2_soup = BeautifulSoup(html2, "html.parser")  # first-level entry page
                    count = find_download(h2_soup, where, count)
                except Exception:
                    continue
                for k in h2_soup.find_all("a", href=re.compile(j["href"] + "[0-9]+/")):
                    url3 = k["href"]
                    if j["href"] + "1/" == k["href"]:  # page 1 was already crawled above
                        continue
                    request3 = ur.Request(url3, headers=HEADERS)  # stay anonymous
                    html3 = tryopen(request3)
                    try:
                        h3_soup = BeautifulSoup(html3, "html.parser")  # entries span several sub-pages
                        count = find_download(h3_soup, where, count)
                    except Exception:
                        pass
            if i >= day:
                break
            i += 1
    else:
        e.msgbox("╭∩╮(︶︿︶)╭∩╮")


if __name__ == '__main__':
    main()
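The same request boilerplate, a spoofed User-Agent plus a bounded timeout plus a silent retry loop, is repeated for every URL above. A minimal standalone sketch of just that pattern, assuming nothing from the script itself (the fetch name and the example.com URL are mine, for illustration only):

import urllib.request as ur

UA = {'User-Agent': 'Mozilla/5.0'}  # any common browser string will do

def fetch(url, retries=3, timeout=10):
    # Return the page body as text, or None if every attempt fails.
    for _ in range(retries):
        try:
            req = ur.Request(url, headers=UA)  # spoof the browser
            return ur.urlopen(req, timeout=timeout).read().decode("utf-8")
        except Exception:
            pass  # transient network error: just try again
    return None

print(fetch("http://example.com") is not None)  # True when the network is up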
As long as you work out the site's structure, find what the resources you want to scrape have in common, and handle anonymity and exceptions sensibly, you are basically set.
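In practice, "finding what the resources have in common" usually boils down to one find_all call with regex filters on the tag attributes, as in the crawler above. A self-contained illustration, assuming an invented HTML snippet:

import re
from bs4 import BeautifulSoup

html = '''<img class="alignnone"   src="http://example.com/a.gif">
<img class="banner"      src="http://example.com/ad.png">
<img class="aligncenter" src="http://example.com/b.jpg">'''

soup = BeautifulSoup(html, "html.parser")
# Keep only images whose class starts with "align" and whose src ends in a
# short image-style extension -- the same filter the crawler uses.
for img in soup.find_all("img", class_=re.compile("align.+"),
                         src=re.compile(r"\.[a-zA-Z]{3,4}$")):
    print(img["src"])  # prints a.gif and b.jpg; the banner ad is skipped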