在進行老司機開車之一時(即微信開車公衆號)無心中發現這樣一個福利妹紙圖地址,妹紙圖都還挺不錯的,可是每次翻頁的時候各類彈出式廣告,很是煩,因而想着把圖片都down下來,而後放到路由器的samb服務上,這樣能夠用手機輕鬆訪問了!!
效果圖:
先上源碼:注意修改下載路徑
# -*- coding: utf-8 -*-
"""Crawler that downloads gallery images from zhaofuli.xyz.

Walks the listing pages, follows every gallery link, pages through each
gallery, and saves the images into one local directory per gallery title.

Ported from Python 2: the original used ``print`` statements, ``urllib2``
and the ``reload(sys); sys.setdefaultencoding('utf-8')`` hack, none of
which exist on Python 3.
"""
import os
import re

import requests
from bs4 import BeautifulSoup


class BeatifulGirl:
    # (sic) The misspelled class name is kept for backward compatibility.

    def __init__(self):
        # Base listing address and the local download root.
        # NOTE(review): change basepath to your own download directory.
        self.baseurl = 'http://zhaofuli.xyz/luyilu/'
        self.basepath = '/Volumes/Python/'

    def get_content(self, url):
        """Return the href of every gallery linked from one listing page."""
        res = requests.get(url)
        html = BeautifulSoup(res.text, 'lxml')
        # Each gallery entry on the listing page is an <a class="thumbnail">.
        return [a.get('href') for a in html.find_all("a", class_="thumbnail")]

    def get_img(self, url):
        """Download every image found on one gallery page.

        Creates (if needed) a directory named after the article title
        under self.basepath and writes each image file into it.
        """
        # Browser-like headers: the image host redirects bare clients,
        # so we mimic a real browser request when fetching the images.
        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'DNT': '1',
            'Referer': 'http://zhaofuli.xyz/luyilu/2016/1031/2571.html',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        # The site serves its pages encoded as GBK.
        html = requests.get(url).content.decode('gbk')
        # Image URLs live in <p> ... <img src="..."> fragments.
        imglist = re.findall(r'<p>.*?<img src="(.*?)"', html)
        # The article title becomes the folder name.
        path = re.findall(r'<h1 class="article-title">(.*?)</h1>', html)[0]
        # Multi-part galleries carry a "(n)" suffix in the title; strip it
        # so every part lands in the same directory.  (The original sliced
        # path[:-3], which broke for suffixes longer than three chars.)
        paren = path.find('(')
        pathname = self.basepath + (path[:paren] if paren != -1 else path)
        print(pathname)
        self.mkdir(pathname)
        for x, imgurl in enumerate(imglist, 1):
            print(imgurl)
            # File name = the part after the last '-' in the image URL.
            jpg_name = imgurl[imgurl.rfind('-') + 1:]
            # Send the browser headers to avoid the host's redirect.
            rt = requests.get(imgurl, headers=headers)
            with open(pathname + '/%s' % jpg_name, 'wb') as fw:
                fw.write(rt.content)
            print("正在下載第%s張圖片" % x)

    def get_page(self, url):
        """Walk the "next page" chain of one gallery, downloading the
        images of every page via get_img.

        The page itself does not expose a total page count, so the walk
        is capped at a fixed number of iterations (galleries rarely
        exceed 20 pages).
        """
        for num in range(1, 20):
            print("正在讀取頁面:" + url)
            self.get_img(url)
            print("第%s頁讀取完成" % num)
            # Look for the "next page" link on the current page.
            res = requests.get(url)
            html = BeautifulSoup(res.text, 'lxml')
            next_page = html.find("li", class_="next-page")
            if not next_page:
                break
            # The original iterated the <li>'s children, which raises
            # AttributeError on bare text nodes; fetch the nested <a>
            # element directly instead.
            link = next_page.find('a')
            if link is None or link.get('href') is None:
                break
            # The next-page href is relative: replace the last path
            # component of the current URL with it.
            url = url[:url.rfind('/') + 1] + link.get('href')

    def mkdir(self, path):
        """Create directory `path` if missing; return True when created."""
        path = path.strip()
        print(path)
        if os.path.exists(path):
            print(u'目錄已存在')
            return False
        # makedirs also creates missing parents (os.mkdir would fail
        # with FileNotFoundError when the download root is absent).
        os.makedirs(path)
        return True

    def main(self):
        """Entry point: walk the listing pages and crawl every gallery."""
        # Only the first listing page is crawled (range(1, 2) is a single
        # iteration, despite the original comment claiming two pages);
        # widen the range to fetch more, but each page holds many galleries.
        for num in range(1, 2):
            url_list = 'http://zhaofuli.xyz/luyilu/list_5_%s.html' % num
            content_list = self.get_content(url_list)
            print("加載目錄第%s頁。。。" % num)
            for url in content_list:
                print(url)
                self.get_page('http://zhaofuli.xyz' + url)


if __name__ == '__main__':
    # Guarded so importing this module no longer starts a crawl
    # (the original ran test.main() at module level).
    BeatifulGirl().main()