喜歡鬥圖, 圖不夠? 爬了一下鬥圖網, 自動下載圖片. 鬥圖, 從來沒怕過誰.
環境: Ubuntu 16.04, Python 3.5
直接上代碼:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download meme images from the doutula.com article list pages.

For each list page the script fetches the HTML, extracts the image URLs
found in ``data-original`` attributes, and saves every distinct image into
a local directory using sequentially numbered ``.jpg`` file names.
"""
import urllib
import urllib.request
from urllib.request import urlopen
import random
import os
import re

# Pool of User-Agent strings; one is chosen at random for each page request.
# Every entry must be followed by a comma, otherwise Python silently
# concatenates adjacent string literals into a single bogus User-Agent.
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
]

# The page number is appended to these prefixes to build the request URL
# and the Referer header respectively.
url_prefix = "http://www.doutula.com/article/list/?page="
referer_prefix = "http://www.doutula.com/article/list/?page="

# Matches the image URL up to (not including) the "jpg!dta" suffix that the
# list page appends; the "jpg" extension is re-added after matching.
_IMG_PATTERN = re.compile(r'data-original="(\S*?)jpg!dta', re.S)


def getFile(path):
    """Make sure the save directory *path* exists.

    Returns True when the directory exists (it is created if missing) and
    False when it cannot be created, e.g. because of a permission error or
    because a regular file already occupies that path.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        print('建立目錄 %s 失敗' % path)
        return False
    print('成功建立目錄 %s' % path)
    return True


def getPageHTML(url, referer, my_headers):
    """Fetch *url* and return the raw response body as bytes.

    A random entry of *my_headers* is sent as the User-Agent and *referer*
    as the Referer header. Errors raised by ``urllib`` propagate to the
    caller.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random.choice(my_headers))
    req.add_header("Host", "www.doutula.com")
    req.add_header("Referer", referer)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()


def getHTMLElements(pageHTML):
    """Extract image URLs from a list page.

    *pageHTML* is the UTF-8 encoded page body (bytes). Returns the list of
    image URLs in document order (duplicates are kept), or an empty list
    when nothing matches.
    """
    print('正在獲取圖片列表')
    result = [prefix + 'jpg'
              for prefix in _IMG_PATTERN.findall(pageHTML.decode('utf-8'))]
    if result:
        print('成功獲取圖片列表')
    else:
        print('獲取圖片列表失敗')
    return result


def getImg(path, imgUrl, filename):
    """Download *imgUrl* and store it as *filename* inside directory *path*.

    The target is addressed with an explicit joined path instead of
    changing the process working directory, so nested or absolute *path*
    values work and a failed download leaves the working directory alone.
    """
    print('正在寫入圖片%s' % imgUrl)
    with urllib.request.urlopen(imgUrl) as u:
        data = u.read()
    with open(os.path.join(path, filename), 'wb') as out:
        out.write(data)


def log(path, url, referer, my_headers, start_id):
    """Crawl one list page and save every distinct image it references.

    Images are saved into *path* as ``<n>.jpg`` with ``n`` counting up from
    *start_id*. Returns the next unused id so consecutive pages continue
    the numbering.
    """
    print("開始爬取%s頁面" % url)
    pageHTML = getPageHTML(url, referer, my_headers)
    imageUrl = getHTMLElements(pageHTML)

    # Drop duplicates while keeping document order, so the numbering of the
    # saved files is the same on every run.
    seen = set()
    imgs = []
    for candidate in imageUrl:
        if candidate not in seen:
            seen.add(candidate)
            imgs.append(candidate)

    for img in imgs:
        getImg(path, img, filename=str(start_id) + '.jpg')
        start_id += 1
    print('成功爬取%s頁面, 獲取%d張圖片' % (url, len(imgs)))
    return start_id


# Directory the images are saved into, and the number of list pages to crawl.
path = "鬥圖"
page_number = 2


def main():
    """Create the save directory and crawl pages 1..page_number."""
    if not getFile(path):
        print('建立目錄%s失敗, 中止爬蟲' % path)
        return
    next_start_id = 1
    for i in range(1, page_number + 1):
        url = url_prefix + str(i)
        # The previous page is sent as the Referer (page 0 for the first).
        referer = referer_prefix + str(i - 1)
        next_start_id = log(path, url, referer, my_headers, next_start_id)
    print('爬蟲完成, 共獲取%d張圖片' % (next_start_id - 1))


# Guarded so that importing this module does not start network traffic.
if __name__ == "__main__":
    main()
結果:
/opt/wwwroot/python/bin/python /opt/wwwroot/doutu.py 成功建立目錄 鬥圖 開始爬取http://www.doutula.com/article/list/?page=1頁面 正在獲取圖片列表 成功獲取圖片列表 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893055_FDICGk.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677576_mgtABy.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677574_yHlqZN.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460735_ZetOWV.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893056_ZNTWxH.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677576_fnZCtR.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893057_tLrIZN.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372281_EVKcFZ.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/18/20190418592845_RyTfwB.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372284_JUoZWm.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/22/20190422893054_gySZih.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372283_MhQrcu.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460736_mkvpPA.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460734_JXsBdD.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372851_XwcOSk.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372852_HQXMkq.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/19/20190419677575_ZWnGrU.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/17/20190417460735_cfWPas.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/18/20190418592850_OcIBWP.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/21/20190421861952_gENdjV.jpg 
正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372850_hySiUj.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372852_UbWHos.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/21/20190421861953_vOqjSn.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372282_DGUWTr.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/16/20190416372805_iDyHdU.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/18/20190418592853_btACyW.jpg 成功爬取http://www.doutula.com/article/list/?page=1頁面, 獲取26張圖片 開始爬取http://www.doutula.com/article/list/?page=2頁面 正在獲取圖片列表 成功獲取圖片列表 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206133_uQsHqG.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943483_sDBjCg.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770790_dNScVk.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681856_hetJiL.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681858_DJmujK.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943482_XLdQza.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681908_iOuyXg.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770790_sLTbja.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770792_knUdlr.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030067_tWBAzg.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303523_PbsrpF.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855261_FjYcIW.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681901_loACKy.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681857_TXbrzq.jpg 
正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303524_moMfct.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943484_FucDtC.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206134_WyVdgc.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681857_FMapOI.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855259_kKgBZS.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030066_OtbdXm.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/09/20190409770792_TPhezJ.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855260_UKQzfX.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/10/20190410855258_CTXjym.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030066_SxMcbA.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681902_cEAQju.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206132_pOBhSb.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303525_SzCjYs.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/14/20190414206133_VWOxBf.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/11/20190411943482_ZuwNnQ.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/08/20190408681900_eGkLNj.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/15/20190415303525_OPsDGl.jpg 正在寫入圖片http://img.doutula.com/production/uploads/image//2019/04/12/20190412030067_JYaPnj.jpg 成功爬取http://www.doutula.com/article/list/?page=2頁面, 獲取32張圖片 爬蟲完成, 共獲取58張圖片 Process finished with exit code 0