===============爬蟲原理==================
通過Python訪問網站,獲取網站的HTML代碼,通過正則表達式獲取特定的img標籤中src的圖片地址。
之後再訪問圖片地址,並通過IO操作將圖片保存到本地。
===============腳本代碼==================
import urllib.request  # network access
import random  # random number generation
import re  # regular expressions
import os  # directory handling

# Configuration
number = 10  # how many consecutive page ids to visit
path = 'img/'  # directory prefix the images are written under
http = 'http://zerospace.asika.tw/photo/'  # base URL of the target site

# Captures the .jpg address from the src attribute that follows a
# class="content-img" marker on the next line of the page source.
# Raw string: "\s" and "\." are regex escapes, not string escapes.
IMG_PATTERN = re.compile(r'class="content-img".+\s+.+src="(.+\.jpg)"')


def save_img(url, path):
    """Download *url* and store it under *path* using the URL's base name.

    The image is downloaded completely before the output file is created,
    so a failed download does not leave an empty file behind.

    Args:
        url: Address of the image to download.
        path: Directory prefix (including the trailing separator) that the
            file name is appended to.

    Returns:
        The saved file's base name on success, otherwise the text of the
        exception that prevented the download or the write.
    """
    name = os.path.basename(url)
    try:
        # Context managers close the response and the file on every path;
        # the original referenced an unbound ``file`` when open() failed.
        with urllib.request.urlopen(url) as response:
            data = response.read()
        with open(path + name, 'wb') as file:
            file.write(data)
    except Exception as e:
        # Best-effort crawler: report the failure instead of raising.
        return str(e)
    return name


def main():
    """Visit `number` consecutive photo pages and save the first image of each."""
    os.makedirs(path, exist_ok=True)

    # Random starting page id in [290, 290 + 1000 - number).
    position = 290 + int((1000 - number) * random.random())
    for page_id in range(position, position + number):
        url = "%s%d.html" % (http, page_id)  # page address for this id
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf8')
        except Exception as e:
            # Unreachable or undecodable page: report it and move on.
            print(e)
            continue

        found = IMG_PATTERN.findall(html)
        if found:
            print(save_img(found[0], path))


if __name__ == '__main__':
    main()
===============運行結果==================