When you train a machine learning model, what you are really doing is tuning its parameters so that it can map a particular input (say, an image) to some output (a label). Our optimization goal is to chase that sweet spot where the model's loss is low, which happens when the parameters are tuned the right way.
Today's neural networks typically have millions of parameters, so you need to show your machine learning model a proportionate number of examples to get good performance. Moreover, the number of parameters you need is proportional to the complexity of the task your model has to perform.
Let's explore several of the most commonly used image augmentation techniques, with code examples and visualizations of the augmented images. From here onwards, the data will be referred to as images. We will use TensorFlow or OpenCV, written in Python, in all of our examples. Here is an index of the techniques we will cover in this article: scaling, translation, rotation (at 90 degrees), rotation (at finer angles), flipping, adding salt-and-pepper noise, adding Gaussian noise, and perspective transform.
Images collected from the Internet will be of varying sizes. Since most neural networks contain fully connected layers, the images fed to the network must be of a fixed size (unless you use spatial pyramid pooling before passing them to the dense layers). So, before any image augmentation happens, let's preprocess the images to the size our network needs. With fixed-size images, we also get the benefit of processing them in batches.
import tensorflow as tf
import matplotlib.image as mpimg
import numpy as np

IMAGE_SIZE = 224

def tf_resize_images(X_img_file_paths):
    X_data = []
    tf.reset_default_graph()
    X = tf.placeholder(tf.float32, (None, None, 3))
    tf_img = tf.image.resize_images(X, (IMAGE_SIZE, IMAGE_SIZE),
                                    tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Each image is resized individually, as different images may be of different sizes.
        for index, file_path in enumerate(X_img_file_paths):
            img = mpimg.imread(file_path)[:, :, :3]  # Do not read alpha channel.
            resized_img = sess.run(tf_img, feed_dict={X: img})
            X_data.append(resized_img)

    X_data = np.array(X_data, dtype=np.float32)  # Convert to numpy
    return X_data
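To get a concrete batch to work with, the helper above can be called on a list of file paths; the paths below are placeholders for your own dataset, and PNG inputs are assumed so that mpimg.imread yields pixel values in [0, 1] (which the white background fill of 1.0 used later relies on). All later snippets assume this X_imgs batch.

# Hypothetical file paths -- replace with your own dataset.
X_imgs = tf_resize_images(['images/example_1.png', 'images/example_2.png'])
print(X_imgs.shape)  # (2, 224, 224, 3)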
Having the object of interest at different scales is one of the most important aspects of image diversity. When your network is in the hands of real users, the object in the image may be small or large. Also, sometimes an object can cover most of the image and yet not be fully present in it (i.e., be cropped at its edges).
def central_scale_images(X_imgs, scales):
    # Various settings needed for Tensorflow operation
    boxes = np.zeros((len(scales), 4), dtype=np.float32)
    for index, scale in enumerate(scales):
        x1 = y1 = 0.5 - 0.5 * scale  # To scale centrally
        x2 = y2 = 0.5 + 0.5 * scale
        boxes[index] = np.array([y1, x1, y2, x2], dtype=np.float32)
    box_ind = np.zeros((len(scales)), dtype=np.int32)
    crop_size = np.array([IMAGE_SIZE, IMAGE_SIZE], dtype=np.int32)

    X_scale_data = []
    tf.reset_default_graph()
    X = tf.placeholder(tf.float32, shape=(1, IMAGE_SIZE, IMAGE_SIZE, 3))
    # Define Tensorflow operation for all scales but only one base image at a time
    tf_img = tf.image.crop_and_resize(X, boxes, box_ind, crop_size)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for img_data in X_imgs:
            batch_img = np.expand_dims(img_data, axis=0)
            scaled_imgs = sess.run(tf_img, feed_dict={X: batch_img})
            X_scale_data.extend(scaled_imgs)

    X_scale_data = np.array(X_scale_data, dtype=np.float32)
    return X_scale_data

# Produce each image at scaling of 90%, 75% and 60% of original image.
scaled_imgs = central_scale_images(X_imgs, [0.90, 0.75, 0.60])
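As for visualization, here is a minimal matplotlib sketch (the gallery layout is my own choice, not part of the original code) for eyeballing the result; central_scale_images emits all scales of one image before moving to the next, so the first three entries are the 90%, 75% and 60% variants of the first base image:

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, scaled, scale in zip(axes, scaled_imgs[:3], [0.90, 0.75, 0.60]):
    ax.imshow(scaled)  # float pixel values in [0, 1]
    ax.set_title('scale = %.2f' % scale)
    ax.axis('off')
plt.show()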
We would also like our network to recognize the object in any part of the image, so let's translate each image by 20 percent in each of the four directions, filling the vacated region with a white background:

from math import ceil, floor

def get_translate_parameters(index):
    if index == 0:  # Translate left 20 percent
        offset = np.array([0.0, 0.2], dtype=np.float32)
        size = np.array([IMAGE_SIZE, ceil(0.8 * IMAGE_SIZE)], dtype=np.int32)
        w_start = 0
        w_end = int(ceil(0.8 * IMAGE_SIZE))
        h_start = 0
        h_end = IMAGE_SIZE
    elif index == 1:  # Translate right 20 percent
        offset = np.array([0.0, -0.2], dtype=np.float32)
        size = np.array([IMAGE_SIZE, ceil(0.8 * IMAGE_SIZE)], dtype=np.int32)
        w_start = int(floor((1 - 0.8) * IMAGE_SIZE))
        w_end = IMAGE_SIZE
        h_start = 0
        h_end = IMAGE_SIZE
    elif index == 2:  # Translate top 20 percent
        offset = np.array([0.2, 0.0], dtype=np.float32)
        size = np.array([ceil(0.8 * IMAGE_SIZE), IMAGE_SIZE], dtype=np.int32)
        w_start = 0
        w_end = IMAGE_SIZE
        h_start = 0
        h_end = int(ceil(0.8 * IMAGE_SIZE))
    else:  # Translate bottom 20 percent
        offset = np.array([-0.2, 0.0], dtype=np.float32)
        size = np.array([ceil(0.8 * IMAGE_SIZE), IMAGE_SIZE], dtype=np.int32)
        w_start = 0
        w_end = IMAGE_SIZE
        h_start = int(floor((1 - 0.8) * IMAGE_SIZE))
        h_end = IMAGE_SIZE

    return offset, size, w_start, w_end, h_start, h_end

def translate_images(X_imgs):
    offsets = np.zeros((len(X_imgs), 2), dtype=np.float32)
    n_translations = 4
    X_translated_arr = []

    tf.reset_default_graph()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(n_translations):
            X_translated = np.zeros((len(X_imgs), IMAGE_SIZE, IMAGE_SIZE, 3),
                                    dtype=np.float32)
            X_translated.fill(1.0)  # Filling background color
            base_offset, size, w_start, w_end, h_start, h_end = get_translate_parameters(i)
            offsets[:, :] = base_offset
            glimpses = tf.image.extract_glimpse(X_imgs, size, offsets)

            glimpses = sess.run(glimpses)
            X_translated[:, h_start: h_start + size[0],
                         w_start: w_start + size[1], :] = glimpses
            X_translated_arr.extend(X_translated)
    X_translated_arr = np.array(X_translated_arr, dtype=np.float32)
    return X_translated_arr

translated_imgs = translate_images(X_imgs)
Rotation is another easy win: turning each image by 90, 180 and 270 degrees adds orientation diversity without introducing any new background pixels:

def rotate_images(X_imgs):
    X_rotate = []
    tf.reset_default_graph()
    X = tf.placeholder(tf.float32, shape=(IMAGE_SIZE, IMAGE_SIZE, 3))
    k = tf.placeholder(tf.int32)
    tf_img = tf.image.rot90(X, k=k)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for img in X_imgs:
            for i in range(3):  # Rotation at 90, 180 and 270 degrees
                rotated_img = sess.run(tf_img, feed_dict={X: img, k: i + 1})
                X_rotate.append(rotated_img)
    X_rotate = np.array(X_rotate, dtype=np.float32)
    return X_rotate

rotated_imgs = rotate_images(X_imgs)
Depending on the requirement, it may also be necessary to rotate the images at finer angles. If the background of the image is a fixed color, the newly revealed regions need to blend with that background; otherwise, the network may pick them up as a feature, and it is a feature it should not learn.
from math import pi

def rotate_images(X_imgs, start_angle, end_angle, n_images):
    X_rotate = []
    iterate_at = (end_angle - start_angle) / (n_images - 1)

    tf.reset_default_graph()
    X = tf.placeholder(tf.float32, shape=(None, IMAGE_SIZE, IMAGE_SIZE, 3))
    radian = tf.placeholder(tf.float32, shape=(len(X_imgs)))
    tf_img = tf.contrib.image.rotate(X, radian)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for index in range(n_images):
            degrees_angle = start_angle + index * iterate_at
            radian_value = degrees_angle * pi / 180  # Convert to radian
            radian_arr = [radian_value] * len(X_imgs)
            rotated_imgs = sess.run(tf_img, feed_dict={X: X_imgs, radian: radian_arr})
            X_rotate.extend(rotated_imgs)

    X_rotate = np.array(X_rotate, dtype=np.float32)
    return X_rotate

# Start rotation at -90 degrees, end at 90 degrees and produce totally 14 images
rotated_imgs = rotate_images(X_imgs, -90, 90, 14)
What matters even more here is that flipping removes the network's bias of assuming that certain features of the object are available only on a particular side. Consider the case shown in the image example: you don't want the network to learn that the tilt of the banana happens only to the right, as observed in the base image.
def flip_images(X_imgs):
    X_flip = []
    tf.reset_default_graph()
    X = tf.placeholder(tf.float32, shape=(IMAGE_SIZE, IMAGE_SIZE, 3))
    tf_img1 = tf.image.flip_left_right(X)
    tf_img2 = tf.image.flip_up_down(X)
    tf_img3 = tf.image.transpose_image(X)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for img in X_imgs:
            flipped_imgs = sess.run([tf_img1, tf_img2, tf_img3], feed_dict={X: img})
            X_flip.extend(flipped_imgs)
    X_flip = np.array(X_flip, dtype=np.float32)
    return X_flip

flipped_images = flip_images(X_imgs)
Salt-and-pepper noise adds randomly scattered white (salt) and black (pepper) pixels, mimicking the kind of speckle a real camera sensor can produce:

def add_salt_pepper_noise(X_imgs):
    # Need to produce a copy as to not modify the original image
    X_imgs_copy = X_imgs.copy()
    row, col, _ = X_imgs_copy[0].shape
    salt_vs_pepper = 0.2
    amount = 0.004
    num_salt = np.ceil(amount * X_imgs_copy[0].size * salt_vs_pepper)
    num_pepper = np.ceil(amount * X_imgs_copy[0].size * (1.0 - salt_vs_pepper))

    for X_img in X_imgs_copy:
        # Add Salt noise: set randomly chosen pixels to white
        coords = [np.random.randint(0, i - 1, int(num_salt)) for i in X_img.shape]
        X_img[coords[0], coords[1], :] = 1

        # Add Pepper noise: set randomly chosen pixels to black
        coords = [np.random.randint(0, i - 1, int(num_pepper)) for i in X_img.shape]
        X_img[coords[0], coords[1], :] = 0
    return X_imgs_copy

salt_pepper_noise_imgs = add_salt_pepper_noise(X_imgs)
Gaussian noise is a gentler alternative that perturbs every pixel slightly instead of flipping a few of them to the extremes:

import cv2

def add_gaussian_noise(X_imgs):
    gaussian_noise_imgs = []
    row, col, _ = X_imgs[0].shape
    # Gaussian distribution parameters
    mean = 0
    var = 0.1
    sigma = var ** 0.5

    for X_img in X_imgs:
        # Sample single-channel Gaussian noise and replicate it across RGB.
        gaussian = np.random.normal(mean, sigma, (row, col, 1)).astype(np.float32)
        gaussian = np.concatenate((gaussian, gaussian, gaussian), axis=2)
        # Blend the image with the scaled noise layer.
        gaussian_img = cv2.addWeighted(X_img, 0.75, 0.25 * gaussian, 0.25, 0)
        gaussian_img = np.clip(gaussian_img, 0.0, 1.0)  # Keep pixels in the valid [0, 1] range
        gaussian_noise_imgs.append(gaussian_img)
    gaussian_noise_imgs = np.array(gaussian_noise_imgs, dtype=np.float32)
    return gaussian_noise_imgs

gaussian_noise_imgs = add_gaussian_noise(X_imgs)
Finally, a perspective transform simulates viewing the object from a different camera angle by mapping an estimated quadrilateral around the object onto the full image:

def get_mask_coord(imshape):
    vertices = np.array([[(0.09 * imshape[1], 0.99 * imshape[0]),
                          (0.43 * imshape[1], 0.32 * imshape[0]),
                          (0.56 * imshape[1], 0.32 * imshape[0]),
                          (0.85 * imshape[1], 0.99 * imshape[0])]], dtype=np.int32)
    return vertices

def get_perspective_matrices(X_img):
    offset = 15
    img_size = (X_img.shape[1], X_img.shape[0])

    # Estimate the coordinates of object of interest inside the image.
    src = np.float32(get_mask_coord(X_img.shape))
    dst = np.float32([[offset, img_size[1]],
                      [offset, 0],
                      [img_size[0] - offset, 0],
                      [img_size[0] - offset, img_size[1]]])

    perspective_matrix = cv2.getPerspectiveTransform(src, dst)
    return perspective_matrix

def perspective_transform(X_img):
    # Doing only for one type of example
    perspective_matrix = get_perspective_matrices(X_img)
    warped_img = cv2.warpPerspective(X_img, perspective_matrix,
                                     (X_img.shape[1], X_img.shape[0]),
                                     flags=cv2.INTER_LINEAR)
    return warped_img

perspective_img = perspective_transform(X_imgs[0])  # Warp a single image from the batch
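perspective_transform operates on a single image; a simple list comprehension (my own addition, not from the original code) extends it to the whole batch:

# Warp every image in the batch with the same estimated perspective.
perspective_imgs = np.array([perspective_transform(img) for img in X_imgs],
                            dtype=np.float32)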
Although the list of image augmentation methods above is by no means exhaustive, it includes many widely used techniques, and you can combine these augmentations to generate even more images (see the sketch below). You can view the code used in this article on GitHub.
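To make the combination point concrete, here is a minimal sketch, assuming the X_imgs batch and the functions defined above, that stacks several augmented batches into one enlarged training set:

# Stack the original batch with several augmented batches along axis 0.
augmented = np.concatenate([
    X_imgs,                                             # N originals
    central_scale_images(X_imgs, [0.90, 0.75, 0.60]),   # 3N scaled
    translate_images(X_imgs),                           # 4N translated
    flip_images(X_imgs),                                # 3N flipped
    add_salt_pepper_noise(X_imgs),                      # N noisy
], axis=0)
print(augmented.shape)  # (12 * N, 224, 224, 3)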