YOLOV5訓練與測試時數據加載dataset.py代碼註釋與解析

YOLOV5訓練與測試時數據加載模塊代碼註釋與解析

mosaic加強
矩形訓練
本文主要對ultralytics/yolov5在訓練時的數據加載模塊dataset.py的代碼進行註釋和解析。當然，dataset.py中還有其他場合(例如detect時)所用到的加載方法(例如LoadImages、LoadWebcam等)，本文主要針對訓練時用到的LoadImagesAndLabels類進行註釋。
mosaic增強

在這裏要說一下，mosaic數據增強就是將四張圖片拼接在一起傳入網絡訓練，具體可以查看YOLOV4-mosaic數據增強詳解。（該文章是基於pytorch YOLOV4代碼做的解析）
矩形訓練

正方形填充
可以看到yolov5會對圖片進行填充，填充爲正方形後傳入網絡進行訓練。這裏面有很多冗餘的信息，會讓網絡產生很多無意義的候選框。矩形訓練就是減少這些冗餘信息，減少網絡產生的無意義的框的數量，加快網絡訓練速度。yolov5網絡的總步長爲32，所以其實只要圖片邊長能夠被32整除就可以了，不一定完全需要正方形圖片傳入網絡。矩形訓練就是將圖片填充到邊長爲最小的32的倍數，從而減少冗餘信息。

矩形填充
值得一提的是，除了矩形訓練，還有矩形推理，也就是在做檢測的時候也這樣填充，從而加快推理速度，減少推理時間。
import glob
import math
import os
import random
import shutil
import time
from pathlib import Path
from threading import Thread

import cv2
import numpy as np
import torch
from PIL import Image, ExifTags
from torch.utils.data import Dataset
from tqdm import tqdm

from utils.utils import xyxy2xywh, xywh2xyxy

# URL appended to the data-loading error messages raised in this module.
help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data'
# Lower-case file extensions accepted as images when collecting dataset files.
img_formats = ['.bmp', '.jpg', '.jpeg', '.png', '.tif', '.dng']
# Video extensions; presumably used by the inference-time loaders (not visible here) - TODO confirm.
vid_formats = ['.mov', '.avi', '.mp4', '.mpg', '.mpeg', '.m4v', '.wmv', '.mkv']

# Get orientation exif tag
# After this loop `orientation` holds the numeric EXIF tag id whose name is
# 'Orientation'; exif_size() reads it as a module-level global.
# NOTE: if no tag matched, `orientation` would be left at the last key iterated.
for orientation in ExifTags.TAGS.keys():
    if ExifTags.TAGS[orientation] == 'Orientation':
        break

def exif_size(img):
    """Return the EXIF-corrected (width, height) of a PIL image.

    Args:
        img: object exposing `.size` as (width, height) and a `_getexif()` method
            (a PIL image).

    Returns:
        `img.size`, with width and height swapped when the EXIF orientation tag
        is 6 or 8 (image stored rotated by 270 / 90 degrees). If the EXIF data
        is missing or unreadable the size is returned unchanged.
    """
    s = img.size  # (width, height)
    try:
        # `orientation` is the module-level EXIF tag id for 'Orientation'.
        rotation = dict(img._getexif().items())[orientation]
    except Exception:
        # Best effort: no EXIF block (None), no orientation entry, or an image
        # type without _getexif(). KeyboardInterrupt/SystemExit are NOT swallowed.
        return s

    if rotation in (6, 8):  # 6 = rotation 270, 8 = rotation 90
        s = (s[1], s[0])
    return s


def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False):
    """Build a LoadImagesAndLabels dataset and a torch DataLoader over it.

    Args:
        path: txt file listing image paths, or a directory containing images.
        imgsz: network input image size.
        batch_size: requested batch size (capped at the dataset length).
        stride: maximum total down-sampling stride of the network.
        opt: options object; only `opt.single_cls` is read here.
        hyp: hyper-parameter dict forwarded to the dataset (augmentation gains).
        augment: whether to apply data augmentation.
        cache: whether to cache images in memory up front.
        pad: padding term used when computing rectangular batch shapes.
        rect: whether to use rectangular training.

    Returns:
        (dataloader, dataset)
    """
    dataset = LoadImagesAndLabels(path, imgsz, batch_size,
                                  augment=augment,  # augment images
                                  hyp=hyp,  # augmentation hyperparameters
                                  rect=rect,  # rectangular training
                                  cache_images=cache,
                                  single_cls=opt.single_cls,
                                  stride=int(stride),
                                  pad=pad)

    batch_size = min(batch_size, len(dataset))
    # os.cpu_count() returns None when the count cannot be determined; min()
    # would then raise TypeError, so fall back to a single CPU.
    cpu_count = os.cpu_count() or 1
    nw = min([cpu_count, batch_size if batch_size > 1 else 0, 8])  # number of workers
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             pin_memory=True,
                                             collate_fn=LoadImagesAndLabels.collate_fn)
    return dataloader, dataset




class LoadImagesAndLabels(Dataset):  # for training/testing
    """Dataset of images plus YOLO-format labels, used for training and testing.

    Image paths come from `path` (a directory, a text file listing images, or a
    list of either). The label file of an image is found by replacing 'images'
    with 'labels' in its path and swapping the extension for '.txt'; each row is
    `class x_center y_center width height` with coordinates normalised to 0-1.

    Indexing returns `(img, labels_out, img_path, shapes)`; see `__getitem__`.
    """

    def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
                 cache_images=False, single_cls=False, stride=32, pad=0.0):
        """Collect image/label paths, read image shapes and cache labels.

        Args:
            path: directory of images, txt file of image paths, or a list of those.
            img_size: target size of the longer image side.
            batch_size: batch size, used to assign every image a batch index.
            augment: enable augmentation (mosaic is used when `rect` is off).
            hyp: hyper-parameter dict; augmentation reads 'degrees', 'translate',
                'scale', 'shear', 'hsv_h', 'hsv_s', 'hsv_v'.
            rect: rectangular training (forced off when `image_weights` is set).
            image_weights: sample through `self.indices` in `__getitem__`.
            cache_images: load every image into memory up front.
            single_cls: force every label class to 0.
            stride: total model stride; rectangular shapes are multiples of it.
            pad: padding (in stride units) added to rectangular batch shapes.

        Raises:
            Exception: if the image list cannot be built from `path`.
            AssertionError: if no images or no labels are found, or a label file
                is malformed.
        """
        try:
            files = []
            for p in path if isinstance(path, list) else [path]:
                # pathlib normalises the separators for the current OS
                p = str(Path(p))  # os-agnostic
                # parent directory of the dataset path, with trailing separator
                parent = str(Path(p).parent) + os.sep
                if os.path.isfile(p):  # file: text file listing image paths
                    with open(p, 'r') as t:
                        t = t.read().splitlines()
                        files += [x.replace('./', parent) if x.startswith('./') else x for x in t]  # local to global path
                elif os.path.isdir(p):  # folder: directory that contains the images
                    files += glob.iglob(p + os.sep + '*.*')
                else:
                    raise Exception('%s does not exist' % p)
            path = p  # *.npy dir
            # normalise separators and keep only files with a known image extension
            self.img_files = [x.replace('/', os.sep) for x in files if os.path.splitext(x)[-1].lower() in img_formats]
        except Exception as e:
            raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) from e

        n = len(self.img_files)  # number of images
        assert n > 0, 'No images found in %s. See %s' % (path, help_url)
        # np.int was removed in NumPy 1.24; the builtin int is the documented replacement
        bi = np.floor(np.arange(n) / batch_size).astype(int)  # batch index
        nb = bi[-1] + 1  # number of batches

        self.n = n  # number of images
        # Identity mapping until the training loop installs weighted indices, so
        # __getitem__ cannot hit a missing attribute when image_weights is set.
        self.indices = range(n)
        self.batch = bi  # batch index of image
        self.img_size = img_size  # network input resolution
        self.augment = augment  # data augmentation on/off
        self.hyp = hyp  # hyper-parameters
        self.image_weights = image_weights  # weighted image sampling
        self.rect = False if image_weights else rect  # rectangular training
        self.mosaic = self.augment and not self.rect  # mosaic augmentation
        self.mosaic_border = [-img_size // 2, -img_size // 2]  # border removed after mosaic
        self.stride = stride  # total down-sampling stride of the model

        # Label file paths derived from the image paths
        self.label_files = [x.replace('images', 'labels').replace(os.path.splitext(x)[-1], '.txt')
                            for x in self.img_files]

        # Image shapes are cached in a '.shapes' file next to the dataset path
        sp = path.replace('.txt', '') + '.shapes'  # shapefile path
        try:
            with open(sp, 'r') as fh:  # read existing shapefile
                s = [x.split() for x in fh.read().splitlines()]
                assert len(s) == n, 'Shapefile out of sync'
        except Exception:
            # missing, unreadable or stale shapefile: read every image and rewrite it
            s = [exif_size(Image.open(img_path)) for img_path in tqdm(self.img_files, desc='Reading image shapes')]
            np.savetxt(sp, s, fmt='%g')  # overwrites existing (if any)

        self.shapes = np.array(s, dtype=np.float64)

        # Rectangular Training https://github.com/ultralytics/yolov3/issues/232
        if self.rect:
            # Sort by aspect ratio
            s = self.shapes  # wh
            ar = s[:, 1] / s[:, 0]  # h/w
            irect = ar.argsort()  # indices that sort ar ascending
            self.img_files = [self.img_files[i] for i in irect]
            self.label_files = [self.label_files[i] for i in irect]
            self.shapes = s[irect]  # wh
            ar = ar[irect]

            # Set training image shapes
            self.batch_shapes = self._rect_batch_shapes(ar, bi, nb, img_size, stride, pad)

        # Cache labels
        self.imgs = [None] * n
        self.labels = [np.zeros((0, 5), dtype=np.float32)] * n
        # manual switches: build a data subset / crop boxes for a classifier; both off by default
        create_datasubset, extract_bounding_boxes, labels_loaded = False, False, False
        nm, nf, ne, ns, nd = 0, 0, 0, 0, 0  # number missing, found, empty, datasubset, duplicate
        np_labels_path = str(Path(self.label_files[0]).parent) + '.npy'  # saved labels in *.npy file
        if os.path.isfile(np_labels_path):
            s = np_labels_path  # print string
            # NOTE(review): allow_pickle=True will unpickle arbitrary objects; only
            # load label caches that this code wrote itself.
            x = np.load(np_labels_path, allow_pickle=True)
            if len(x) == n:
                self.labels = x
                labels_loaded = True
        else:
            s = path.replace('images', 'labels')

        pbar = tqdm(self.label_files)
        for i, file in enumerate(pbar):
            if labels_loaded:
                l = self.labels[i]
                # np.savetxt(file, l, '%g') # save *.txt from *.npy file
            else:
                try:
                    # label rows are: class x y w h
                    with open(file, 'r') as fh:
                        l = np.array([x.split() for x in fh.read().splitlines()], dtype=np.float32)
                except Exception:
                    nm += 1  # print('missing labels for image %s' % self.img_files[i]) # file missing
                    continue

            if l.shape[0]:
                assert l.shape[1] == 5, '> 5 label columns: %s' % file
                assert (l >= 0).all(), 'negative labels: %s' % file
                assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels: %s' % file
                if np.unique(l, axis=0).shape[0] < l.shape[0]:  # duplicate rows
                    nd += 1  # print('WARNING: duplicate rows in %s' % self.label_files[i]) # duplicate rows
                if single_cls:
                    l[:, 0] = 0  # force dataset into single-class mode
                self.labels[i] = l
                nf += 1  # file found

                # Create subdataset (a smaller dataset); disabled by default
                if create_datasubset and ns < 1E4:
                    if ns == 0:
                        create_folder(path='./datasubset')
                        os.makedirs('./datasubset/images')
                    exclude_classes = 43
                    if exclude_classes not in l[:, 0]:
                        ns += 1
                        # shutil.copy(src=self.img_files[i], dst='./datasubset/images/') # copy image
                        with open('./datasubset/images.txt', 'a') as fh:
                            fh.write(self.img_files[i] + '\n')

                # Extract object detection boxes for a second stage classifier; disabled by default
                if extract_bounding_boxes:
                    p = Path(self.img_files[i])
                    img = cv2.imread(str(p))
                    h, w = img.shape[:2]
                    for j, x in enumerate(l):
                        crop_path = '%s%sclassifier%s%g_%g_%s' % (p.parent.parent, os.sep, os.sep, x[0], j, p.name)
                        if not os.path.exists(Path(crop_path).parent):
                            os.makedirs(Path(crop_path).parent)  # make new output folder

                        b = x[1:] * [w, h, w, h]  # box: normalised xywh -> pixels
                        b[2:] = b[2:].max()  # rectangle to square
                        b[2:] = b[2:] * 1.3 + 30  # pad
                        b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(int)
                        b[[0, 2]] = np.clip(b[[0, 2]], 0, w)  # clip boxes outside of image
                        b[[1, 3]] = np.clip(b[[1, 3]], 0, h)
                        assert cv2.imwrite(crop_path, img[b[1]:b[3], b[0]:b[2]]), 'Failure extracting classifier boxes'
            else:
                ne += 1  # print('empty labels for image %s' % self.img_files[i]) # file empty
                # os.system("rm '%s' '%s'" % (self.img_files[i], self.label_files[i])) # remove

            pbar.desc = 'Caching labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
                s, nf, nm, ne, nd, n)
        assert nf > 0 or n == 20288, 'No labels found in %s. See %s' % (os.path.dirname(file) + os.sep, help_url)
        if not labels_loaded and n > 1000:
            print('Saving labels to %s for faster future loading' % np_labels_path)
            np.save(np_labels_path, self.labels)  # save for next time

        # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM)
        if cache_images:  # if training
            gb = 0  # bytes of cached images so far (reported in GB)
            pbar = tqdm(range(len(self.img_files)), desc='Caching images')
            self.img_hw0, self.img_hw = [None] * n, [None] * n
            for i in pbar:
                self.imgs[i], self.img_hw0[i], self.img_hw[i] = load_image(self, i)  # img, hw_original, hw_resized
                gb += self.imgs[i].nbytes
                pbar.desc = 'Caching images (%.1fGB)' % (gb / 1E9)

        # Detect corrupted images https://medium.com/joelthchao/programmatically-detect-corrupted-image-8c1b2006c3d3
        detect_corrupted_images = False
        if detect_corrupted_images:
            from skimage import io  # conda install -c conda-forge scikit-image
            for file in tqdm(self.img_files, desc='Detecting corrupted images'):
                try:
                    _ = io.imread(file)
                except Exception:
                    print('Corrupted image detected: %s' % file)

    @staticmethod
    def _rect_batch_shapes(ar, bi, nb, img_size, stride, pad):
        """Compute one stride-aligned (height, width) shape per batch for rectangular training.

        Args:
            ar: h/w aspect ratio per image, in dataset order.
            bi: batch index per image.
            nb: number of batches.
            img_size: size of the longer side before rounding.
            stride: result is rounded up to a multiple of this.
            pad: extra padding, in stride units, added before rounding up.

        Returns:
            Integer array of shape (nb, 2) holding [height, width] per batch.
        """
        shapes = [[1, 1]] * nb  # relative [h, w]; entries are replaced, never mutated
        for i in range(nb):
            ari = ar[bi == i]
            mini, maxi = ari.min(), ari.max()
            if maxi < 1:
                # every image in the batch is wider than tall: shrink the height
                shapes[i] = [maxi, 1]
            elif mini > 1:
                # every image in the batch is taller than wide: shrink the width
                shapes[i] = [1, 1 / mini]

        return np.ceil(np.array(shapes) * img_size / stride + pad).astype(int) * stride

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.img_files)

    # def __iter__(self):
    # self.count = -1
    # print('ran dataset iter')
    # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF)
    # return self

    def __getitem__(self, index):
        """Load, augment and format one sample.

        Returns:
            (img, labels_out, path, shapes) where `img` is a CHW RGB tensor,
            `labels_out` is an (nL, 6) tensor whose column 0 is reserved for the
            image index within the batch (filled by `collate_fn`) and columns 1-5
            are class + normalised xywh, `path` is the image path, and `shapes`
            is `((h0, w0), ((h / h0, w / w0), pad))` or None for mosaic samples.
        """
        if self.image_weights:
            # self.indices is replaced by the training loop with indices drawn
            # according to per-image sampling weights, e.g.:
            #   w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            #   image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
            #   dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n)
            index = self.indices[index]

        hyp = self.hyp

        if self.mosaic:
            # Load mosaic
            img, labels = load_mosaic(self, index)
            shapes = None

        else:
            # Load image (resized so the longer side is img_size, not yet padded)
            img, (h0, w0), (h, w) = load_image(self, index)

            # Letterbox: per-batch shape for rectangular training, else a square of img_size
            shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  # final letterboxed shape
            img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment)
            shapes = (h0, w0), ((h / h0, w / w0), pad)  # for COCO mAP rescaling

            # Load labels
            labels = []
            x = self.labels[index]
            if x.size > 0:
                # Normalized xywh to pixel xyxy format, shifted by the letterbox padding
                labels = x.copy()
                labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0]  # pad width
                labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1]  # pad height
                labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0]
                labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1]

        if self.augment:
            # Augment imagespace (mosaic samples were already warped in load_mosaic)
            if not self.mosaic:
                img, labels = random_affine(img, labels,
                                            degrees=hyp['degrees'],
                                            translate=hyp['translate'],
                                            scale=hyp['scale'],
                                            shear=hyp['shear'])

            # Augment colorspace: random hue / saturation / value gains, in place
            augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])

            # Apply cutouts
            # if random.random() < 0.9:
            # labels = cutout(img, labels)

        nL = len(labels)  # number of labels
        if nL:
            # convert xyxy to xywh
            labels[:, 1:5] = xyxy2xywh(labels[:, 1:5])

            # Normalize coordinates 0 - 1
            labels[:, [2, 4]] /= img.shape[0]  # height
            labels[:, [1, 3]] /= img.shape[1]  # width

        if self.augment:
            # random left-right flip
            lr_flip = True
            if lr_flip and random.random() < 0.5:
                img = np.fliplr(img)
                if nL:
                    labels[:, 1] = 1 - labels[:, 1]

            # random up-down flip (disabled)
            ud_flip = False
            if ud_flip and random.random() < 0.5:
                img = np.flipud(img)
                if nL:
                    labels[:, 2] = 1 - labels[:, 2]

        # Column 0 is left at zero here; collate_fn writes the image index into it
        labels_out = torch.zeros((nL, 6))
        if nL:
            labels_out[:, 1:] = torch.from_numpy(labels)

        # Convert
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
        img = np.ascontiguousarray(img)

        return torch.from_numpy(img), labels_out, self.img_files[index], shapes

    @staticmethod
    def collate_fn(batch):
        """Merge a list of samples into one batch for the DataLoader.

        Column 0 of every label row is set to the position of its image in the
        batch, so that after concatenation each row still identifies its image,
        e.g. rows `[0, ...], [0, ...], [1, ...], [2, ...]` mean the first two
        boxes belong to image 0, the third to image 1 and the fourth to image 2.

        Returns:
            (stacked images, concatenated labels, tuple of paths, tuple of shapes)
        """
        img, label, path, shapes = zip(*batch)  # transposed
        for i, l in enumerate(label):
            l[:, 0] = i  # add target image index for build_targets()
        return torch.stack(img, 0), torch.cat(label, 0), path, shapes


def load_image(self, index):
    """Load one dataset image, resized so its longer side equals `self.img_size`.

    Args:
        self: dataset object providing `imgs`, `img_files`, `img_size`, `augment`
            and, for cached images, `img_hw0` / `img_hw`.
        index: position of the image in the dataset.

    Returns:
        (img, (h0, w0), (h, w)): the BGR image, its original height/width and
        its height/width after resizing.
    """
    cached = self.imgs[index]
    if cached is not None:
        # already in memory together with its recorded sizes
        return cached, self.img_hw0[index], self.img_hw[index]  # img, hw_original, hw_resized

    path = self.img_files[index]
    img = cv2.imread(path)  # BGR
    assert img is not None, 'Image Not Found ' + path
    h0, w0 = img.shape[:2]  # orig hw
    r = self.img_size / max(h0, w0)  # resize image to img_size
    if r != 1:
        # INTER_AREA only when shrinking without augmentation, otherwise INTER_LINEAR
        if r < 1 and not self.augment:
            interp = cv2.INTER_AREA
        else:
            interp = cv2.INTER_LINEAR
        img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp)
    return img, (h0, w0), img.shape[:2]  # img, hw_original, hw_resized


def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5):
    """Randomly scale hue, saturation and value of a BGR image, in place.

    Each channel gets a gain drawn uniformly from `1 +/- gain`. The result is
    written back into `img` (via `dst=img`); nothing is returned.

    Args:
        img: BGR image array, modified in place.
        hgain: maximum relative change of hue.
        sgain: maximum relative change of saturation.
        vgain: maximum relative change of value.
    """
    gains = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1  # random gains
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hue, sat, val = cv2.split(hsv)
    out_dtype = img.dtype  # uint8

    # One lookup table per channel: hue wraps at 180, the others saturate at 255
    levels = np.arange(0, 256, dtype=np.int16)
    hue_table = ((levels * gains[0]) % 180).astype(out_dtype)
    sat_table = np.clip(levels * gains[1], 0, 255).astype(out_dtype)
    val_table = np.clip(levels * gains[2], 0, 255).astype(out_dtype)

    adjusted = (cv2.LUT(hue, hue_table), cv2.LUT(sat, sat_table), cv2.LUT(val, val_table))
    img_hsv = cv2.merge(adjusted).astype(out_dtype)
    cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)  # no return needed

def load_mosaic(self, index):
    """Build a 4-image mosaic sample around a random centre point.

    The image at `index` and three randomly chosen images are pasted into the
    four quadrants of a (2 * img_size, 2 * img_size) canvas filled with 114.
    Labels are converted from normalised xywh to pixel xyxy on the canvas, and
    the canvas is then passed through `random_affine` with
    `border=self.mosaic_border`.

    Args:
        self: dataset object providing `img_size`, `mosaic_border`, `labels`,
            `hyp` and whatever `load_image` needs.
        index: dataset index of the first (top-left) image.

    Returns:
        (img4, labels4): the augmented mosaic image and its labels as
        `[cls, x1, y1, x2, y2]` rows in pixels.
    """
    labels4 = []
    s = self.img_size
    # random mosaic centre
    yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border]  # mosaic center y, x
    indices = [index] + [random.randint(0, len(self.labels) - 1) for _ in range(3)]  # 3 additional image indices
    for i, img_index in enumerate(indices):
        # Load image
        img, _, (h, w) = load_image(self, img_index)

        # place img in img4
        # (x1a, y1a, x2a, y2a) is the destination rectangle on the canvas,
        # (x1b, y1b, x2b, y2b) is the source rectangle on the tile.
        if i == 0:  # top left
            img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
            x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
        elif i == 1:  # top right
            x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
        elif i == 2:  # bottom left
            x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
            # The source x-range ends at the tile's right edge `w`. The previous
            # max(xc, w) could exceed the tile width and only produced the right
            # crop because NumPy clamps slice bounds.
            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
        elif i == 3:  # bottom right
            x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

        # paste the cropped part of the tile onto the canvas
        img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
        # offset of the tile origin on the canvas, used to shift the labels
        padw = x1a - x1b
        padh = y1a - y1b

        # Labels
        x = self.labels[img_index]
        labels = x.copy()
        if x.size > 0:  # Normalized xywh to pixel xyxy format
            labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw
            labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh
            labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw
            labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh
        labels4.append(labels)

    # Concat/clip labels
    if len(labels4):
        labels4 = np.concatenate(labels4, 0)
        # np.clip(labels4[:, 1:] - s / 2, 0, s, out=labels4[:, 1:]) # use with center crop
        np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:])  # use with random_affine

        # Replicate
        # img4, labels4 = replicate(img4, labels4)

    # Augment
    # img4 = img4[s // 2: int(s * 1.5), s // 2:int(s * 1.5)] # center crop (WARNING, requires box pruning)

    # The canvas is (2 * img_size, 2 * img_size); random_affine warps it and,
    # through the negative border, outputs an (img_size, img_size) image.
    img4, labels4 = random_affine(img4, labels4,
                                  degrees=self.hyp['degrees'],
                                  translate=self.hyp['translate'],
                                  scale=self.hyp['scale'],
                                  shear=self.hyp['shear'],
                                  border=self.mosaic_border)  # border to remove
    return img4, labels4


def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
    """Resize an image to fit `new_shape` and pad the remainder with `color`.

    Args:
        img: image array of shape (height, width, ...).
        new_shape: target (height, width), or a single int for a square.
        color: border colour used for the padding.
        auto: pad only up to the minimum rectangle (padding taken modulo 64).
        scaleFill: when `auto` is off, stretch to `new_shape` with no padding.
        scaleup: allow enlarging; when False the image is only ever shrunk.

    Returns:
        (img, ratio, (dw, dh)): the letterboxed image, the (width, height)
        scale ratios applied, and the padding added on EACH side along width
        and height.
    """
    # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232
    src_h, src_w = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Scale ratio (new / old)
    r = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # Compute padding
    ratio = (r, r)  # width, height ratios
    new_unpad = (int(round(src_w * r)), int(round(src_h * r)))
    dw = dst_w - new_unpad[0]  # total width padding
    dh = dst_h - new_unpad[1]  # total height padding
    if auto:
        # minimum rectangle
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)
    elif scaleFill:
        # stretch to the target shape, no padding at all
        dw, dh = 0.0, 0.0
        new_unpad = (dst_w, dst_h)
        ratio = (dst_w / src_w, dst_h / src_h)  # width, height ratios

    # divide padding into 2 sides
    dw, dh = dw / 2, dh / 2

    if (src_w, src_h) != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # the -0.1 / +0.1 offsets make an odd total split as floor / ceil
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)


# import torchvision
# torchvision.transforms.RandomAffine

def random_affine(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, border=(0, 0)):
    """Apply a random rotation/scale/translation/shear to an image and its boxes.

    Args:
        img: image array of shape (h, w, c).
        targets: array of `[cls, x1, y1, x2, y2]` rows in pixels (may be empty).
        degrees: rotation is drawn uniformly from [-degrees, degrees].
        translate: translation is drawn from [-translate, translate] times the
            image width / height.
        scale: scale factor is drawn uniformly from [1 - scale, 1 + scale].
        shear: x and y shear angles are drawn from [-shear, shear] degrees.
        border: (y, x) border; twice this value is added to the output height /
            width, so a negative border crops the result.

    Returns:
        (img, targets): the warped image and the surviving targets with their
        xyxy columns replaced by the warped, clipped boxes.
    """
    # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
    # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4
    # targets = [cls, xyxy]

    height = img.shape[0] + border[0] * 2  # shape(h,w,c)
    width = img.shape[1] + border[1] * 2

    # Rotation and Scale
    # (rotation about the image centre combined with uniform scaling)
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations
    s = random.uniform(1 - scale, 1 + scale)
    # s = 2 ** random.uniform(-scale, scale)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s)

    # Translation
    # (random shift plus the fixed border offset)
    T = np.eye(3)
    T[0, 2] = random.uniform(-translate, translate) * img.shape[1] + border[1]  # x translation (pixels)
    T[1, 2] = random.uniform(-translate, translate) * img.shape[0] + border[0]  # y translation (pixels)

    # Shear
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # Combined rotation matrix
    # (compose the three transforms and warp the image with the result)
    M = S @ T @ R  # ORDER IS IMPORTANT HERE!!
    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
        img = cv2.warpAffine(img, M[:2], dsize=(width, height), flags=cv2.INTER_LINEAR, borderValue=(114, 114, 114))

    # Transform label coordinates
    n = len(targets)
    if n:
        # warp points
        # (all four corners of every box, in homogeneous coordinates)
        xy = np.ones((n * 4, 3))
        xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
        xy = (xy @ M.T)[:, :2].reshape(n, 8)

        # create new boxes
        # (axis-aligned bounding box of the four warped corners)
        x = xy[:, [0, 2, 4, 6]]
        y = xy[:, [1, 3, 5, 7]]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T

        # # apply angle-based reduction of bounding boxes
        # radians = a * math.pi / 180
        # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
        # x = (xy[:, 2] + xy[:, 0]) / 2
        # y = (xy[:, 3] + xy[:, 1]) / 2
        # w = (xy[:, 2] - xy[:, 0]) * reduction
        # h = (xy[:, 3] - xy[:, 1]) * reduction
        # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T

        # reject warped points outside of image
        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
        w = xy[:, 2] - xy[:, 0]
        h = xy[:, 3] - xy[:, 1]
        area = w * h
        area0 = (targets[:, 3] - targets[:, 1]) * (targets[:, 4] - targets[:, 2])
        ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))  # aspect ratio
        # keep boxes wider and taller than 2 px, with more than 20% of the
        # (scale-adjusted) original area left, and an aspect ratio below 20
        i = (w > 2) & (h > 2) & (area / (area0 * s + 1e-16) > 0.2) & (ar < 20)

        targets = targets[i]
        targets[:, 1:5] = xy[i]

    return img, targets


def create_folder(path='./new_folder'):
    """Create an empty directory at `path`, deleting any existing one first.

    Args:
        path: directory to (re)create; intermediate directories are created too.
    """
    already_present = os.path.exists(path)
    if already_present:
        # wipe the previous output folder and everything inside it
        shutil.rmtree(path)
    os.makedirs(path)  # make new output folder
以上是我根據ultralytics/yolov5的dataset.py代碼以及自己的理解所寫，如果有錯，歡迎指正，謝謝。
現在yolov5還在改進，一些代碼隨時會更新，例如train.py等，後續我會更新yolov5其他代碼的解析和註釋。