- 官方Mask-RCNN訓練教程的中文翻譯:手把手教你訓練自己的Mask R-CNN圖像實例分割模型(PyTorch官方教程)
- torchvision自帶的圖像分類、語義分割、目標檢測、實例分割、關鍵點檢測、視頻分類模型:TORCHVISION.MODELS
- torchvision Github項目地址: https://github.com/pytorch/vision
1. 準備工作
除了需要安裝pytorch和torchvision外,還需要安裝COCO的API pycocotools
import torch import os import numpy as np import cv2 import matplotlib.pyplot as plt from torchvision import datasets, transforms from PIL import Image from xml.dom.minidom import parse %matplotlib inline
2. 定義數據集
<annotation> <folder/> <filename></filename> <database/> <annotation/> <image/> <size> <height>1536</height> <width>2048</width> <depth>3</depth> </size> <segmented/> <object> <name>mark_type_1</name> <pose/> <truncated/> <difficult/> <bndbox> <xmin>341.4634146341463</xmin> <ymin>868.2926829268292</ymin> <xmax>813.4146341463414</xmax> <ymax>986.5853658536585</ymax> </bndbox> </object> <object> <name>mark_type_1</name> <pose/> <truncated/> <difficult/> <bndbox> <xmin>1301.2195121951218</xmin> <ymin>815.8536585365853</ymin> <xmax>1769.512195121951</xmax> <ymax>936.5853658536585</ymax> </bndbox> </object> </annotation>
class MarkDataset(torch.utils.data.Dataset):
    """Object-detection dataset pairing images with VOC-style XML annotations.

    ``root`` is expected to contain a ``JPEGImages`` directory and an
    ``Annotations`` directory.  Both directory listings are sorted and then
    matched by position, so the i-th image is paired with the i-th XML file.

    Args:
        root: Dataset root directory.
        transforms: Optional callable invoked as ``transforms(img, target)``
            that returns the transformed ``(img, target)`` pair.  Unlike the
            image-only transforms in ``torchvision.transforms`` it receives
            the target (including the boxes) as well; the ``transforms.py``
            in https://github.com/pytorch/vision/tree/master/references/detection
            shows how ``RandomHorizontalFlip`` updates the target.
    """

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # Sort both listings so that images and annotations stay aligned.
        self.imgs = sorted(os.listdir(os.path.join(root, "JPEGImages")))
        self.bbox_xml = sorted(os.listdir(os.path.join(root, "Annotations")))

    @staticmethod
    def _parse_annotation(xml_path):
        """Return ``(boxes, labels)`` read from one VOC-style XML file.

        ``boxes`` is a list of ``[xmin, ymin, xmax, ymax]`` floats and
        ``labels`` a list of ints, one entry per ``<object>`` element.
        """

        def text_of(node, tag):
            # Text content of the first <tag> element found under ``node``.
            return node.getElementsByTagName(tag)[0].childNodes[0].nodeValue

        boxes = []
        labels = []
        root_element = parse(xml_path).documentElement
        for obj in root_element.getElementsByTagName('object'):
            # The class id is the last character of <name>: mark_type_1 -> 1,
            # mark_type_2 -> 2 (0 is left for the background).  The builtin
            # int/float replace np.int/np.float, which were plain aliases of
            # the builtins and no longer exist in current NumPy releases.
            labels.append(int(text_of(obj, 'name')[-1]))
            bndbox = obj.getElementsByTagName('bndbox')[0]
            boxes.append(
                [float(text_of(bndbox, tag)) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
            )
        return boxes, labels

    @staticmethod
    def _build_target(boxes, labels, idx):
        """Assemble the target dict for sample ``idx`` from parsed boxes/labels."""
        # reshape(-1, 4) keeps the tensor two-dimensional even when the
        # annotation contains no objects, so the column indexing below works.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        # This is a detection-only dataset, so there is no "masks" entry.
        return {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            # Every instance is treated as "not crowd".
            "iscrowd": torch.zeros((len(labels),), dtype=torch.int64),
        }

    def __getitem__(self, idx):
        """Return ``(img, target)`` for sample ``idx``.

        ``img`` is the RGB PIL image (or whatever ``self.transforms`` returns
        for it) and ``target`` is a dict with the keys ``boxes``, ``labels``,
        ``image_id``, ``area`` and ``iscrowd``.
        """
        img_path = os.path.join(self.root, "JPEGImages", self.imgs[idx])
        bbox_xml_path = os.path.join(self.root, "Annotations", self.bbox_xml[idx])
        img = Image.open(img_path).convert("RGB")

        boxes, labels = self._parse_annotation(bbox_xml_path)
        target = self._build_target(boxes, labels, idx)

        if self.transforms is not None:
            # The target (boxes included) is transformed together with the image.
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.imgs)
3. 定義模型
# Example call: build torchvision's Faster R-CNN (ResNet-50 + FPN) detector with
# 3 output classes, without pretrained detection weights (pretrained=False) but
# with a pretrained backbone (pretrained_backbone=True).
torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, progress=True, num_classes=3, pretrained_backbone=True)
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor


def get_object_detection_model(num_classes):
    """Return a Faster R-CNN whose box head predicts ``num_classes`` classes.

    The model is created with ``pretrained=True`` and its box predictor is
    then replaced by a fresh ``FastRCNNPredictor`` sized for ``num_classes``.

    Args:
        num_classes: Number of output classes, counting the background class
            (e.g. 3 for mark_type_1 + mark_type_2 + background).

    Returns:
        The model with the replaced box predictor.
    """
    # Start from the pretrained detection model.
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # Input width of the existing classification layer; the new head must
    # accept features of the same size.
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    # Swap the pretrained head for one with the caller-requested class count.
    # (The argument is used as given instead of being overwritten with 3.)
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model
4. 數據增強
在圖像輸入到網絡前,需要對其進行數據增強。這裏需要注意的是,由於Faster R-CNN模型本身可以處理歸一化(默認使用ImageNet的均值和標準差來歸一化)及尺度變化的問題,因而無需在這裏進行mean/std normalization或圖像縮放的操作。
由於from torchvision import的transforms只能對圖片進行數據增強,而無法同時改變圖片對應的label標籤,因此我們選擇使用torchvision Github項目中的一些封裝好的用於模型訓練和測試的函數:https://github.com/pytorch/vision/tree/master/references/detection
class RandomHorizontalFlip(object):
    """Randomly mirror an image tensor and its detection target left-to-right."""

    def __init__(self, prob):
        # A call performs the flip when random.random() is below this value.
        self.prob = prob

    def __call__(self, image, target):
        """Return ``(image, target)``, flipped along the last axis or untouched.

        When the flip happens, ``target["boxes"]`` is updated in place (its
        x-coordinates are mirrored against the image width), and the optional
        ``"masks"`` and ``"keypoints"`` entries are replaced by flipped versions.
        """
        if not random.random() < self.prob:
            return image, target

        # Only the width is needed; the last two axes are (height, width).
        _, width = image.shape[-2:]
        mirrored = image.flip(-1)

        # Mirroring swaps the roles of xmin and xmax: new_xmin = W - old_xmax.
        boxes = target["boxes"]
        boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
        target["boxes"] = boxes

        if "masks" in target:
            target["masks"] = target["masks"].flip(-1)

        if "keypoints" in target:
            target["keypoints"] = _flip_coco_person_keypoints(target["keypoints"], width)

        return mirrored, target
import utils
import transforms as T
from engine import train_one_epoch, evaluate
# utils, transforms and engine are the utils.py, transforms.py and engine.py
# files that were downloaded earlier.


def get_transform(train):
    """Compose the transform pipeline for the training or evaluation split.

    The pipeline always begins with ``T.ToTensor()``; when ``train`` is truthy
    a ``T.RandomHorizontalFlip(0.5)`` step is appended so that images and
    their ground truth are flipped with probability 0.5 as augmentation.
    """
    # Step 1: turn the PIL image into a PyTorch tensor.
    pipeline = [T.ToTensor()]
    if train:
        # Step 2 (training only): random horizontal flip, 50% chance.
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)
5. 訓練模型
from engine import train_one_epoch, evaluate
import utils

root = r'數據集路徑'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Three classes: mark_type_1, mark_type_2 and background.
num_classes = 3

# Two views of the same data: with and without training-time augmentation.
dataset = MarkDataset(root, get_transform(train=True))
dataset_test = MarkDataset(root, get_transform(train=False))

# Random split: the last 100 shuffled indices are held out for evaluation.
# (Author's note: the dataset has 492 images, so this is roughly 4:1.)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-100])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-100:])

# Data loaders.  num_workers is left at its default: the author reports that
# a non-zero value raises an error when training inside a Jupyter notebook.
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=utils.collate_fn,
)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=2,
    shuffle=False,
    collate_fn=utils.collate_fn,
)

# Build the detector (get_object_detection_model(num_classes) is the
# alternative mentioned by the author) and move it to the chosen device.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    pretrained=False,
    progress=True,
    num_classes=num_classes,
    pretrained_backbone=True,
)
model.to(device)

# Optimise only the parameters that require gradients, using SGD.
params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.0003, momentum=0.9, weight_decay=0.0005)

# Cosine-annealing learning-rate schedule with warm restarts.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=1, T_mult=2
)

num_epochs = 31
for epoch in range(num_epochs):
    # One training epoch, logging every 50 iterations.  Author's note:
    # train_one_epoch in engine.py moves images and targets .to(device).
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=50)

    # Advance the learning-rate schedule once per epoch.
    lr_scheduler.step()

    # Evaluate on the held-out split.
    evaluate(model, data_loader_test, device=device)

    print('')
    print('==================================================')
    print('')

print("That's it!")
# Excerpt: during the first epoch only, lr_scheduler is replaced by the warmup
# scheduler built by utils.warmup_lr_scheduler.
if epoch == 0:
    # Warm up for at most 1000 iterations, capped at one less than the
    # number of batches in the loader.
    warmup_iters = min(1000, len(data_loader) - 1)
    warmup_factor = 1.0 / 1000

    lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
此外,由於我的數據集中的bbox的面積都比較大,因此area=small時的AP和AR都爲-1.000
# Serialize the whole model object (not only its state_dict) to the given path.
torch.save(model, r'保存路徑\modelname.pkl')
6. 查看效果
def showbbox(model, img):
    """Run ``model`` on one image and plot the image with its predicted boxes.

    Args:
        model: Detection model.  It is called with a list holding one image
            tensor and must return a list whose first element is a dict with
            ``'boxes'`` and ``'labels'`` tensors, for example::

                [{'boxes': tensor([[1492.6672, 238.4670, 1765.5385, 315.0320],
                                   [ 887.1390, 256.8106, 1154.6687, 330.2953]]),
                  'labels': tensor([1, 1]),
                  'scores': tensor([1.0000, 1.0000])}]
        img: Image tensor in (C, H, W) layout with values in the 0-1 range.

    Side effects:
        Switches ``model`` to eval mode, prints the raw prediction and draws
        a matplotlib figure.  Boxes whose label is neither 1 nor 2 are not
        drawn.
    """
    # label id -> (caption, colour tuple) used for both rectangle and text.
    label_styles = {
        1: ('mark_type_1', (255, 0, 0)),
        2: ('mark_type_2', (0, 255, 0)),
    }

    model.eval()

    # Run on whatever device the model's weights live on instead of relying
    # on a module-level ``device`` variable; fall back to the image's own
    # device for a model without parameters.
    first_param = next(iter(model.parameters()), None)
    run_device = img.device if first_param is None else first_param.device

    with torch.no_grad():
        prediction = model([img.to(run_device)])
        print(prediction)

    # (C, H, W) floats in 0-1 -> (H, W, C) uint8 in 0-255 for drawing.
    canvas = (img.permute(1, 2, 0) * 255).byte().detach().cpu().numpy()
    # permute() leaves the data in a non-contiguous layout; OpenCV's drawing
    # functions need a C-contiguous buffer to draw into.
    canvas = np.ascontiguousarray(canvas)

    detections = prediction[0]
    for box, label in zip(detections['boxes'].tolist(), detections['labels'].tolist()):
        style = label_styles.get(label)
        if style is None:
            continue
        caption, colour = style
        xmin, ymin, xmax, ymax = (round(value) for value in box)
        cv2.rectangle(canvas, (xmin, ymin), (xmax, ymax), colour, thickness=2)
        cv2.putText(canvas, caption, (xmin, ymin), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    colour, thickness=2)

    plt.figure(figsize=(20, 15))
    plt.imshow(canvas)
# Choose the device first so the checkpoint can be mapped directly onto it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# map_location lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only machine as well.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects a fully pickled model like this one - confirm against the installed
# version and pass weights_only=False there if required.
model = torch.load(r'保存路徑\modelname.pkl', map_location=device)
model.to(device)

# Visualise the predictions for the first image of the held-out split.
img, _ = dataset_test[0]
showbbox(model, img)