pytorch實現目標檢測目標檢測算法首先要實現數據的讀入,即實現Dataset
和DataLoader
兩個類。
藉助pycocotools
實現了CoCo2017用於目標檢測數據的讀取,並使用cv2
顯示。
使用cv2
顯示讀入數據,或者要送入到網絡的數據應該有三個部分:圖像('img')、標註('ann',包含邊界框和類別索引)以及縮放比例('scale')。
在目標檢測中,通常將圖像進行縮放,使其尺寸滿足一定要求,具體可以參考之前的博客。也就是要實現一個Resizer()
的類進行變換。此外,一般還要對圖像進行標準化處理,以及水平翻轉等變換。所以,在實現Dataset時要實現的變換有三個: Resizer()
、Normilizer()
和Augmenter()
。
Python中讀入的圖像數據通常都是 H x W x nChannels 的numpy數組。常規的做法是使用Dataset
中的transform
對數據進行轉換,輸出torch類型的數組。
因爲CoCo數據集中圖像的尺寸不一致,不能直接得到Nx3xHeight x Width類型的數組,所以要重寫DataLoader
中的collate_fn
,將一個minibatch中的圖像尺寸調整一致。如果想要按照圖像的長寬比進行分組採樣,就要重寫DataLoader
中的batch_sampler
,
batch_sampler
與DataLoader
中的batch_size, shuffle, sampler, and drop_last
參數是不兼容的,即在DataLoader
中使用了batch_sampler
,參數就不能再設置batch_size, shuffle, sampler, and drop_last
參數。
coco.getImgIds()
返回了圖像索引數組,能夠分別結合coco.loadImgs()
和coco.getAnnIds()
分別得到圖像、BBs和類型的具體信息。
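要注意的事情有:
1. CoCo的類別id不連續,因此要重新映射爲從0開始的連續索引(0到79);
2. CoCo標註的邊界框格式爲(x1, y1, width, height),要轉換爲(x1, y1, x2, y2);
3. 寬或高小於1的標註要跳過。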
下面就是一個簡單的SimpleCoCoDataset
類。
class SimpleCoCoDataset(Dataset):
    """COCO detection dataset yielding ``{'img': ..., 'ann': ...}`` samples.

    Images are read from ``<rootdir>/images/<set_name>/`` and annotations
    from ``<rootdir>/annotations/instances_<set_name>.json``.

    Attributes:
        classes: ``{category name: new index}``.
        coco_labels: ``{new index: COCO category id}``.
        coco_labels_inverse: ``{COCO category id: new index}``.
        labels: ``{new index: category name}``.
    """

    def __init__(self, rootdir, set_name='val2017', transform=None):
        """Load the annotation file and build the category index tables.

        Args:
            rootdir: dataset root directory.
            set_name: split name, e.g. ``'val2017'``.
            transform: optional callable applied to every sample dict.
        """
        self.rootdir, self.set_name = rootdir, set_name
        self.transform = transform
        self.coco = COCO(os.path.join(self.rootdir, 'annotations',
                                      'instances_' + self.set_name + '.json'))
        self.image_ids = self.coco.getImgIds()
        self.load_classes()

    def load_classes(self):
        """Build the mappings between COCO category ids, names and indices.

        COCO category ids are not contiguous, so categories are sorted by id
        and re-indexed contiguously starting from 0.
        """
        # sorted() leaves the list returned by the COCO API untouched.
        categories = sorted(self.coco.loadCats(self.coco.getCatIds()),
                            key=lambda cat: cat['id'])

        self.classes, self.coco_labels, self.coco_labels_inverse = {}, {}, {}
        for cat in categories:
            new_index = len(self.classes)
            self.coco_labels[new_index] = cat['id']
            self.coco_labels_inverse[cat['id']] = new_index
            self.classes[cat['name']] = new_index

        self.labels = {index: name for name, index in self.classes.items()}

    def __len__(self):
        """Return the number of images in the split."""
        return len(self.image_ids)

    def __getitem__(self, index):
        """Return the (optionally transformed) sample at ``index``."""
        sample = {'img': self.load_image(index), 'ann': self.load_anns(index)}
        if self.transform:
            sample = self.transform(sample)
        return sample

    @staticmethod
    def _as_rgb(img):
        """Return ``img`` with three channels, replicating a 2-D gray image."""
        if img.ndim == 2:
            return np.stack((img, img, img), axis=-1)
        return img

    def load_image(self, index):
        """Read image ``index`` as a float32 H x W x 3 array scaled by 1/255."""
        image_info = self.coco.loadImgs(self.image_ids[index])[0]
        imgpath = os.path.join(self.rootdir, 'images', self.set_name,
                               image_info['file_name'])
        img = skimage.io.imread(imgpath)
        # Grayscale files are read as 2-D arrays; the transforms downstream
        # expect three channels.
        img = self._as_rgb(img)
        return img.astype(np.float32) / 255.0

    def load_anns(self, index):
        """Return the non-crowd annotations of image ``index``.

        Returns:
            A float64 array of shape ``(num_anns, 5)`` whose columns are
            ``(x1, y1, x2, y2, new_index)``; shape ``(0, 5)`` when the image
            has no usable annotation.
        """
        annotation_ids = self.coco.getAnnIds(self.image_ids[index], iscrowd=False)
        if len(annotation_ids) == 0:
            return np.zeros((0, 5))

        rows = []
        for a in self.coco.loadAnns(annotation_ids):
            x1, y1, width, height = a['bbox']
            # Skip annotations with width or height < 1.
            if width < 1 or height < 1:
                continue
            # (x1, y1, width, height) --> (x1, y1, x2, y2)
            rows.append([x1, y1, x1 + width, y1 + height,
                         self.coco_labels_inverse[a['category_id']]])

        if not rows:
            return np.zeros((0, 5))
        return np.array(rows, dtype=np.float64)

    def image_aspect_ratio(self, index):
        """Return width / height of image ``index`` from the COCO metadata."""
        image = self.coco.loadImgs(self.image_ids[index])[0]
        return float(image['width']) / float(image['height'])
實現了兩種transform類型, Resizer()
和Normilizer()
。數據的均值爲[0.485, 0.456, 0.406]
,標準差爲:[0.229, 0.224, 0.225]
。利用數組廣播機制可以很容易寫出Normilizer()
:
class Normilizer(object):
    """Standardize an image with fixed per-channel mean and std."""

    def __init__(self):
        channel_mean = [0.485, 0.456, 0.406]
        channel_std = [0.229, 0.224, 0.225]
        # Shape (1, 1, 3) so the statistics broadcast over an H x W x 3 image.
        self.mean = np.asarray(channel_mean, dtype=np.float32).reshape(1, 1, 3)
        self.std = np.asarray(channel_std, dtype=np.float32).reshape(1, 1, 3)

    def __call__(self, sample):
        """Return a new sample dict with the image standardized.

        Only the 'img' and 'ann' entries are carried into the result; the
        annotations are passed through unchanged.
        """
        pixels = sample['img']
        boxes = sample['ann']
        centred = pixels.astype(np.float32) - self.mean
        return {'img': centred / self.std, 'ann': boxes}
Resizer()
類要返回原圖片被放縮的倍數。
class Resizer():
    """Rescale an image and its boxes, then zero-pad to a multiple of pad_N."""

    def __call__(self, sample, targetSize=608, maxSize=1024, pad_N=32):
        """Resize ``sample['img']`` and scale ``sample['ann']`` to match.

        The scale maps the smaller image side to ``targetSize``; if that would
        make the larger side exceed ``maxSize``, the scale maps the larger
        side to ``maxSize`` instead.

        Args:
            sample: dict with 'img' (H x W x C numpy array) and 'ann'
                (num_anns x 5 numpy array whose first four columns are box
                coordinates).
            targetSize: desired length of the smaller side.
            maxSize: upper bound for the larger side.
            pad_N: the padded height and width are multiples of this value.

        Returns:
            dict with 'img' (float32 torch tensor, padded with zeros at the
            bottom and right), 'ann' (torch tensor with scaled coordinates)
            and 'scale' (the scale factor that was applied).
        """
        image, anns = sample['img'], sample['ann']
        rows, cols = image.shape[:2]

        smaller_size, larger_size = min(rows, cols), max(rows, cols)
        scale = targetSize / smaller_size
        if larger_size * scale > maxSize:
            scale = maxSize / larger_size

        image = skimage.transform.resize(
            image.astype(np.float64),
            (int(round(rows * scale)), int(round(cols * scale))),
            mode='constant')
        rows, cols, cns = image.shape[:3]

        # Pad up to the next multiple of pad_N; a side that is already a
        # multiple gets no padding (instead of a full extra pad_N).
        pad_w, pad_h = (-cols) % pad_N, (-rows) % pad_N
        new_image = np.zeros((rows + pad_h, cols + pad_w, cns), dtype=np.float32)
        new_image[:rows, :cols, :] = image.astype(np.float32)

        # Scale a copy so the caller's annotation array is not modified.
        scaled_anns = anns.copy()
        scaled_anns[:, :4] *= scale

        return {'img': torch.from_numpy(new_image),
                'ann': torch.from_numpy(scaled_anns),
                'scale': scale}
batch_sampler 提供了從Dataset中進行採樣的方法,我們按照原始圖像的長寬比進行排序後分組採樣。這個類要繼承torch.utils.data.Sampler
類,並實現__len__()
和__iter__()
兩個方法。
drop_last
參數是指當數據集中樣本個數不能被batch_size
整除時,不能組成完整minibatch的樣本的處理方式,具體可以通過__len__()
方法控制長度實現(注意__iter__()產生的batch數量要與之保持一致)。
class AspectRatioBasedSampler(Sampler):
    """Batch sampler that groups images of similar aspect ratio.

    Indices are sorted by ``dataset.image_aspect_ratio`` and cut into
    consecutive groups of ``batch_size``; the order of the groups is shuffled
    on every iteration.
    """

    def __init__(self, dataset, batch_size, drop_last):
        """Store the configuration and pre-compute the index groups.

        Args:
            dataset: object providing ``__len__`` and
                ``image_aspect_ratio(index)``.
            batch_size: number of indices per group.
            drop_last: when true, a trailing incomplete group is discarded;
                otherwise it is filled by wrapping around to the start of the
                sorted order.
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.groups = self.group_images()

    def group_images(self):
        """Return the list of index groups, honouring ``drop_last``."""
        order = sorted(range(len(self.dataset)),
                       key=self.dataset.image_aspect_ratio)
        total = len(order)

        # With drop_last only whole groups are produced, so __iter__ and
        # __len__ agree.
        stop = total - total % self.batch_size if self.drop_last else total

        return [[order[x % total] for x in range(start, start + self.batch_size)]
                for start in range(0, stop, self.batch_size)]

    def __iter__(self):
        """Yield the groups in a freshly shuffled order."""
        random.shuffle(self.groups)
        yield from self.groups

    def __len__(self):
        """Return the number of groups yielded per iteration."""
        return len(self.groups)
通過batch_sampler
採樣得到的樣本數據,其圖像尺寸可能不完全一致,這時就需要用到collate_fn
參數指定被採樣樣本圖片尺寸的調整方式。通常的做法是,獲得這組樣本中圖片尺寸的最大值 \(Width_{max}\) 和 \(Height_{max}\),然後將該組樣本中所有圖像的尺寸填充爲 \(Height_{max}\times Width_{max}\),得到 \(BatchSize\times Height_{max}\times Width_{max}\times 3\) 的數組,最後調整維度順序,返回的圖像數據爲:\(BatchSize\times 3\times Height_{max}\times Width_{max}\)
此外,每一個樣本中的BBs的數量也可能不一樣,設BBs數量最大值爲 \(Ann_{max}\) ,也要將標籤和類型尺寸調整相同,對於BBs小於 \(Ann_{max}\) 的樣本,補充-1。最終返回標籤數據爲:\(BatchSize\times Ann_{max}\times 5\)
def collater(data):
    """Collate variable-sized samples into one zero/-1 padded batch.

    Args:
        data: list of sample dicts with 'img' (H x W x 3 tensor), 'ann'
            (num_anns x 5 tensor; the legacy key 'annot' is accepted too) and
            'scale'.

    Returns:
        dict with
        'img': float tensor of shape (batch, 3, max_height, max_width), each
            image placed in the top-left corner and zero padded;
        'ann': float tensor of shape (batch, max(max_num_anns, 1), 5), rows
            beyond a sample's own annotations filled with -1;
        'annot': the same tensor as 'ann', kept for callers using the old key;
        'scale': list of the per-sample scales.
    """
    imgs = [torch.as_tensor(s['img']) for s in data]
    # The dataset and the transforms emit 'ann'; fall back to 'annot' only
    # for samples produced by code that still uses the old key.
    annots = [torch.as_tensor(s['ann'] if 'ann' in s else s['annot'])
              for s in data]
    scales = [s['scale'] for s in data]

    batch_size = len(imgs)
    max_height = max(int(img.shape[0]) for img in imgs)
    max_width = max(int(img.shape[1]) for img in imgs)

    padded_imgs = torch.zeros(batch_size, max_height, max_width, 3)
    for i, img in enumerate(imgs):
        padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img

    # At least one row per sample so the result always has a valid shape.
    max_num_annots = max(annot.shape[0] for annot in annots)
    annot_padded = torch.full((batch_size, max(max_num_annots, 1), 5), -1.0)
    for idx, annot in enumerate(annots):
        if annot.shape[0] > 0:
            annot_padded[idx, :annot.shape[0], :] = annot

    # (batch, H, W, 3) -> (batch, 3, H, W)
    padded_imgs = padded_imgs.permute(0, 3, 1, 2)

    return {'img': padded_imgs, 'ann': annot_padded, 'annot': annot_padded,
            'scale': scales}
使用cv2
實現了數據的顯示。要注意從DataLoader
中獲得的數據是三部分的:
{'img': torch.tensor((batch_size, 3, height, width)), 'ann': torch.tensor((batch_size, num_ann, 5)), 'scale': [scalar, ...] }
其中'ann'的第五列是類型索引,需要結合SimpleCoCoDataset
類中的self.labels
得到對應的類型。
def my_coco_show(samples, labels):
    """Display every image of a collated batch with its boxes and class names.

    Each image that has at least one annotation is shown in its own window;
    the call blocks on a key press before moving to the next image.

    Args:
        samples: dict with 'img' (tensor of shape (batch, 3, H, W)) and 'ann'
            (tensor of shape (batch, num_ann, 5) with columns
            (x1, y1, x2, y2, class index), padding rows filled with -1).
        labels: mapping from class index to class name.
    """
    images, anns = samples['img'].numpy(), samples['ann'].numpy()
    std = np.array([[[0.229, 0.224, 0.225]]])
    mean = np.array([[[0.485, 0.456, 0.406]]])

    imgIdx = 1
    for img, ann in zip(images, anns):
        # Drop the -1 padding rows.
        ann = ann[ann[:, 4] != -1]
        if ann.shape[0] == 0:
            continue

        # Look up the class name of every box.
        classes = [labels[int(idx)] for idx in ann[:, 4]]

        # (3, H, W) -> (H, W, 3), then undo the standardization.
        img = np.transpose(img, (1, 2, 0)) * std + mean
        # The array derives from a transposed view and may not be
        # C-contiguous; OpenCV needs a contiguous buffer to draw in place.
        img = np.ascontiguousarray(img)

        for box, name in zip(ann, classes):
            p1 = (int(round(box[0])), int(round(box[1])))
            p2 = (int(round(box[2])), int(round(box[3])))
            cv2.rectangle(img, p1, p2, (255, 0, 0), 2)
            # image, text, origin, font, font scale, colour, thickness, line type
            cv2.putText(img, name, (p2[0] - 40, p2[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, 8)

        winName = str(imgIdx)
        cv2.namedWindow(winName, cv2.WINDOW_AUTOSIZE)
        cv2.moveWindow(winName, 10, 10)
        # Reverse the channel order (RGB -> BGR) for display.
        cv2.imshow(winName, np.ascontiguousarray(img[:, :, ::-1]))
        cv2.waitKey(0)
        cv2.destroyWindow(winName)
        imgIdx += 1