『計算機視覺』Mask-RCNN_訓練網絡其一：數據集與Dataset類

時間 2019-11-08

標籤計算機視覺 mask rcnn 訓練網絡其一數據 dataset 欄目系統網絡简体版

原文原文鏈接

本節介紹的數據集class構建爲官方demo。對從零開始構建自己的數據集進行訓練感興趣的讀者，建議在瞭解本文及本文對應的代碼文件後，再看一下『計算機視覺』Mask-RCNN_關鍵點檢測分支，該文介紹了將自己的數據構建爲Mask RCNN可用形式的實踐。

代碼位置

在腳本train_shapes.ipynb中，作者演示了使用合成圖片訓練Mask_RCNN的小demo，我們將以此爲例，從訓練數據的角度重新審視Mask_RCNN。

在訓練過程中，我們最先要做的是根據我們自己的數據集，繼承並改寫基礎的數據讀取class：utils.py中的Dataset class，然後根據數據集調整網絡配置文件config.py中的Config類，使得網絡形狀適配數據，然後再去考慮訓練的問題。按照邏輯流程，本節我們以train_shapes.ipynb中的數據生成爲例，學習Dataset class的運作機理。

在示例程序中，首先創建新的Dataset的子類（這裏貼出整個class代碼，後面會分節講解）：

class ShapesDataset(utils.Dataset):
    """Generates the shapes synthetic dataset. The dataset consists of simple
    shapes (triangles, squares, circles) placed randomly on a blank surface.
    The images are generated on the fly. No file access required.
    """

    def load_shapes(self, count, height, width):
        """Generate the requested number of synthetic images.
        count: number of images to generate.
        height, width: the size of the generated images.
        """
        # Add classes
        self.add_class("shapes", 1, "square")
        self.add_class("shapes", 2, "circle")
        self.add_class("shapes", 3, "triangle")

        # Add images
        # Generate random specifications of images (i.e. color and
        # list of shapes sizes and locations). This is more compact than
        # actual images. Images are generated on the fly in load_image().
        for i in range(count):
            bg_color, shapes = self.random_image(height, width)
            self.add_image("shapes", image_id=i, path=None,
                           width=width, height=height,
                           bg_color=bg_color, shapes=shapes)

    def load_image(self, image_id):
        """Generate an image from the specs of the given image ID.
        Typically this function loads the image from a file, but
        in this case it generates the image on the fly from the
        specs in image_info.

        Returns a uint8 array of shape [height, width, 3].
        """
        info = self.image_info[image_id]
        # Reshape the color to (1, 1, 3) so it broadcasts over every pixel.
        bg_color = np.array(info['bg_color']).reshape([1, 1, 3])
        image = np.ones([info['height'], info['width'], 3], dtype=np.uint8)
        image = image * bg_color.astype(np.uint8)
        # Shapes are drawn in list order, so later shapes paint over earlier ones.
        for shape, color, dims in info['shapes']:
            image = self.draw_shape(image, shape, dims, color)
        return image

    def image_reference(self, image_id):
        """Return the shapes data of the image.

        For images whose source is not "shapes", the lookup is delegated to
        the parent class and its result is returned.
        """
        info = self.image_info[image_id]
        if info["source"] == "shapes":
            return info["shapes"]
        # super() must be bound to the instance, and the parent's result has
        # to be returned; an unbound super(self.__class__) exposes no methods.
        return super(ShapesDataset, self).image_reference(image_id)

    def load_mask(self, image_id):
        """Generate instance masks for shapes of the given image ID.

        Returns a tuple (mask, class_ids):
        * mask: bool array of shape [height, width, number of shapes],
                one channel per shape.
        * class_ids: int32 array with one entry per shape, the index of the
                     shape name in self.class_names.
        """
        info = self.image_info[image_id]
        shapes = info['shapes']
        count = len(shapes)
        mask = np.zeros([info['height'], info['width'], count], dtype=np.uint8)
        for i, (shape, _, dims) in enumerate(shapes):
            mask[:, :, i:i+1] = self.draw_shape(mask[:, :, i:i+1].copy(),
                                                shape, dims, 1)
        # Handle occlusions: the last mask is kept whole and every earlier
        # mask loses the pixels covered by the masks after it. Skipped when
        # there are no shapes, since an empty stack has no last channel.
        if count:
            occlusion = np.logical_not(mask[:, :, -1]).astype(np.uint8)
            for i in range(count-2, -1, -1):
                mask[:, :, i] = mask[:, :, i] * occlusion
                occlusion = np.logical_and(occlusion, np.logical_not(mask[:, :, i]))
        # Map class names to class IDs.
        class_ids = np.array([self.class_names.index(s[0]) for s in shapes])
        # Use the builtin bool: the np.bool alias was removed in NumPy 1.24.
        return mask.astype(bool), class_ids.astype(np.int32)

    def draw_shape(self, image, shape, dims, color):
        """Draws a shape from the given specs.

        image: array the shape is drawn into (it is modified in place).
        shape: one of 'square', 'circle', 'triangle'; any other name leaves
               the image untouched.
        dims: tuple (x, y, s) with the center x, y and the size s.
        color: value passed to the OpenCV drawing call.
        Returns the image that was passed in.
        """
        # Get the center x, y and the size s
        x, y, s = dims
        if shape == 'square':
            cv2.rectangle(image, (x-s, y-s), (x+s, y+s), color, -1)
        elif shape == "circle":
            cv2.circle(image, (x, y), s, color, -1)
        elif shape == "triangle":
            # The float corner coordinates are truncated by the int32 cast.
            points = np.array([[(x, y-s),
                                (x-s/math.sin(math.radians(60)), y+s),
                                (x+s/math.sin(math.radians(60)), y+s),
                                ]], dtype=np.int32)
            cv2.fillPoly(image, points, color)
        return image

    def random_shape(self, height, width):
        """Generates specifications of a random shape that lies within
        the given height and width boundaries.
        Returns a tuple of three values:
        * The shape name (square, circle, ...)
        * Shape color: a tuple of 3 values, RGB.
        * Shape dimensions: A tuple of values that define the shape size
                            and location. Differs per shape type.

        Note: random.randint raises ValueError on an empty range, so this
        needs height // 4 >= 20 and both dimensions larger than 40.
        """
        # Shape
        shape = random.choice(["square", "circle", "triangle"])
        # Color
        color = tuple([random.randint(0, 255) for _ in range(3)])
        # Center x, y (kept at least `buffer` pixels away from the border)
        buffer = 20
        y = random.randint(buffer, height - buffer - 1)
        x = random.randint(buffer, width - buffer - 1)
        # Size
        s = random.randint(buffer, height//4)
        return shape, color, (x, y, s)

    def random_image(self, height, width):
        """Creates random specifications of an image with multiple shapes.
        Returns the background color of the image and a list of shape
        specifications that can be used to draw the image.
        """
        # Pick random background color
        bg_color = np.array([random.randint(0, 255) for _ in range(3)])
        # Generate a few random shapes and record their
        # bounding boxes
        shapes = []
        boxes = []
        N = random.randint(1, 4)
        for _ in range(N):
            shape, color, dims = self.random_shape(height, width)
            shapes.append((shape, color, dims))
            x, y, s = dims
            boxes.append([y-s, x-s, y+s, x+s])
        # Apply non-max suppression with 0.3 threshold to avoid
        # shapes covering each other
        keep_ixs = utils.non_max_suppression(np.array(boxes), np.arange(N), 0.3)
        shapes = [s for i, s in enumerate(shapes) if i in keep_ixs]
        return bg_color, shapes

1、原始數據信息錄入

然後調用如下方法（IMAGE_SHAPE=[128 128 3]，介紹config時會提到），準備訓練用數據和驗證集數據。注意，此時僅僅是在做準備，並未真正生成或讀入圖片數據：

# Both splits use the image size configured for the network.
image_height, image_width = config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1]

# Training split: 500 synthetic image specifications
dataset_train = ShapesDataset()
dataset_train.load_shapes(500, image_height, image_width)
dataset_train.prepare()

# Validation split: 50 synthetic image specifications
dataset_val = ShapesDataset()
dataset_val.load_shapes(50, image_height, image_width)
dataset_val.prepare()

其調用的load_shapes方法如下：

    def load_shapes(self, count, height, width):
        """Register the shape classes and `count` synthetic image specs.

        count: how many image specifications to create.
        height, width: size recorded for every generated image.
        """
        # Register the three shape categories under the "shapes" source,
        # numbered from 1.
        for class_id, class_name in enumerate(("square", "circle", "triangle"), 1):
            self.add_class("shapes", class_id, class_name)

        # Only a compact random specification (background color plus the
        # list of shapes) is stored per image, not pixel data.
        for index in range(count):
            background, shape_specs = self.random_image(height, width)
            self.add_image(
                "shapes",
                image_id=index,
                path=None,
                width=width,
                height=height,
                bg_color=background,
                shapes=shape_specs,
            )

這裏涉及了兩個從父類繼承來的方法self.add_class和self.add_image，我們去utils.py中的Dataset class看一看：

class Dataset(object):
    """Base class that dataset-specific classes derive from.

    Subclass it and add the functions your own dataset needs, e.g.:

    class CatsAndDogsDataset(Dataset):
        def load_cats_and_dogs(self):
            ...
        def load_mask(self, image_id):
            ...
        def image_reference(self, image_id):
            ...

    See COCODataset and ShapesDataset as examples.
    """

    def __init__(self, class_map=None):
        # class_map is accepted but not used here.
        self._image_ids = []
        self.image_info = []
        # The background class always occupies the first slot.
        self.class_info = [{"source": "", "id": 0, "name": "BG"}]
        self.source_class_ids = {}

    def add_class(self, source, class_id, class_name):
        """Register a class unless the same source/class_id pair exists."""
        assert "." not in source, "Source name cannot contain a dot"
        already_known = any(
            entry['source'] == source and entry["id"] == class_id
            for entry in self.class_info
        )
        if already_known:
            # This source.class_id combination was registered before.
            return
        self.class_info.append(
            {"source": source, "id": class_id, "name": class_name})

    def add_image(self, source, image_id, path, **kwargs):
        """Append one image record; extra keyword arguments become keys."""
        record = dict({"id": image_id, "source": source, "path": path},
                      **kwargs)
        self.image_info.append(record)

也就是說，在Dataset中有self.image_info 和 self.class_info 兩個list，它們的元素都是固定key的字典：

"source"對應數據集名稱

"id"對應本數據集內當前圖片/類別標號

"path"僅image_info含有，對應圖像路徑，可爲None

"name"僅class_info含有，對應類別描述

在後面的prepare方法中我們可以進一步瞭解，使用source.id作key，可以索引到一個內建的新的internal id，這也向我們解釋了文檔中所說的Mask_RCNN支持多個數據集同時訓練的由來。

回到load_shapes方法，self.random_image方法爲新建方法，這裏作者使用算法生成圖像做訓練，該方法返回生成圖像的函數所需的隨機參數。之後調用add_image時傳入path爲None，也是因爲數據並非從磁盤讀取，而是自己生成；同時傳入了self.random_image方法返回的額外生成參數（我們不必關心具體參數是什麼），它們作爲關鍵字參數被收集爲字典，添加進self.image_info中：

        # One random specification per requested image; the extra keyword
        # arguments are stored alongside id/source/path in image_info.
        for index in range(count):
            background, shape_specs = self.random_image(height, width)
            self.add_image(
                "shapes", image_id=index, path=None,
                width=width, height=height,
                bg_color=background, shapes=shape_specs,
            )

從這裏，咱們進一步瞭解了self.image_info的含義，記錄每一張圖片的id信息（"source"和"id"），記錄每一張圖片的數據信息（如何獲取圖像矩陣的線索，包含"path"或者其餘的字典索引，只要保證後面能實現函數，根據這個信息獲取圖片數據便可）。

2、數據信息整理

在初始化了 self.image_info 和 self.class_info 兩個list以後，Dataset已經記錄了原始的類別信息和圖像信息，調用prepare方法進行規範化，

    def prepare(self, class_map=None):
        """Build the derived lookup tables from class_info and image_info.

        Call this after all classes and images have been added.

        TODO: class map is not supported yet. When done, it should handle mapping
              classes from different datasets to the same class ID.
        """

        def clean_name(name):
            """Keep only the text before the first comma, for shorter display."""
            return name.split(",")[0]

        def source_key(record):
            """Build the "source.id" key of a class or image record."""
            return "{}.{}".format(record['source'], record['id'])

        class_records = self.class_info
        image_records = self.image_info

        # Counts, internal IDs (positions in the info lists) and short names.
        self.num_classes = len(class_records)
        self.class_ids = np.arange(self.num_classes)    # internal class IDs
        self.class_names = [clean_name(record["name"]) for record in class_records]
        self.num_images = len(image_records)
        self._image_ids = np.arange(self.num_images)    # internal image IDs

        # "source.id" -> internal ID, for classes and for images.
        self.class_from_source_map = {
            source_key(record): internal_id
            for record, internal_id in zip(class_records, self.class_ids)
        }
        self.image_from_source_map = {
            source_key(record): internal_id
            for record, internal_id in zip(image_records, self.image_ids)
        }

        # For each source, the internal class IDs it supports. The first
        # class (background) is included for every source.
        self.sources = list(set(record['source'] for record in class_records))
        self.source_class_ids = {
            source: [
                index for index, record in enumerate(class_records)
                if index == 0 or source == record['source']
            ]
            for source in self.sources
        }

類別信息記錄

將"source.id"映射爲惟一的internal IDs，並將所有的internal IDs存儲在self.class_ids

source_class_ids，記錄下每個"source"對應的internal IDs

class_from_source_map，記錄下"source.id"：internal IDs的映射關係

print(dataset_train.class_info)  # raw info dict of every class
print(dataset_train.class_ids)   # internal class IDs
print(dataset_train.source_class_ids)  # internal class IDs available to each source dataset
print(dataset_train.class_from_source_map)  # mapping from "source.id" to internal class ID

輸出以下：

[{'source': '', 'id': 0, 'name': 'BG'}, 
 {'source': 'shapes', 'id': 1, 'name': 'square'}, 
 {'source': 'shapes', 'id': 2, 'name': 'circle'}, 
 {'source': 'shapes', 'id': 3, 'name': 'triangle'}]
[0 1 2 3]
{'': [0], 'shapes': [0, 1, 2, 3]}
{'.0': 0, 'shapes.1': 1, 'shapes.2': 2, 'shapes.3': 3}

有固定的source爲空的類別0（id和internal ID都是0），標記爲背景，會添加進source_class_ids中所有數據集對應的類別中（上面"shapes"數據集我們僅定義了3個類，在映射中多了一個0變成4個類）。

圖片信息記錄

圖片信息不像類別同樣麻煩，咱們簡單輸出三張，

# Training dataset (only three images, to keep the printed output short)
dataset_train = ShapesDataset()
dataset_train.load_shapes(3, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])
dataset_train.prepare()

print(dataset_train.image_info)  # raw info dict of every image
print(dataset_train.image_ids)   # internal image IDs
print(dataset_train.image_from_source_map)  # mapping from "source.id" to internal image ID

結果以下，

[{'id': 0, 'source': 'shapes', 'path': None, 'width': 128, 'height': 128, 'bg_color': array([163, 143, 173]), 
  'shapes': [('circle', (178, 140, 65), (83, 104, 20)), ('circle', (192, 52, 82), (48, 58, 20))]}, 
 {'id': 1, 'source': 'shapes', 'path': None, 'width': 128, 'height': 128, 'bg_color': array([ 5, 99, 71]), 
  'shapes': [('triangle', (90, 32, 55), (39, 21, 22)), ('circle', (214, 49, 173), (39, 78, 21))]}, 
 {'id': 2, 'source': 'shapes', 'path': None, 'width': 128, 'height': 128, 'bg_color': array([138,  52,  83]), 
  'shapes': [('circle', (180, 74, 150), (105, 45, 27))]}]
[0 1 2]
{'shapes.0': 0, 'shapes.1': 1, 'shapes.2': 2}

【注1】因爲這是圖像檢測任務而非圖像分類任務，故每張圖片僅僅和歸屬數據集存在映射，和類別信息沒有直接映射。圖像上的目標和類別才存在映射關係，不過那不在本部分函數涉及範圍內。

【注2】internal IDs實際上就是info的索引數組，使用internal IDs的值能夠直接索引對應圖片順序的info信息。

總結，在調用self.prepare以前，經過本身的新建方法調用self.add_class()和self.add_image()，將圖片和分類的原始信息以dict的形式添加到class_info與image_info兩個list中，便可。

3、獲取圖片

而後咱們獲取一些樣例圖片進行展現，

# Load and display random samples
# Draw four internal image IDs at random (np.random.choice samples with
# replacement by default, so the same image may be picked more than once).
image_ids = np.random.choice(dataset_train.image_ids, 4)
for image_id in image_ids:
    # Build the image and its per-instance masks from the stored specs.
    image = dataset_train.load_image(image_id)
    mask, class_ids = dataset_train.load_mask(image_id)
    visualize.display_top_masks(image, mask, class_ids, dataset_train.class_names)
    print(image.shape, mask.shape, class_ids, dataset_train.class_names)

由上面代碼咱們能夠獲悉以下信息：

使用self.image_ids即internal IDs進行圖片選取

自行實現load_image方法，獲取圖片internal IDs，索引圖片原始信息（info），利用原始信息輸出圖片

自行實現load_mask方法，獲取圖片internal IDs，索引圖片原始信息（info），利用原始信息輸出圖片的masks和對應internal類別，注意一張圖片能夠有多個mask並分別對應本身的類別

上述代碼輸出以下（僅展現前兩張），

下面貼出load_image和load_mask方法（詳見train_shapes.ipynb），具體實現不是重點，畢竟我們也不是在研究怎麼畫2D圖，重點在於上面提到的它們的功能，這涉及到我們遷移到自己的數據時如何實現接口。load_image方法返回一張圖片，load_mask方法返回（h，w，c）的01掩碼以及（c，）的class id，注意，c指的是該張圖片中instance的數目

    def load_image(self, image_id):
        """Build the image for `image_id` from its stored specification.

        A dataset would normally read the image from a file here; this one
        paints it on the fly from the entry in image_info. Returns a uint8
        array of shape [height, width, 3].
        """
        spec = self.image_info[image_id]
        # Background color as uint8, shaped (1, 1, 3) so that it broadcasts
        # over every pixel of the canvas.
        background = np.array(spec['bg_color']).reshape([1, 1, 3]).astype(np.uint8)
        canvas = np.ones([spec['height'], spec['width'], 3], dtype=np.uint8) * background
        # Shapes are drawn in list order, each call receiving the result of
        # the previous one.
        for shape_name, shape_color, shape_dims in spec['shapes']:
            canvas = self.draw_shape(canvas, shape_name, shape_dims, shape_color)
        return canvas


    def load_mask(self, image_id):
        """Generate instance masks for shapes of the given image ID.

        Returns a tuple (mask, class_ids):
        * mask: bool array of shape [height, width, number of shapes],
                one channel per shape.
        * class_ids: int32 array with one entry per shape, the index of the
                     shape name in self.class_names.
        """
        info = self.image_info[image_id]
        shapes = info['shapes']
        count = len(shapes)
        mask = np.zeros([info['height'], info['width'], count], dtype=np.uint8)
        for i, (shape, _, dims) in enumerate(shapes):
            mask[:, :, i:i+1] = self.draw_shape(mask[:, :, i:i+1].copy(),
                                                shape, dims, 1)
        # Handle occlusions: the last mask is kept whole and every earlier
        # mask loses the pixels covered by the masks after it. Skipped when
        # there are no shapes, since an empty stack has no last channel.
        if count:
            occlusion = np.logical_not(mask[:, :, -1]).astype(np.uint8)
            for i in range(count-2, -1, -1):
                mask[:, :, i] = mask[:, :, i] * occlusion
                occlusion = np.logical_and(occlusion, np.logical_not(mask[:, :, i]))
        # Map class names to class IDs.
        class_ids = np.array([self.class_names.index(s[0]) for s in shapes])
        # Use the builtin bool: the np.bool alias was removed in NumPy 1.24.
        return mask.astype(bool), class_ids.astype(np.int32)

小結

正如Dataset註釋所說，要想運行本身的數據集，咱們首先要實現一個方法（load_shapes，根據數據集取名便可）收集原始圖像、類別信息，而後實現兩個方法（load_image、load_mask）分別實現獲取單張圖片數據、獲取單張圖片對應的objs的masks和classes，這樣基本完成了數據集類的構建。

The base class for dataset classes.
To use it, create a new class that adds functions specific to the dataset
you want to use. For example:

class CatsAndDogsDataset(Dataset):
 def load_cats_and_dogs(self):
 ...
 def load_mask(self, image_id):
 ...
 def image_reference(self, image_id):
 ...

See COCODataset and ShapesDataset as examples.