An intriguing failing of convolutional neural networks and the CoordConv solution

NeurIPS 2018

NeurIPS 2018

2019-10-10 15:01:48

Paper

Official TensorFlow Code

Unofficial PyTorch Code



機器之心:卷積神經網絡「失陷」,CoordConv 來填坑 翻譯




1. 給定 feature map 和座標 (x, y),如何生成對應的 relative coordinate map?

The following code is from: [ICCV19] AdaptIS: Adaptive Instance Selection Network


    def get_instances_maps(self, F, points, adaptive_input, controller_input):
        """Run the point-conditioned branch and return its output feature map.

        Args:
            F: MXNet API namespace passed through from ``hybrid_forward``.
            points: Query points; reshaped below to ``(-1, 2)``. When it is
                an ``NDArray`` its second dimension is cached as
                ``self.num_points``.
            adaptive_input: Features that are repeated ``self.num_points``
                times along axis 0 before coordinate channels are added.
            controller_input: Features passed to ``self.eqf`` together with
                ``points`` to produce the AdaIN conditioning input ``w``.

        Returns:
            The output of ``self.block1`` applied after ``self.block0`` and
            ``self.adain``.
        """
        # Only a concrete NDArray exposes a shape; otherwise the value cached
        # by an earlier call is reused.
        if isinstance(points, mx.nd.NDArray):
            self.num_points = points.shape[1]

        # NOTE(review): `w` is assigned only inside this branch. When
        # `controller_net.return_map` is missing or falsy, the `self.adain(x, w)`
        # call below raises UnboundLocalError. An `else:` branch appears to have
        # been lost from this copy of the code -- confirm against upstream.
        if getattr(self.controller_net, 'return_map', False):
            w = self.eqf(controller_input, points)
            w = self.controller_net(w)

        # One (2-element) point per row, matching the per-point repetition of
        # `adaptive_input` along axis 0.
        points = F.reshape(points, shape=(-1, 2))
        x = F.repeat(adaptive_input, self.num_points, axis=0)
        x = self.add_coord_features(x, points)

        x = self.block0(x)
        x = self.adain(x, w)
        x = self.block1(x)

        return x
class AppendCoordFeatures(gluon.HybridBlock):
    """Prepend point-relative coordinate channels to a feature map.

    For every sample, two channels hold the row and column offset of each
    spatial location from a query point, divided by
    ``norm_radius * spatial_scale`` and clipped to ``[-1, 1]``. When
    ``append_dist`` is true, a third channel holds the Euclidean norm of the
    two normalized offsets (also clipped to ``[-1, 1]``).

    Args:
        norm_radius: Offset, in units of the unscaled point coordinates, that
            maps to a normalized magnitude of 1.
        append_dist: Whether to add the distance channel.
        spatial_scale: Factor applied to the query points before subtracting
            them from the grid coordinates.
    """

    def __init__(self, norm_radius, append_dist=True, spatial_scale=1.0):
        super(AppendCoordFeatures, self).__init__()
        # Shape of the most recent NDArray input; reused by hybrid_forward
        # when `x` is not an NDArray and therefore exposes no shape.
        self.xs = None
        self.spatial_scale = spatial_scale
        self.norm_radius = norm_radius
        self.append_dist = append_dist

    def _ctx_kwarg(self, x):
        """Return ``{"ctx": x.context}`` for an NDArray, else an empty dict."""
        if isinstance(x, mx.nd.NDArray):
            return {"ctx": x.context}
        return {}

    def get_coord_features(self, F, points, rows, cols, batch_size, **ctx_kwarg):
        """Build the clipped, normalized coordinate (and distance) channels.

        Args:
            F: MXNet API namespace (``mx.nd`` or ``mx.sym``).
            points: Query points, one 2-element ``(row, col)`` entry per
                sample; the first value is subtracted from the row grid and
                the second from the column grid.
            rows: Height of the coordinate grid.
            cols: Width of the coordinate grid.
            batch_size: Number of times the grid is repeated along axis 0.
            **ctx_kwarg: Extra keyword arguments forwarded to ``F.arange``
                (used to pass ``ctx``).

        Returns:
            An array with 3 channels when ``self.append_dist`` is true and
            2 channels otherwise, with all values clipped to ``[-1, 1]``.
        """
        row_array = F.arange(start=0, stop=rows, step=1, **ctx_kwarg)
        col_array = F.arange(start=0, stop=cols, step=1, **ctx_kwarg)
        # Expand the 1-D index vectors into full (1, 1, rows, cols) grids.
        coord_rows = F.repeat(F.reshape(row_array, (1, 1, rows, 1)), repeats=cols, axis=3)
        coord_cols = F.repeat(F.reshape(col_array, (1, 1, 1, cols)), repeats=rows, axis=2)

        coord_rows = F.repeat(coord_rows, repeats=batch_size, axis=0)
        coord_cols = F.repeat(coord_cols, repeats=batch_size, axis=0)

        coords = F.concat(coord_rows, coord_cols, dim=1)

        # Broadcast each scaled point over the whole grid so it can be
        # subtracted element-wise from `coords`.
        add_xy = F.reshape(points * self.spatial_scale, shape=(0, 0, 1))
        add_xy = F.reshape(F.repeat(add_xy, rows * cols, axis=2),
                           shape=(0, 0, rows, cols))

        coords = (coords - add_xy) / (self.norm_radius * self.spatial_scale)
        if self.append_dist:
            dist = F.sqrt(F.sum(F.square(coords), axis=1, keepdims=True))
            coord_features = F.concat(coords, dist, dim=1)
        else:
            # Without this branch the distance channel was always discarded
            # and `append_dist` had no effect.
            coord_features = coords

        coord_features = F.clip(coord_features, a_min=-1, a_max=1)
        return coord_features

    def hybrid_forward(self, F, x, coords):
        """Concatenate the coordinate channels in front of ``x`` (dim 1).

        Args:
            F: MXNet API namespace.
            x: Feature map; its dims 0, 2 and 3 are read as batch size, rows
                and columns.
            coords: Query points forwarded to ``get_coord_features``.
        """
        # Remember the concrete shape so a later call without an NDArray
        # input can still size the coordinate grid.
        if isinstance(x, mx.nd.NDArray):
            self.xs = x.shape

        batch_size, rows, cols = self.xs[0], self.xs[2], self.xs[3]
        coord_features = self.get_coord_features(F, coords, rows, cols, batch_size, **self._ctx_kwarg(x))

        return F.concat(coord_features, x, dim=1)


    def get_coord_features(self, F, points, rows, cols, batch_size, **ctx_kwarg):
        """Build the clipped, normalized coordinate (and distance) channels.

        Annotated walkthrough; the values in the comments come from a debugger
        session with ``points = [[61. 71.]]`` (NDArray 1x2), ``rows = 96``,
        ``cols = 96``, ``batch_size = 1`` and ``self.norm_radius = 42``.

        Args:
            F: MXNet API namespace (``mx.nd`` or ``mx.sym``).
            points: Query points, one 2-element entry per sample; the first
                value is subtracted from the row grid, the second from the
                column grid.
            rows: Height of the coordinate grid.
            cols: Width of the coordinate grid.
            batch_size: Number of times the grid is repeated along axis 0.
            **ctx_kwarg: Extra keyword arguments forwarded to ``F.arange``.

        Returns:
            An array with 3 channels when ``self.append_dist`` is true and
            2 channels otherwise, with all values clipped to ``[-1, 1]``.
        """
        # row_array and col_array:
        #   [ 0.  1.  2.  3. ... 93. 94. 95.]   <NDArray 96 @gpu(0)>
        row_array = F.arange(start=0, stop=rows, step=1, **ctx_kwarg)   ## (96,)
        col_array = F.arange(start=0, stop=cols, step=1, **ctx_kwarg)   ## (96,)

        # coord_rows: every row holds its own row index.
        #   [[[[ 0.  0.  0. ...  0.  0.  0.]
        #      [ 1.  1.  1. ...  1.  1.  1.]
        #      [ 2.  2.  2. ...  2.  2.  2.]
        #      ...
        #      [95. 95. 95. ... 95. 95. 95.]]]]   <NDArray 1x1x96x96 @gpu(0)>
        coord_rows = F.repeat(F.reshape(row_array, (1, 1, rows, 1)), repeats=cols, axis=3)
        # coord_cols: every column holds its own column index.
        #   [[[[ 0.  1.  2. ... 93. 94. 95.]
        #      [ 0.  1.  2. ... 93. 94. 95.]
        #      ...
        #      [ 0.  1.  2. ... 93. 94. 95.]]]]   <NDArray 1x1x96x96 @gpu(0)>
        coord_cols = F.repeat(F.reshape(col_array, (1, 1, 1, cols)), repeats=rows, axis=2)

        coord_rows = F.repeat(coord_rows, repeats=batch_size, axis=0)
        coord_cols = F.repeat(coord_cols, repeats=batch_size, axis=0)

        coords = F.concat(coord_rows, coord_cols, dim=1)    ## (1, 2, 96, 96)

        # add_xy: the query point broadcast over the whole grid.
        #   after the first reshape:  [[[61.] [71.]]]   <NDArray 1x2x1 @gpu(0)>
        #   after repeat + reshape:
        #   [[[[61. 61. 61. ... 61. 61. 61.]
        #      ...
        #      [61. 61. 61. ... 61. 61. 61.]]
        #
        #     [[71. 71. 71. ... 71. 71. 71.]
        #      ...
        #      [71. 71. 71. ... 71. 71. 71.]]]]   <NDArray 1x2x96x96 @gpu(0)>
        add_xy = F.reshape(points * self.spatial_scale, shape=(0, 0, 1))
        add_xy = F.reshape(F.repeat(add_xy, rows * cols, axis=2), shape=(0, 0, rows, cols))

        ## self.norm_radius: 42
        coords = (coords - add_xy) / (self.norm_radius * self.spatial_scale)    ## <NDArray 1x2x96x96 @gpu(0)>
        if self.append_dist:
            dist = F.sqrt(F.sum(F.square(coords), axis=1, keepdims=True))  ## <NDArray 1x1x96x96 @gpu(0)>
            coord_features = F.concat(coords, dist, dim=1)
        else:
            # Without this branch the distance channel was always discarded,
            # contradicting the 3-channel result recorded below.
            coord_features = coords

        # With self.append_dist, coord_features after clipping is:
        #   channel 0 (row offsets):
        #     [[-1.         -1.         -1.         ... -1.         -1.         -1.        ]
        #      ...
        #      [ 0.7619048   0.7619048   0.7619048  ...  0.7619048   0.7619048   0.7619048 ]
        #      [ 0.78571427  0.78571427  0.78571427 ...  0.78571427  0.78571427  0.78571427]
        #      [ 0.8095238   0.8095238   0.8095238  ...  0.8095238   0.8095238   0.8095238 ]]
        #   channel 1 (column offsets):
        #     [[-1.         -1.         -1.         ...  0.52380955  0.54761904  0.5714286 ]
        #      ...
        #      [-1.         -1.         -1.         ...  0.52380955  0.54761904  0.5714286 ]]
        #   channel 2 (distance):
        #     [[ 1.          1.          1.         ...  1.          1.          1.        ]
        #      ...
        #      [ 1.          1.          1.         ...  0.9245947   0.9382886   0.95238096]
        #      [ 1.          1.          1.         ...  0.944311    0.9577231   0.9715336 ]
        #      [ 1.          1.          1.         ...  0.96421224  0.97735125  0.99088824]]
        #   <NDArray 1x3x96x96 @gpu(0)>
        coord_features = F.clip(coord_features, a_min=-1, a_max=1)
        return coord_features




I also wrote a PyTorch version based on the MXNet version:

class AddCoords(nn.Module):
    """Build point-relative coordinate channels for a single feature map.

    PyTorch counterpart of the MXNet ``get_coord_features`` logic: a row-index
    grid and a column-index grid are offset by a query point, divided by
    ``norm_radius`` and clipped to ``[-1, 1]``.

    Args:
        norm_radius: Offset (in pixels) that maps to a normalized magnitude
            of 1. The default of ``1.0`` leaves the raw pixel offsets
            unscaled before clipping.
        append_dist: When true, append a third channel holding the Euclidean
            norm of the two normalized offsets (clipped like the others).
    """

    def __init__(self, norm_radius=1.0, append_dist=False):
        super().__init__()
        self.norm_radius = norm_radius
        self.append_dist = append_dist

    def forward(self, input_tensor, points):
        """Return the coordinate feature tensor for ``input_tensor``.

        Args:
            input_tensor: 3-D tensor; only its last two sizes ``(x_dim,
                y_dim)`` are used, as the grid height and width.
            points: Two values ``(row, col)`` of the query point, as a tensor
                or any sequence accepted by ``torch.as_tensor``.

        Returns:
            Float32 tensor of shape ``(1, 2, x_dim, y_dim)``, or
            ``(1, 3, x_dim, y_dim)`` when ``append_dist`` is set, with values
            in ``[-1, 1]``. It lives on the CUDA device when one is available
            and on the CPU otherwise.

        Raises:
            ValueError: If ``input_tensor`` is not 3-D (unpacking fails).
        """
        _, x_dim, y_dim = input_tensor.size()

        # Keep returning a CUDA tensor where possible, but do not crash on
        # CPU-only hosts.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # xx_channel[0, 0, i, j] == i and yy_channel[0, 0, i, j] == j.
        row_index = torch.arange(x_dim, dtype=torch.float32, device=device)
        col_index = torch.arange(y_dim, dtype=torch.float32, device=device)
        xx_channel = row_index.view(1, 1, x_dim, 1).expand(1, 1, x_dim, y_dim)
        yy_channel = col_index.view(1, 1, 1, y_dim).expand(1, 1, x_dim, y_dim)

        coords = torch.cat((xx_channel, yy_channel), dim=1)     ## (1, 2, x_dim, y_dim)

        # Shape (1, 2, 1, 1) broadcasts the point over the whole grid.
        add_xy = torch.as_tensor(points, dtype=torch.float32, device=device)
        add_xy = add_xy.reshape(1, 2, 1, 1)

        coords = (coords - add_xy) / self.norm_radius
        if self.append_dist:
            dist = torch.sqrt(torch.sum(coords ** 2, dim=1, keepdim=True))
            coord_features = torch.cat((coords, dist), dim=1)
        else:
            coord_features = coords

        return torch.clamp(coord_features, min=-1, max=1)