MULTI-SCALE CONTEXT AGGREGATION BY DILATED CONVOLUTIONS
Dilated convolutions are used in place of pooling layers. For the principle behind dilated convolution, see:
http://blog.****.net/u011961856/article/details/77141761
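In brief, a dilated convolution with dilation d reads its input with a gap of d−1 samples between kernel taps, so stacked layers grow the receptive field quickly while nothing is pooled away. A minimal 1-D numpy sketch of the idea (dilated_conv1d is a name made up here for illustration, not from the paper's code):

import numpy as np

def dilated_conv1d(x, w, dilation=1):
    # 'valid' 1-D convolution with taps spaced `dilation` apart:
    # y[i] = sum_k w[k] * x[i + k * dilation]
    k = len(w)
    span = (k - 1) * dilation + 1          # receptive field of this single layer
    return np.array([np.dot(w, x[i:i + span:dilation])
                     for i in range(len(x) - span + 1)])

x = np.arange(16, dtype=float)
w = np.ones(3)
y1 = dilated_conv1d(x, w, dilation=1)   # each output sees 3 inputs
y2 = dilated_conv1d(y1, w, dilation=2)  # each output now sees 7 of the original inputs
print(len(x), len(y1), len(y2))         # 16 14 10: only border shrinkage, no downsampling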
The front-end module:
The input is a padded three-channel color image, and the output is a feature map with 21 output channels. VGG-16 is adapted for dense prediction: its last two pooling and striding layers are removed, the convolutional layers that follow the first removed pooling layer use dilated convolutions with dilation 2, and those that follow the second removed pooling layer use dilation 4.
The model code is as follows:
from caffe import layers as L, params as P

def build_frontend_vgg(net, bottom, num_classes):
    prev_layer = bottom
    # Number of 3x3 convolutions in each of the five VGG-16 stages.
    num_convolutions = [2, 2, 3, 3, 3]
    # Dilation per stage: 0 means the stage keeps its 2x2 max-pooling;
    # a non-zero value means pooling is dropped and the convolutions are
    # dilated instead (stage 5: dilation 2, fc6: dilation 4).
    dilations = [0, 0, 0, 0, 2, 4]
    for l in range(5):
        num_outputs = min(64 * 2 ** l, 512)
        for i in range(0, num_convolutions[l]):
            conv_name = 'conv{0}_{1}'.format(l + 1, i + 1)
            relu_name = 'relu{0}_{1}'.format(l + 1, i + 1)
            if dilations[l] == 0:
                setattr(net, conv_name,
                        L.Convolution(
                            prev_layer,
                            param=[dict(lr_mult=1, decay_mult=1),
                                   dict(lr_mult=2, decay_mult=0)],
                            convolution_param=dict(num_output=num_outputs,
                                                   kernel_size=3)))
            else:
                setattr(net, conv_name,
                        L.Convolution(
                            prev_layer,
                            param=[dict(lr_mult=1, decay_mult=1),
                                   dict(lr_mult=2, decay_mult=0)],
                            convolution_param=dict(num_output=num_outputs,
                                                   kernel_size=3,
                                                   dilation=dilations[l])))
            setattr(net, relu_name,
                    L.ReLU(getattr(net, conv_name), in_place=True))
            prev_layer = getattr(net, relu_name)
        if dilations[l + 1] == 0:
            # Only the first three stages are still followed by pooling.
            pool_name = 'pool{0}'.format(l + 1)
            setattr(net, pool_name, L.Pooling(
                prev_layer, pool=P.Pooling.MAX, kernel_size=2, stride=2))
            prev_layer = getattr(net, pool_name)
    # fc6/fc7 of VGG-16 converted to convolutions; fc6 uses dilation 4.
    net.fc6 = L.Convolution(
        prev_layer,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)],
        convolution_param=dict(num_output=4096, kernel_size=7,
                               dilation=dilations[5]))
    net.relu6 = L.ReLU(net.fc6, in_place=True)
    net.drop6 = L.Dropout(net.relu6, in_place=True, dropout_ratio=0.5)
    net.fc7 = L.Convolution(
        net.drop6,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)],
        convolution_param=dict(num_output=4096, kernel_size=1))
    net.relu7 = L.ReLU(net.fc7, in_place=True)
    net.drop7 = L.Dropout(net.relu7, in_place=True, dropout_ratio=0.5)
    # Final 1x1 convolution produces the num_classes (21) output channels.
    net.final = L.Convolution(
        net.drop7,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)],
        convolution_param=dict(
            num_output=num_classes, kernel_size=1,
            weight_filler=dict(type='gaussian', std=0.001),
            bias_filler=dict(type='constant', value=0)))
    return net.final, 'final'
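A short usage sketch for the builder above, assuming pycaffe is installed; the 1×3×900×900 input shape is only an example of a padded image (choose it to match your padding), and the output file name is arbitrary:

import caffe
from caffe import layers as L

net = caffe.NetSpec()
# Padded three-channel input image (example size).
net.data = L.Input(shape=dict(dim=[1, 3, 900, 900]))
top, top_name = build_frontend_vgg(net, net.data, num_classes=21)
with open('frontend_vgg.prototxt', 'w') as f:
    f.write(str(net.to_proto()))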
Comparison of results:
The context network
The structure of the context network is shown in the following table (Table 1 of the paper):

Layer                     1     2     3     4      5      6      7      8
Convolution             3×3   3×3   3×3   3×3    3×3    3×3    3×3    1×1
Dilation                  1     1     2     4      8     16      1      1
Truncation              Yes   Yes   Yes   Yes    Yes    Yes    Yes     No
Receptive field         3×3   5×5   9×9  17×17  33×33  65×65  67×67  67×67
Output channels (Basic)   C     C     C     C      C      C      C      C
Output channels (Large)  2C    2C    4C    8C    16C    32C    32C     C

The pointwise truncation function is max(·, 0). The first seven layers are 3×3 convolutions with dilations 1, 1, 2, 4, 8, 16, and 1, respectively; the last layer is a 1×1 convolution with dilation 1. C denotes the number of output channels. The table covers two variants, the basic context network and the large context network: their structures are identical, and they differ only in the number of output channels per layer, as given in the Basic and Large rows.
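The receptive-field row of the table can be verified with a couple of lines of arithmetic: a k×k convolution with dilation d enlarges the receptive field by (k−1)·d in each dimension. A quick check in Python:

# (kernel size, dilation) for the eight layers of the context network
layers = [(3, 1), (3, 1), (3, 2), (3, 4), (3, 8), (3, 16), (3, 1), (1, 1)]
rf = 1
for k, d in layers:
    rf += (k - 1) * d
    print(rf)  # 3, 5, 9, 17, 33, 65, 67, 67 -- matches the table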
For parameter initialization, convolutional networks are usually initialized randomly (random initialization). The paper instead proposes a more effective initialization for the context network, identity initialization, which lets each layer simply pass its input through to the next layer at the start of training:

$$k^b(t, a) = 1_{[t=0]} \, 1_{[a=b]},$$

where $a$ is the index of the input feature map and $b$ the index of the output feature map. For two consecutive layers whose numbers of feature maps, $c_i$ and $c_{i+1}$, differ (as in the large network), the initialization generalizes to

$$k^b(t, a) = \begin{cases} \dfrac{C}{c_{i+1}}, & t = 0 \ \text{and}\ \left\lfloor \dfrac{aC}{c_i} \right\rfloor = \left\lfloor \dfrac{bC}{c_{i+1}} \right\rfloor, \\ \varepsilon, & \text{otherwise}, \end{cases}$$

where $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ with $\sigma \ll C / c_{i+1}$; the random noise breaks symmetry among feature maps that share a common predecessor.
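To make the basic case concrete (where $c_i = c_{i+1} = C$ and $\varepsilon = 0$), here is a small numpy sketch; identity_init is a name invented here for illustration, not a function from the authors' code:

import numpy as np

def identity_init(C, k=3):
    # Caffe weight layout: (output map b, input map a, t_y, t_x).
    # k^b(t, a) = 1 iff t = 0 (the kernel center) and a = b, else 0.
    w = np.zeros((C, C, k, k))
    for b in range(C):
        w[b, b, k // 2, k // 2] = 1.0
    return w

# With 'same' padding, a convolution initialized this way passes each
# feature map through unchanged, so the freshly inserted context layers
# start out as the identity mapping.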
The context network code is as follows:
def build_context(net, bottom, num_classes, layers=8):
    # Basic context network: every layer has num_classes output channels
    # (multiplier = 1). The 'identity' weight filler implements the
    # identity initialization described above; it comes from the authors'
    # Caffe branch rather than stock Caffe.
    prev_layer = bottom
    multiplier = 1
    # Layers 1-2: 3x3 convolutions with dilation 1.
    for i in range(1, 3):
        conv_name = 'ctx_conv1_{}'.format(i)
        relu_name = 'ctx_relu1_{}'.format(i)
        setattr(net, conv_name,
                L.Convolution(
                    # bottom may be None when the context module is built
                    # without an explicit input layer.
                    *([] if prev_layer is None else [prev_layer]),
                    param=[dict(lr_mult=1, decay_mult=1),
                           dict(lr_mult=2, decay_mult=0)],
                    convolution_param=dict(
                        num_output=num_classes * multiplier, kernel_size=3,
                        pad=1,
                        weight_filler=dict(type='identity',
                                           num_groups=num_classes, std=0.01),
                        bias_filler=dict(type='constant', value=0))))
        setattr(net, relu_name,
                L.ReLU(getattr(net, conv_name), in_place=True))
        prev_layer = getattr(net, relu_name)
    # Layers 3-6: 3x3 convolutions with dilations 2, 4, 8, 16; the padding
    # grows with the dilation so the resolution is preserved.
    for i in range(2, layers - 2):
        dilation = 2 ** (i - 1)
        multiplier = 1
        conv_name = 'ctx_conv{}_1'.format(i)
        relu_name = 'ctx_relu{}_1'.format(i)
        setattr(net, conv_name,
                L.Convolution(
                    prev_layer,
                    param=[dict(lr_mult=1, decay_mult=1),
                           dict(lr_mult=2, decay_mult=0)],
                    convolution_param=dict(
                        num_output=num_classes * multiplier, kernel_size=3,
                        dilation=dilation, pad=dilation,
                        weight_filler=dict(type='identity',
                                           num_groups=num_classes,
                                           std=0.01 / multiplier),
                        bias_filler=dict(type='constant', value=0))))
        setattr(net, relu_name,
                L.ReLU(getattr(net, conv_name), in_place=True))
        prev_layer = getattr(net, relu_name)
    # Layer 7: 3x3 convolution with dilation 1.
    net.ctx_fc1 = L.Convolution(
        prev_layer,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)],
        convolution_param=dict(
            num_output=num_classes * multiplier, kernel_size=3, pad=1,
            weight_filler=dict(type='identity',
                               num_groups=num_classes,
                               std=0.01 / multiplier),
            bias_filler=dict(type='constant', value=0)))
    net.ctx_fc1_relu = L.ReLU(net.ctx_fc1, in_place=True)
    # Layer 8: final 1x1 convolution back to num_classes channels.
    net.ctx_final = L.Convolution(
        net.ctx_fc1_relu,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)],
        convolution_param=dict(
            num_output=num_classes, kernel_size=1,
            weight_filler=dict(type='identity',
                               num_groups=num_classes,
                               std=0.01 / multiplier),
            bias_filler=dict(type='constant', value=0)))
    return net.ctx_final, 'ctx_final'
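Finally, a sketch of how the two builders can be chained into the joint front-end + context model (the authors' repository assembles its nets along these lines, though the details there may differ; the input shape is again an example):

import caffe
from caffe import layers as L

net = caffe.NetSpec()
net.data = L.Input(shape=dict(dim=[1, 3, 900, 900]))
frontend_top, _ = build_frontend_vgg(net, net.data, num_classes=21)
context_top, _ = build_context(net, frontend_top, num_classes=21, layers=8)
# context_top names the 'ctx_final' blob; a softmax (or a CRF-RNN stage,
# as in the comparison below) can be attached on top of it.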
The authors compare segmentation results for the front-end network alone, with the context network added on top, and with CRF-RNN added as well: