resnet_utils.py

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
  15. """Contains building blocks for various versions of Residual Networks.
  16. Residual networks (ResNets) were proposed in:
  17. Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  18. Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015
  19. More variants were introduced in:
  20. Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  21. Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016
  22. We can obtain different ResNet variants by changing the network depth, width,
  23. and form of residual unit. This module implements the infrastructure for
  24. building them. Concrete ResNet units and full ResNet networks are implemented in
  25. the accompanying resnet_v1.py and resnet_v2.py modules.
  26. Compared to https://github.com/KaimingHe/deep-residual-networks, in the current
  27. implementation we subsample the output activations in the last residual unit of
  28. each block, instead of subsampling the input activations in the first residual
  29. unit of each block. The two implementations give identical results but our
  30. implementation is more memory efficient.
  31. """

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import tensorflow as tf

slim = tf.contrib.slim


class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])):
  """A named tuple describing a ResNet block.

  Its parts are:
    scope: The scope of the `Block`.
    unit_fn: The ResNet unit function which takes as input a `Tensor` and
      returns another `Tensor` with the output of the ResNet unit.
    args: A list of length equal to the number of units in the `Block`. The
      list contains one dictionary of arguments (e.g. with `depth`,
      `depth_bottleneck` and `stride` keys) for each unit in the block, to be
      passed as keyword arguments to unit_fn.
  """


def subsample(inputs, factor, scope=None):
  """Subsamples the input along the spatial dimensions.

  Args:
    inputs: A `Tensor` of size [batch, height_in, width_in, channels].
    factor: The subsampling factor.
    scope: Optional variable_scope.

  Returns:
    output: A `Tensor` of size [batch, height_out, width_out, channels] with
      the input, either intact (if factor == 1) or subsampled (if factor > 1).
  """
  if factor == 1:
    return inputs
  else:
    return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)
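

# A small illustrative sketch (not part of the original file): subsampling by
# a factor of 2 with the 1x1 max pool above simply keeps every other
# activation, so a [1, 4, 4, 1] input becomes a [1, 2, 2, 1] output.
def _example_subsample():
  inputs = tf.ones([1, 4, 4, 1])
  return subsample(inputs, factor=2)  # Static shape: [1, 2, 2, 1].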


def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
  """Strided 2-D convolution with 'SAME' padding.

  When stride > 1, then we do explicit zero-padding, followed by conv2d with
  'VALID' padding.

  Note that

     net = conv2d_same(inputs, num_outputs, 3, stride=stride)

  is equivalent to

     net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
     net = subsample(net, factor=stride)

  whereas

     net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')

  is different when the input's height or width is even, which is why we add
  the current function. For more details, see
  ResnetUtilsTest.testConv2DSameEven().

  Args:
    inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
    num_outputs: An integer, the number of output filters.
    kernel_size: An int with the kernel_size of the filters.
    stride: An integer, the output stride.
    rate: An integer, rate for atrous convolution.
    scope: Scope.

  Returns:
    output: A 4-D tensor of size [batch, height_out, width_out, channels] with
      the convolution output.
  """
  if stride == 1:
    return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate,
                       padding='SAME', scope=scope)
  else:
    kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
    pad_total = kernel_size_effective - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    inputs = tf.pad(inputs,
                    [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
    return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride,
                       rate=rate, padding='VALID', scope=scope)
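

# A worked sketch of the padding arithmetic above (not part of the original
# file): with kernel_size=3 and rate=1 the effective kernel size is 3, so
# pad_total=2 and one row/column of zeros is added on each side. A 224x224
# input padded to 226x226 and convolved with 'VALID' padding at stride=2 then
# yields a 112x112 output, matching the 'SAME'-then-subsample recipe from the
# docstring.
def _example_conv2d_same():
  images = tf.ones([1, 224, 224, 3])
  return conv2d_same(images, num_outputs=64, kernel_size=3, stride=2)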


@slim.add_arg_scope
def stack_blocks_dense(net, blocks, output_stride=None,
                       outputs_collections=None):
  """Stacks ResNet `Blocks` and controls output feature density.

  First, this function creates scopes for the ResNet in the form of
  'block_name/unit_1', 'block_name/unit_2', etc.

  Second, this function allows the user to explicitly control the ResNet
  output_stride, which is the ratio of the input to output spatial resolution.
  This is useful for dense prediction tasks such as semantic segmentation or
  object detection.

  Most ResNets consist of 4 ResNet blocks and subsample the activations by a
  factor of 2 when transitioning between consecutive ResNet blocks. This
  results in a nominal ResNet output_stride equal to 8. If we set the
  output_stride to half the nominal network stride (e.g., output_stride=4),
  then we compute responses twice.

  Control of the output feature density is implemented by atrous convolution.

  Args:
    net: A `Tensor` of size [batch, height, width, channels].
    blocks: A list of length equal to the number of ResNet `Blocks`. Each
      element is a ResNet `Block` object describing the units in the `Block`.
    output_stride: If `None`, then the output will be computed at the nominal
      network stride. If output_stride is not `None`, it specifies the
      requested ratio of input to output spatial resolution, which needs to be
      equal to the product of unit strides from the start up to some level of
      the ResNet. For example, if the ResNet employs units with strides
      1, 2, 1, 3, 4, 1, then valid values for the output_stride are
      1, 2, 6, 24 or None (which is equivalent to output_stride=24).
    outputs_collections: Collection to add the ResNet block outputs.

  Returns:
    net: Output tensor with stride equal to the specified output_stride.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  # The current_stride variable keeps track of the effective stride of the
  # activations. This allows us to invoke atrous convolution whenever applying
  # the next residual unit would result in the activations having stride
  # larger than the target output_stride.
  current_stride = 1

  # The atrous convolution rate parameter.
  rate = 1

  for block in blocks:
    with tf.variable_scope(block.scope, 'block', [net]) as sc:
      for i, unit in enumerate(block.args):
        if output_stride is not None and current_stride > output_stride:
          raise ValueError('The target output_stride cannot be reached.')

        with tf.variable_scope('unit_%d' % (i + 1), values=[net]):
          # If we have reached the target output_stride, then we need to
          # employ atrous convolution with stride=1 and multiply the atrous
          # rate by the current unit's stride for use in subsequent layers.
          if output_stride is not None and current_stride == output_stride:
            net = block.unit_fn(net, rate=rate, **dict(unit, stride=1))
            rate *= unit.get('stride', 1)
          else:
            net = block.unit_fn(net, rate=1, **unit)
            current_stride *= unit.get('stride', 1)
      net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net)

  if output_stride is not None and current_stride != output_stride:
    raise ValueError('The target output_stride cannot be reached.')

  return net
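

# A minimal usage sketch (not part of the original file); `bottleneck` again
# stands in for a concrete unit function from resnet_v1.py or resnet_v2.py.
# The nominal output_stride of the two stride-2 blocks below is 4. Requesting
# output_stride=2 turns the second block's stride-2 unit into a stride-1 unit
# and doubles the atrous rate for any subsequent units, so spatial resolution
# is preserved from that point on.
def _example_stack_blocks(net, bottleneck):
  blocks = [
      Block('block1', bottleneck,
            [{'depth': 256, 'depth_bottleneck': 64, 'stride': 2}]),
      Block('block2', bottleneck,
            [{'depth': 512, 'depth_bottleneck': 128, 'stride': 2}]),
  ]
  return stack_blocks_dense(net, blocks, output_stride=2)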


def resnet_arg_scope(weight_decay=0.0001,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True,
                     activation_fn=tf.nn.relu,
                     use_batch_norm=True):
  """Defines the default ResNet arg scope.

  TODO(gpapan): The batch-normalization related default values above are
    appropriate for use in conjunction with the reference ResNet models
    released at https://github.com/KaimingHe/deep-residual-networks. When
    training ResNets from scratch, they might need to be tuned.

  Args:
    weight_decay: The weight decay to use for regularizing the model.
    batch_norm_decay: The moving average decay when estimating layer
      activation statistics in batch normalization.
    batch_norm_epsilon: Small constant to prevent division by zero when
      normalizing activations by their variance in batch normalization.
    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale
      the activations in the batch normalization layer.
    activation_fn: The activation function which is used in ResNet.
    use_batch_norm: Whether or not to use batch normalization.

  Returns:
    An `arg_scope` to use for the resnet models.
  """
  batch_norm_params = {
      'decay': batch_norm_decay,
      'epsilon': batch_norm_epsilon,
      'scale': batch_norm_scale,
      'updates_collections': tf.GraphKeys.UPDATE_OPS,
      'fused': None,  # Use fused batch norm if possible.
  }

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=slim.variance_scaling_initializer(),
      activation_fn=activation_fn,
      normalizer_fn=slim.batch_norm if use_batch_norm else None,
      normalizer_params=batch_norm_params):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      # The following implies padding='SAME' for pool1, which makes feature
      # alignment easier for dense prediction tasks. This is also used in
      # https://github.com/facebook/fb.resnet.torch. However the accompanying
      # code of 'Deep Residual Learning for Image Recognition' uses
      # padding='VALID' for pool1. You can switch to that choice by setting
      # slim.arg_scope([slim.max_pool2d], padding='VALID').
      with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc:
        return arg_sc
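

# A minimal usage sketch (not part of the original file): wrapping model
# construction in the arg scope so that every slim.conv2d picks up the weight
# regularizer, initializer, activation and batch-norm settings defined above.
def _example_resnet_arg_scope(images):
  with slim.arg_scope(resnet_arg_scope(weight_decay=1e-4)):
    return conv2d_same(images, num_outputs=64, kernel_size=7, stride=2,
                       scope='conv1')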