vgg.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """Contains model definitions for versions of the Oxford VGG network.
  16. These model definitions were introduced in the following technical report:
  17. Very Deep Convolutional Networks For Large-Scale Image Recognition
  18. Karen Simonyan and Andrew Zisserman
  19. arXiv technical report, 2015
  20. PDF: http://arxiv.org/pdf/1409.1556.pdf
  21. ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
  22. CC-BY-4.0
  23. More information can be obtained from the VGG website:
  24. www.robots.ox.ac.uk/~vgg/research/very_deep/
  25. Usage:
  26. with slim.arg_scope(vgg.vgg_arg_scope()):
  27. outputs, end_points = vgg.vgg_a(inputs)
  28. with slim.arg_scope(vgg.vgg_arg_scope()):
  29. outputs, end_points = vgg.vgg_16(inputs)
  30. @@vgg_a
  31. @@vgg_16
  32. @@vgg_19
  33. """
  34. from __future__ import absolute_import
  35. from __future__ import division
  36. from __future__ import print_function
  37. import tensorflow as tf
  38. slim = tf.contrib.slim
  39. def vgg_arg_scope(weight_decay=0.0005):
  40. """Defines the VGG arg scope.
  41. Args:
  42. weight_decay: The l2 regularization coefficient.
  43. Returns:
  44. An arg_scope.
  45. """
  46. with slim.arg_scope([slim.conv2d, slim.fully_connected],
  47. activation_fn=tf.nn.relu,
  48. weights_regularizer=slim.l2_regularizer(weight_decay),
  49. biases_initializer=tf.zeros_initializer()):
  50. with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
  51. return arg_sc
  52. def vgg_a(inputs,
  53. num_classes=1000,
  54. is_training=True,
  55. dropout_keep_prob=0.5,
  56. spatial_squeeze=True,
  57. scope='vgg_a',
  58. fc_conv_padding='VALID',
  59. global_pool=False):
  60. """Oxford Net VGG 11-Layers version A Example.
  61. Note: All the fully_connected layers have been transformed to conv2d layers.
  62. To use in classification mode, resize input to 224x224.
  63. Args:
  64. inputs: a tensor of size [batch_size, height, width, channels].
  65. num_classes: number of predicted classes. If 0 or None, the logits layer is
  66. omitted and the input features to the logits layer are returned instead.
  67. is_training: whether or not the model is being trained.
  68. dropout_keep_prob: the probability that activations are kept in the dropout
  69. layers during training.
  70. spatial_squeeze: whether or not should squeeze the spatial dimensions of the
  71. outputs. Useful to remove unnecessary dimensions for classification.
  72. scope: Optional scope for the variables.
  73. fc_conv_padding: the type of padding to use for the fully connected layer
  74. that is implemented as a convolutional layer. Use 'SAME' padding if you
  75. are applying the network in a fully convolutional manner and want to
  76. get a prediction map downsampled by a factor of 32 as an output.
  77. Otherwise, the output prediction map will be (input / 32) - 6 in case of
  78. 'VALID' padding.
  79. global_pool: Optional boolean flag. If True, the input to the classification
  80. layer is avgpooled to size 1x1, for any input size. (This is not part
  81. of the original VGG architecture.)
  82. Returns:
  83. net: the output of the logits layer (if num_classes is a non-zero integer),
  84. or the input to the logits layer (if num_classes is 0 or None).
  85. end_points: a dict of tensors with intermediate activations.
  86. """
  87. with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc:
  88. end_points_collection = sc.original_name_scope + '_end_points'
  89. # Collect outputs for conv2d, fully_connected and max_pool2d.
  90. with slim.arg_scope([slim.conv2d, slim.max_pool2d],
  91. outputs_collections=end_points_collection):
  92. net = slim.repeat(inputs, 1, slim.conv2d, 64, [3, 3], scope='conv1')
  93. net = slim.max_pool2d(net, [2, 2], scope='pool1')
  94. net = slim.repeat(net, 1, slim.conv2d, 128, [3, 3], scope='conv2')
  95. net = slim.max_pool2d(net, [2, 2], scope='pool2')
  96. net = slim.repeat(net, 2, slim.conv2d, 256, [3, 3], scope='conv3')
  97. net = slim.max_pool2d(net, [2, 2], scope='pool3')
  98. net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv4')
  99. net = slim.max_pool2d(net, [2, 2], scope='pool4')
  100. net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5')
  101. net = slim.max_pool2d(net, [2, 2], scope='pool5')
  102. # Use conv2d instead of fully_connected layers.
  103. net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
  104. net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
  105. scope='dropout6')
  106. net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
  107. # Convert end_points_collection into a end_point dict.
  108. end_points = slim.utils.convert_collection_to_dict(end_points_collection)
  109. if global_pool:
  110. net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
  111. end_points['global_pool'] = net
  112. if num_classes:
  113. net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
  114. scope='dropout7')
  115. net = slim.conv2d(net, num_classes, [1, 1],
  116. activation_fn=None,
  117. normalizer_fn=None,
  118. scope='fc8')
  119. if spatial_squeeze:
  120. net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
  121. end_points[sc.name + '/fc8'] = net
  122. return net, end_points
  123. vgg_a.default_image_size = 224
  124. def vgg_16(inputs,
  125. num_classes=1000,
  126. is_training=True,
  127. dropout_keep_prob=0.5,
  128. spatial_squeeze=True,
  129. scope='vgg_16',
  130. fc_conv_padding='VALID',
  131. global_pool=False):
  132. """Oxford Net VGG 16-Layers version D Example.
  133. Note: All the fully_connected layers have been transformed to conv2d layers.
  134. To use in classification mode, resize input to 224x224.
  135. Args:
  136. inputs: a tensor of size [batch_size, height, width, channels].
  137. num_classes: number of predicted classes. If 0 or None, the logits layer is
  138. omitted and the input features to the logits layer are returned instead.
  139. is_training: whether or not the model is being trained.
  140. dropout_keep_prob: the probability that activations are kept in the dropout
  141. layers during training.
  142. spatial_squeeze: whether or not should squeeze the spatial dimensions of the
  143. outputs. Useful to remove unnecessary dimensions for classification.
  144. scope: Optional scope for the variables.
  145. fc_conv_padding: the type of padding to use for the fully connected layer
  146. that is implemented as a convolutional layer. Use 'SAME' padding if you
  147. are applying the network in a fully convolutional manner and want to
  148. get a prediction map downsampled by a factor of 32 as an output.
  149. Otherwise, the output prediction map will be (input / 32) - 6 in case of
  150. 'VALID' padding.
  151. global_pool: Optional boolean flag. If True, the input to the classification
  152. layer is avgpooled to size 1x1, for any input size. (This is not part
  153. of the original VGG architecture.)
  154. Returns:
  155. net: the output of the logits layer (if num_classes is a non-zero integer),
  156. or the input to the logits layer (if num_classes is 0 or None).
  157. end_points: a dict of tensors with intermediate activations.
  158. """
  159. with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
  160. end_points_collection = sc.original_name_scope + '_end_points'
  161. # Collect outputs for conv2d, fully_connected and max_pool2d.
  162. with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
  163. outputs_collections=end_points_collection):
  164. net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
  165. net = slim.max_pool2d(net, [2, 2], scope='pool1')
  166. net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
  167. net = slim.max_pool2d(net, [2, 2], scope='pool2')
  168. net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
  169. net = slim.max_pool2d(net, [2, 2], scope='pool3')
  170. net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
  171. net = slim.max_pool2d(net, [2, 2], scope='pool4')
  172. net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
  173. net = slim.max_pool2d(net, [2, 2], scope='pool5')
  174. # Use conv2d instead of fully_connected layers.
  175. net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
  176. net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
  177. scope='dropout6')
  178. net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
  179. # Convert end_points_collection into a end_point dict.
  180. end_points = slim.utils.convert_collection_to_dict(end_points_collection)
  181. if global_pool:
  182. net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
  183. end_points['global_pool'] = net
  184. if num_classes:
  185. net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
  186. scope='dropout7')
  187. net = slim.conv2d(net, num_classes, [1, 1],
  188. activation_fn=None,
  189. normalizer_fn=None,
  190. scope='fc8')
  191. if spatial_squeeze and num_classes is not None:
  192. net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
  193. end_points[sc.name + '/fc8'] = net
  194. return net, end_points
  195. vgg_16.default_image_size = 224
  196. def vgg_19(inputs,
  197. num_classes=1000,
  198. is_training=True,
  199. dropout_keep_prob=0.5,
  200. spatial_squeeze=True,
  201. scope='vgg_19',
  202. fc_conv_padding='VALID',
  203. global_pool=False):
  204. """Oxford Net VGG 19-Layers version E Example.
  205. Note: All the fully_connected layers have been transformed to conv2d layers.
  206. To use in classification mode, resize input to 224x224.
  207. Args:
  208. inputs: a tensor of size [batch_size, height, width, channels].
  209. num_classes: number of predicted classes. If 0 or None, the logits layer is
  210. omitted and the input features to the logits layer are returned instead.
  211. is_training: whether or not the model is being trained.
  212. dropout_keep_prob: the probability that activations are kept in the dropout
  213. layers during training.
  214. spatial_squeeze: whether or not should squeeze the spatial dimensions of the
  215. outputs. Useful to remove unnecessary dimensions for classification.
  216. scope: Optional scope for the variables.
  217. fc_conv_padding: the type of padding to use for the fully connected layer
  218. that is implemented as a convolutional layer. Use 'SAME' padding if you
  219. are applying the network in a fully convolutional manner and want to
  220. get a prediction map downsampled by a factor of 32 as an output.
  221. Otherwise, the output prediction map will be (input / 32) - 6 in case of
  222. 'VALID' padding.
  223. global_pool: Optional boolean flag. If True, the input to the classification
  224. layer is avgpooled to size 1x1, for any input size. (This is not part
  225. of the original VGG architecture.)
  226. Returns:
  227. net: the output of the logits layer (if num_classes is a non-zero integer),
  228. or the non-dropped-out input to the logits layer (if num_classes is 0 or
  229. None).
  230. end_points: a dict of tensors with intermediate activations.
  231. """
  232. with tf.variable_scope(scope, 'vgg_19', [inputs]) as sc:
  233. end_points_collection = sc.original_name_scope + '_end_points'
  234. # Collect outputs for conv2d, fully_connected and max_pool2d.
  235. with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
  236. outputs_collections=end_points_collection):
  237. net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
  238. net = slim.max_pool2d(net, [2, 2], scope='pool1')
  239. net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
  240. net = slim.max_pool2d(net, [2, 2], scope='pool2')
  241. net = slim.repeat(net, 4, slim.conv2d, 256, [3, 3], scope='conv3')
  242. net = slim.max_pool2d(net, [2, 2], scope='pool3')
  243. net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv4')
  244. net = slim.max_pool2d(net, [2, 2], scope='pool4')
  245. net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv5')
  246. net = slim.max_pool2d(net, [2, 2], scope='pool5')
  247. # Use conv2d instead of fully_connected layers.
  248. net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
  249. net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
  250. scope='dropout6')
  251. net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
  252. # Convert end_points_collection into a end_point dict.
  253. end_points = slim.utils.convert_collection_to_dict(end_points_collection)
  254. if global_pool:
  255. net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
  256. end_points['global_pool'] = net
  257. if num_classes:
  258. net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
  259. scope='dropout7')
  260. net = slim.conv2d(net, num_classes, [1, 1],
  261. activation_fn=None,
  262. normalizer_fn=None,
  263. scope='fc8')
  264. if spatial_squeeze:
  265. net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
  266. end_points[sc.name + '/fc8'] = net
  267. return net, end_points
  268. vgg_19.default_image_size = 224
# Aliases: in the original paper's naming, VGG-16 is configuration "D" and
# VGG-19 is configuration "E".
vgg_d = vgg_16
vgg_e = vgg_19