File size: 7,366 Bytes
18ddfe2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/hyperparams.proto";

// Configuration proto for box predictor. See core/box_predictor.py for details.
// The predictor configurations below are members of a single oneof, so at most
// one of them can be set at a time; setting one clears any other.
message BoxPredictor {
  oneof box_predictor_oneof {
    // Configuration for the convolutional box predictor.
    ConvolutionalBoxPredictor convolutional_box_predictor = 1;
    // Configuration for the Mask RCNN box predictor.
    MaskRCNNBoxPredictor mask_rcnn_box_predictor = 2;
    // Configuration for the RFCN box predictor.
    RfcnBoxPredictor rfcn_box_predictor = 3;
    // Configuration for the weight shared convolutional box predictor.
    WeightSharedConvolutionalBoxPredictor
        weight_shared_convolutional_box_predictor = 4;
  }
}

// Configuration proto for Convolutional box predictor.
// Next id: 13
message ConvolutionalBoxPredictor {
  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 1;

  // Minimum feature depth prior to predicting box encodings and class
  // predictions.
  optional int32 min_depth = 2 [default = 0];

  // Maximum feature depth prior to predicting box encodings and class
  // predictions. If max_depth is set to 0, no additional feature map will be
  // inserted before location and class predictions.
  optional int32 max_depth = 3 [default = 0];

  // Number of the additional conv layers before the predictor.
  optional int32 num_layers_before_predictor = 4 [default = 0];

  // Whether to use dropout for class prediction. Enabled by default.
  optional bool use_dropout = 5 [default = true];

  // Keep probability for dropout.
  optional float dropout_keep_probability = 6 [default = 0.8];

  // Size of final convolution kernel. If the spatial resolution of the feature
  // map is smaller than the kernel size, then the kernel size is set to
  // min(feature_width, feature_height).
  optional int32 kernel_size = 7 [default = 1];

  // Size of the encoding for boxes.
  optional int32 box_code_size = 8 [default = 4];

  // Whether to apply sigmoid to the output of class predictions.
  // TODO(jonathanhuang): Do we need this since we have a post processing
  // module?
  optional bool apply_sigmoid_to_scores = 9 [default = false];

  // Initial value for the class prediction bias.
  // NOTE(review): presumably the same semantics as the identically named field
  // in WeightSharedConvolutionalBoxPredictor -- confirm.
  optional float class_prediction_bias_init = 10 [default = 0.0];

  // Whether to use depthwise separable convolution for box predictor layers.
  optional bool use_depthwise = 11 [default = false];

  // Range used to clip box encodings. Neither bound declares a default value.
  message BoxEncodingsClipRange {
    // Lower end of the clipping range.
    optional float min = 1;
    // Upper end of the clipping range.
    optional float max = 2;
  }
  // If specified, apply clipping to box encodings.
  optional BoxEncodingsClipRange box_encodings_clip_range = 12;
}

// Configuration proto for weight shared convolutional box predictor.
// Next id: 19
// NOTE(review): the highest field number declared below is 17, and numbers
// 3, 5, 6, 9, 15 and 18 are not declared. If any of them belonged to fields
// that were removed, they should be listed in a `reserved` statement so they
// cannot be reused -- confirm against the file history.
message WeightSharedConvolutionalBoxPredictor {
  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 1;

  // Number of the additional conv layers before the predictor.
  optional int32 num_layers_before_predictor = 4 [default = 0];

  // Output depth for the convolution ops prior to predicting box encodings
  // and class predictions.
  optional int32 depth = 2 [default = 0];

  // Size of final convolution kernel. If the spatial resolution of the feature
  // map is smaller than the kernel size, then the kernel size is set to
  // min(feature_width, feature_height).
  optional int32 kernel_size = 7 [default = 3];

  // Size of the encoding for boxes.
  optional int32 box_code_size = 8 [default = 4];

  // Bias initialization for class prediction. It has been shown to stabilize
  // training when there is a large number of negative boxes. See
  // https://arxiv.org/abs/1708.02002 for details.
  optional float class_prediction_bias_init = 10 [default = 0.0];

  // Whether to use dropout for class prediction. Disabled by default.
  optional bool use_dropout = 11 [default = false];

  // Keep probability for dropout.
  optional float dropout_keep_probability = 12 [default = 0.8];

  // Whether to share the multi-layer tower between box prediction and class
  // prediction heads.
  optional bool share_prediction_tower = 13 [default = false];

  // Whether to use depthwise separable convolution for box predictor layers.
  optional bool use_depthwise = 14 [default = false];

  // Enum to specify how to convert the detection scores at inference time.
  enum ScoreConverter {
    // Input scores equals output scores.
    IDENTITY = 0;

    // Applies a sigmoid on input scores.
    SIGMOID = 1;
  }

  // Callable elementwise score converter at inference time. Defaults to
  // IDENTITY, i.e. scores are passed through unchanged.
  optional ScoreConverter score_converter = 16 [default = IDENTITY];

  // Range used to clip box encodings. Neither bound declares a default value.
  message BoxEncodingsClipRange {
    // Lower end of the clipping range.
    optional float min = 1;
    // Upper end of the clipping range.
    optional float max = 2;
  }
  // If specified, apply clipping to box encodings.
  optional BoxEncodingsClipRange box_encodings_clip_range = 17;

}


// Configuration proto for Mask RCNN box predictor.
// TODO(alirezafathi): Refactor the proto file to be able to configure mask rcnn
// head easily.
// Next id: 15
message MaskRCNNBoxPredictor {
  // Hyperparameters for fully connected ops used in the box predictor.
  optional Hyperparams fc_hyperparams = 1;

  // Whether to use a dropout op prior to both the box and class predictions.
  optional bool use_dropout = 2 [default = false];

  // Keep probability for dropout. This is only used if use_dropout is true.
  optional float dropout_keep_probability = 3 [default = 0.5];

  // Size of the encoding for the boxes.
  optional int32 box_code_size = 4 [default = 4];

  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 5;

  // Whether to predict instance masks inside detection boxes.
  optional bool predict_instance_masks = 6 [default = false];

  // The depth for the first conv2d_transpose op applied to the
  // image_features in the mask prediction branch. If set to 0, the value
  // will be set automatically based on the number of channels in the image
  // features and the number of classes.
  optional int32 mask_prediction_conv_depth = 7 [default = 256];

  // Whether to predict keypoints inside detection boxes.
  optional bool predict_keypoints = 8 [default = false];

  // The height of the predicted mask.
  optional int32 mask_height = 9 [default = 15];
  // The width of the predicted mask.
  optional int32 mask_width = 10 [default = 15];

  // The number of convolutions applied to image_features in the mask prediction
  // branch.
  optional int32 mask_prediction_num_conv_layers = 11 [default = 2];
  // Whether the predicted masks are class agnostic.
  // NOTE(review): presumably this means a single mask is predicted per box
  // rather than one mask per class -- confirm against the predictor code.
  optional bool masks_are_class_agnostic = 12 [default = false];

  // Whether to use one box for all classes rather than a different box for each
  // class.
  optional bool share_box_across_classes = 13 [default = false];

  // Whether to apply convolutions on mask features before upsampling using
  // nearest neighbor resizing.
  // By default, mask features are resized to [`mask_height`, `mask_width`]
  // before applying convolutions and predicting masks.
  optional bool convolve_then_upsample_masks = 14 [default = false];
}

// Configuration proto for RFCN box predictor.
message RfcnBoxPredictor {
  // Hyperparameters for convolution ops used in the box predictor.
  optional Hyperparams conv_hyperparams = 1;

  // Bin sizes for RFCN crops: number of spatial bins along the height.
  optional int32 num_spatial_bins_height = 2 [default = 3];

  // Bin sizes for RFCN crops: number of spatial bins along the width.
  optional int32 num_spatial_bins_width = 3 [default = 3];

  // Target depth to reduce the input image features to.
  optional int32 depth = 4 [default = 1024];

  // Size of the encoding for the boxes.
  optional int32 box_code_size = 5 [default = 4];

  // Size to resize the rfcn crops to: crop height.
  optional int32 crop_height = 6 [default = 12];

  // Size to resize the rfcn crops to: crop width.
  optional int32 crop_width = 7 [default = 12];
}