syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1].
// [1]: https://arxiv.org/abs/1904.07850
message CenterNet {
  // Number of classes to predict.
  optional int32 num_classes = 1;
  // Feature extractor config.
  optional CenterNetFeatureExtractor feature_extractor = 2;
  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 3;
  // Parameters which are related to the object detection task.
  message ObjectDetection {
    // The original fields are moved to ObjectCenterParams or deleted.
    reserved 2, 5, 6, 7;
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];
    // Weight for the offset localization loss.
    optional float offset_loss_weight = 3 [default = 1.0];
    // Weight for the height/width localization loss.
    optional float scale_loss_weight = 4 [default = 0.1];
    // Localization loss configuration for object scale and offset losses.
    optional LocalizationLoss localization_loss = 8;
  }
  optional ObjectDetection object_detection_task = 4;
  // Parameters related to object center prediction. This is required for both
  // object detection and keypoint estimation tasks.
  message ObjectCenterParams {
    // Weight for the object center loss.
    optional float object_center_loss_weight = 1 [default = 1.0];
    // Classification loss configuration for object center loss.
    optional ClassificationLoss classification_loss = 2;
    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 3 [default = -2.19];
    // The minimum IOU overlap boxes need to have to not be penalized.
    optional float min_box_overlap_iou = 4 [default = 0.7];
    // Maximum number of boxes to predict.
    optional int32 max_box_predictions = 5 [default = 100];
    // If set, loss is only computed for the labeled classes.
    optional bool use_labeled_classes = 6 [default = false];
  }
  optional ObjectCenterParams object_center_params = 5;
  // Path of the file that contains the label map along with the keypoint
  // information, including the keypoint indices, corresponding labels, and the
  // corresponding class. The file should be the same one as used in the input
  // pipeline. Note that a plain text of StringIntLabelMap proto is expected in
  // this file.
  // It is required only if the keypoint estimation task is specified.
  optional string keypoint_label_map_path = 6;
  // Parameters which are related to the keypoint estimation task.
  message KeypointEstimation {
    // Name of the task, e.g. "human pose". Note that the task name should be
    // unique to each keypoint task.
    optional string task_name = 1;
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 2 [default = 1.0];
    // Loss configuration for keypoint heatmap, offset, regression losses. Note
    // that the localization loss is used for offset/regression losses and
    // classification loss is used for heatmap loss.
    optional Loss loss = 3;
    // The name of the class that contains the keypoints for this task. This is
    // used to retrieve the corresponding keypoint indices from the label map.
    // Note that this corresponds to the "name" field, not "display_name".
    optional string keypoint_class_name = 4;
    // The standard deviation of the Gaussian kernel used to generate the
    // keypoint heatmap. The unit is the pixel in the output image. It is to
    // provide the flexibility of using different sizes of Gaussian kernel for
    // each keypoint class. Note that if provided, the keypoint standard
    // deviations will be overridden by the specified values here, otherwise,
    // the default value 5.0 will be used.
    // TODO(yuhuic): Update the default value once we found the best value.
    map<string, float> keypoint_label_to_std = 5;
    // Loss weights corresponding to different heads.
    optional float keypoint_regression_loss_weight = 6 [default = 1.0];
    optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
    optional float keypoint_offset_loss_weight = 8 [default = 1.0];
    // The initial bias value of the convolution kernel of the keypoint heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 9 [default = -2.19];
    // The heatmap score threshold for a keypoint to become a valid candidate.
    optional float keypoint_candidate_score_threshold = 10 [default = 0.1];
    // The maximum number of candidates to retrieve for each keypoint.
    optional int32 num_candidates_per_keypoint = 11 [default = 100];
    // Max pool kernel size to use to pull off peak score locations in a
    // neighborhood (independently for each keypoint type).
    optional int32 peak_max_pool_kernel_size = 12 [default = 3];
    // The default score to use for regressed keypoints that are not
    // successfully snapped to a nearby candidate.
    optional float unmatched_keypoint_score = 13 [default = 0.1];
    // The multiplier to expand the bounding boxes (either the provided boxes or
    // those which tightly cover the regressed keypoints). Note that the new
    // expanded box for an instance becomes the feasible search window for all
    // associated keypoints.
    optional float box_scale = 14 [default = 1.2];
    // The scale parameter that multiplies the largest dimension of a bounding
    // box. The resulting distance becomes a search radius for candidates in the
    // vicinity of each regressed keypoint.
    optional float candidate_search_scale = 15 [default = 0.3];
    // One of ['min_distance', 'score_distance_ratio'] indicating how to select
    // the keypoint candidate.
    optional string candidate_ranking_mode = 16 [default = "min_distance"];
    // The radius (in the unit of output pixel) around heatmap peak to assign
    // the offset targets. If set to 0, then the offset target will only be
    // assigned to the heatmap peak (same behavior as the original paper).
    optional int32 offset_peak_radius = 17 [default = 0];
    // Indicates whether to assign offsets for each keypoint channel
    // separately. If set to false, the output offset target has the shape
    // [batch_size, out_height, out_width, 2] (same behavior as the original
    // paper). If set to true, the output offset target has the shape
    // [batch_size, out_height, out_width, 2 * num_keypoints] (recommended when
    // the offset_peak_radius is not zero).
    optional bool per_keypoint_offset = 18 [default = false];
  }
  repeated KeypointEstimation keypoint_estimation_task = 7;
  // Parameters which are related to the mask estimation task.
  // Note: Currently, CenterNet supports a weak instance segmentation, where
  // semantic segmentation masks are estimated, and then cropped based on
  // bounding box detections. Therefore, it is possible for the same image
  // pixel to be assigned to multiple instances.
  message MaskEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];
    // Classification loss configuration for segmentation loss.
    optional ClassificationLoss classification_loss = 2;
    // Each instance mask (one per detection) is cropped and resized (bilinear
    // resampling) from the predicted segmentation feature map. After
    // resampling, the masks are binarized with the provided score threshold.
    optional int32 mask_height = 4 [default = 256];
    optional int32 mask_width = 5 [default = 256];
    optional float score_threshold = 6 [default = 0.5];
    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 3 [default = -2.19];
  }
  optional MaskEstimation mask_estimation_task = 8;
}
// Configuration of the feature extractor used by the CenterNet model, along
// with the input-image normalization it expects.
message CenterNetFeatureExtractor {
  // Name identifying which feature extractor implementation to use.
  // NOTE(review): the set of valid names is resolved by the model builder,
  // which is not visible in this file — consult the builder for valid values.
  optional string type = 1;
  // Channel means to be subtracted from each image channel. If not specified,
  // we use a default value of 0.
  repeated float channel_means = 2;
  // Channel standard deviations. Each channel will be normalized by dividing
  // it by its standard deviation. If not specified, we use a default value
  // of 1.
  repeated float channel_stds = 3;
  // If set, will change channel order to be [blue, green, red]. This can be
  // useful to be compatible with some pre-trained feature extractors.
  optional bool bgr_ordering = 4 [default = false];
}