# NOTE(review): the following lines are Hugging Face file-viewer page chrome
# captured together with the source; kept as comments so the file parses.
# deanna-emery's picture / updates / 93528c6 / raw / history blame / 20.7 kB
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""R-CNN(-RS) models."""
from typing import Any, List, Mapping, Optional, Tuple, Union
import tensorflow as tf, tf_keras
from official.vision.ops import anchor
from official.vision.ops import box_ops
@tf_keras.utils.register_keras_serializable(package='Vision')
class MaskRCNNModel(tf_keras.Model):
  """The Mask R-CNN(-RS) and Cascade RCNN-RS models."""

  def __init__(self,
               backbone: tf_keras.Model,
               decoder: tf_keras.Model,
               rpn_head: tf_keras.layers.Layer,
               detection_head: Union[tf_keras.layers.Layer,
                                     List[tf_keras.layers.Layer]],
               roi_generator: tf_keras.layers.Layer,
               roi_sampler: Union[tf_keras.layers.Layer,
                                  List[tf_keras.layers.Layer]],
               roi_aligner: tf_keras.layers.Layer,
               detection_generator: tf_keras.layers.Layer,
               mask_head: Optional[tf_keras.layers.Layer] = None,
               mask_sampler: Optional[tf_keras.layers.Layer] = None,
               mask_roi_aligner: Optional[tf_keras.layers.Layer] = None,
               class_agnostic_bbox_pred: bool = False,
               cascade_class_ensemble: bool = False,
               min_level: Optional[int] = None,
               max_level: Optional[int] = None,
               num_scales: Optional[int] = None,
               aspect_ratios: Optional[List[float]] = None,
               anchor_size: Optional[float] = None,
               outer_boxes_scale: float = 1.0,
               **kwargs):
    """Initializes the R-CNN(-RS) model.

    Args:
      backbone: `tf_keras.Model`, the backbone network.
      decoder: `tf_keras.Model`, the decoder network.
      rpn_head: the RPN head.
      detection_head: the detection head or a list of heads. Passing a list
        builds a cascade: one head per stage, paired with `roi_sampler`.
      roi_generator: the ROI generator.
      roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
        detection heads.
      roi_aligner: the ROI aligner.
      detection_generator: the detection generator.
      mask_head: the mask head. When provided, the mask branch is enabled and
        `mask_sampler` / `mask_roi_aligner` become required.
      mask_sampler: the mask sampler.
      mask_roi_aligner: the ROI aligner for mask prediction.
      class_agnostic_bbox_pred: if True, perform class agnostic bounding box
        prediction. Needs to be `True` for Cascade RCNN models.
      cascade_class_ensemble: if True, ensemble classification scores over all
        detection heads.
      min_level: Minimum level in output feature maps.
      max_level: Maximum level in output feature maps.
      num_scales: A number representing intermediate scales added on each level.
        For instances, num_scales=2 adds one additional intermediate anchor
        scales [2^0, 2^0.5] on each level.
      aspect_ratios: A list representing the aspect ratio anchors added on each
        level. The number indicates the ratio of width to height. For instances,
        aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
      anchor_size: A number representing the scale of size of the base anchor to
        the feature stride 2^level.
      outer_boxes_scale: a float to scale up the bounding boxes to generate
        more inclusive masks. The scale is expected to be >=1.0.
      **kwargs: keyword arguments to be passed.
    """
    super().__init__(**kwargs)
    # Raw constructor arguments, kept verbatim so `get_config`/`from_config`
    # can round-trip the model through Keras serialization.
    self._config_dict = {
        'backbone': backbone,
        'decoder': decoder,
        'rpn_head': rpn_head,
        'detection_head': detection_head,
        'roi_generator': roi_generator,
        'roi_sampler': roi_sampler,
        'roi_aligner': roi_aligner,
        'detection_generator': detection_generator,
        'outer_boxes_scale': outer_boxes_scale,
        'mask_head': mask_head,
        'mask_sampler': mask_sampler,
        'mask_roi_aligner': mask_roi_aligner,
        'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
        'cascade_class_ensemble': cascade_class_ensemble,
        'min_level': min_level,
        'max_level': max_level,
        'num_scales': num_scales,
        'aspect_ratios': aspect_ratios,
        'anchor_size': anchor_size,
    }
    self.backbone = backbone
    self.decoder = decoder
    self.rpn_head = rpn_head
    # Normalize the detection head(s) to a list so cascade and single-stage
    # models share one per-stage loop in `_call_box_outputs`.
    if not isinstance(detection_head, (list, tuple)):
      self.detection_head = [detection_head]
    else:
      self.detection_head = detection_head
    self.roi_generator = roi_generator
    # Same normalization for the ROI sampler(s); its length defines the
    # number of cascade stages.
    if not isinstance(roi_sampler, (list, tuple)):
      self.roi_sampler = [roi_sampler]
    else:
      self.roi_sampler = roi_sampler
    if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
      raise ValueError(
          '`class_agnostic_bbox_pred` needs to be True if multiple detection heads are specified.'
      )
    self.roi_aligner = roi_aligner
    self.detection_generator = detection_generator
    # The mask branch is enabled iff a mask head was supplied.
    self._include_mask = mask_head is not None
    if outer_boxes_scale < 1.0:
      raise ValueError('`outer_boxes_scale` should be a value >= 1.0.')
    self.outer_boxes_scale = outer_boxes_scale
    self.mask_head = mask_head
    if self._include_mask and mask_sampler is None:
      raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
    self.mask_sampler = mask_sampler
    if self._include_mask and mask_roi_aligner is None:
      raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
    self.mask_roi_aligner = mask_roi_aligner
    # Weights for the regression losses for each FRCNN layer; later cascade
    # stages use larger weights because predicted deltas shrink stage over
    # stage (indexed by cascade stage in `_call_box_outputs`).
    # TODO(jiageng): Make the weights configurable.
    self._cascade_layer_to_weights = [
        [10.0, 10.0, 5.0, 5.0],
        [20.0, 20.0, 10.0, 10.0],
        [30.0, 30.0, 15.0, 15.0],
    ]

  def call(  # pytype: disable=signature-mismatch # overriding-parameter-count-checks
      self,
      images: tf.Tensor,
      image_shape: tf.Tensor,
      anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
      gt_boxes: Optional[tf.Tensor] = None,
      gt_classes: Optional[tf.Tensor] = None,
      gt_masks: Optional[tf.Tensor] = None,
      gt_outer_boxes: Optional[tf.Tensor] = None,
      training: Optional[bool] = None) -> Mapping[str, Optional[tf.Tensor]]:
    """Runs the box forward pass and, when configured, the mask branch.

    Args:
      images: the batched input images.
      image_shape: per-image shape tensor used to clip and scale boxes.
      anchor_boxes: optional precomputed multilevel anchor boxes; when None,
        anchors are generated from the anchor config in `_call_box_outputs`.
      gt_boxes: groundtruth boxes; consumed only in training.
      gt_classes: groundtruth classes; consumed only in training.
      gt_masks: groundtruth masks for the mask branch; consumed only in
        training.
      gt_outer_boxes: groundtruth outer boxes; consumed only when
        `outer_boxes_scale > 1.0`.
      training: whether the model is run in training mode.

    Returns:
      A dict of output tensors (RPN and detection outputs, plus mask outputs
      when a mask head is configured).
    """
    call_box_outputs_kwargs = {
        'images': images,
        'image_shape': image_shape,
        'anchor_boxes': anchor_boxes,
        'gt_boxes': gt_boxes,
        'gt_classes': gt_classes,
        'training': training,
    }
    # Outer boxes are only forwarded when actually in use; otherwise
    # `_call_box_outputs` falls back to its `gt_outer_boxes=None` default.
    if self.outer_boxes_scale > 1.0:
      call_box_outputs_kwargs['gt_outer_boxes'] = gt_outer_boxes
    model_outputs, intermediate_outputs = self._call_box_outputs(
        **call_box_outputs_kwargs)
    if not self._include_mask:
      return model_outputs

    if self.outer_boxes_scale == 1.0:
      current_rois = intermediate_outputs['current_rois']
      matched_gt_boxes = intermediate_outputs['matched_gt_boxes']
    else:
      # The mask branch runs on enlarged ("outer") ROIs so the predicted
      # masks are more inclusive than the tight detection boxes.
      current_rois = box_ops.compute_outer_boxes(
          intermediate_outputs['current_rois'],
          tf.expand_dims(image_shape, axis=1), self.outer_boxes_scale)
      matched_gt_boxes = intermediate_outputs['matched_gt_outer_boxes']

    model_mask_outputs = self._call_mask_outputs(
        model_box_outputs=model_outputs,
        features=model_outputs['decoder_features'],
        current_rois=current_rois,
        matched_gt_indices=intermediate_outputs['matched_gt_indices'],
        matched_gt_boxes=matched_gt_boxes,
        matched_gt_classes=intermediate_outputs['matched_gt_classes'],
        gt_masks=gt_masks,
        training=training)
    model_outputs.update(model_mask_outputs)  # pytype: disable=attribute-error # dynamic-method-lookup
    return model_outputs

  def _get_backbone_and_decoder_features(self, images):
    """Returns (backbone_features, decoder_features) for `images`.

    When no decoder is configured, the backbone features double as the
    decoder features.
    """
    backbone_features = self.backbone(images)
    if self.decoder:
      features = self.decoder(backbone_features)
    else:
      features = backbone_features
    return backbone_features, features

  def _call_box_outputs(
      self,
      images: tf.Tensor,
      image_shape: tf.Tensor,
      anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
      gt_boxes: Optional[tf.Tensor] = None,
      gt_classes: Optional[tf.Tensor] = None,
      training: Optional[bool] = None,
      gt_outer_boxes: Optional[tf.Tensor] = None,
  ) -> Tuple[Mapping[str, Any], Mapping[str, Any]]:
    """Implementation of the Faster-RCNN logic for boxes."""
    model_outputs = {}

    # Feature extraction.
    (backbone_features,
     decoder_features) = self._get_backbone_and_decoder_features(images)

    # Region proposal network.
    rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

    model_outputs.update({
        'backbone_features': backbone_features,
        'decoder_features': decoder_features,
        'rpn_boxes': rpn_boxes,
        'rpn_scores': rpn_scores
    })

    # Generate anchor boxes for this batch if not provided.
    if anchor_boxes is None:
      _, image_height, image_width, _ = images.get_shape().as_list()
      anchor_boxes = anchor.Anchor(
          min_level=self._config_dict['min_level'],
          max_level=self._config_dict['max_level'],
          num_scales=self._config_dict['num_scales'],
          aspect_ratios=self._config_dict['aspect_ratios'],
          anchor_size=self._config_dict['anchor_size'],
          image_size=(image_height, image_width)).multilevel_boxes
      # Broadcast the per-level anchors over the batch dimension.
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0),
            [tf.shape(images)[0], 1, 1, 1])

    # Generate RoIs.
    current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                         image_shape, training)

    next_rois = current_rois
    all_class_outputs = []
    # One iteration per cascade stage; a single-sampler model runs one pass.
    for cascade_num in range(len(self.roi_sampler)):
      # In cascade RCNN we want the higher layers to have different regression
      # weights as the predicted deltas become smaller and smaller.
      regression_weights = self._cascade_layer_to_weights[cascade_num]
      current_rois = next_rois

      # The two branches differ only in whether outer-box targets are
      # threaded through `_run_frcnn_head` and unpacked from its result.
      if self.outer_boxes_scale == 1.0:
        (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
         matched_gt_classes, matched_gt_indices,
         current_rois) = self._run_frcnn_head(
             features=decoder_features,
             rois=current_rois,
             gt_boxes=gt_boxes,
             gt_classes=gt_classes,
             training=training,
             model_outputs=model_outputs,
             cascade_num=cascade_num,
             regression_weights=regression_weights)
      else:
        (class_outputs, box_outputs, model_outputs,
         (matched_gt_boxes, matched_gt_outer_boxes), matched_gt_classes,
         matched_gt_indices, current_rois) = self._run_frcnn_head(
             features=decoder_features,
             rois=current_rois,
             gt_boxes=gt_boxes,
             gt_outer_boxes=gt_outer_boxes,
             gt_classes=gt_classes,
             training=training,
             model_outputs=model_outputs,
             cascade_num=cascade_num,
             regression_weights=regression_weights)
      all_class_outputs.append(class_outputs)

      # Generate ROIs for the next cascade head if there is any.
      if cascade_num < len(self.roi_sampler) - 1:
        next_rois = box_ops.decode_boxes(
            tf.cast(box_outputs, tf.float32),
            current_rois,
            weights=regression_weights)
        next_rois = box_ops.clip_boxes(next_rois,
                                       tf.expand_dims(image_shape, axis=1))

    if not training:
      if self._config_dict['cascade_class_ensemble']:
        # Average the class scores over all cascade stages.
        class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

      detections = self.detection_generator(
          box_outputs,
          class_outputs,
          current_rois,
          image_shape,
          regression_weights,
          bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
      model_outputs.update({
          'cls_outputs': class_outputs,
          'box_outputs': box_outputs,
      })
      if self.detection_generator.get_config()['apply_nms']:
        model_outputs.update({
            'detection_boxes': detections['detection_boxes'],
            'detection_scores': detections['detection_scores'],
            'detection_classes': detections['detection_classes'],
            'num_detections': detections['num_detections']
        })
        if self.outer_boxes_scale > 1.0:
          detection_outer_boxes = box_ops.compute_outer_boxes(
              detections['detection_boxes'],
              tf.expand_dims(image_shape, axis=1), self.outer_boxes_scale)
          model_outputs['detection_outer_boxes'] = detection_outer_boxes
      else:
        # NMS is deferred to the caller; expose the decoded boxes instead.
        model_outputs.update({
            'decoded_boxes': detections['decoded_boxes'],
            'decoded_box_scores': detections['decoded_box_scores']
        })
    # Values from the last cascade stage, consumed by the mask branch.
    intermediate_outputs = {
        'matched_gt_boxes': matched_gt_boxes,
        'matched_gt_indices': matched_gt_indices,
        'matched_gt_classes': matched_gt_classes,
        'current_rois': current_rois,
    }
    if self.outer_boxes_scale > 1.0:
      intermediate_outputs['matched_gt_outer_boxes'] = matched_gt_outer_boxes
    return (model_outputs, intermediate_outputs)

  def _call_mask_outputs(
      self,
      model_box_outputs: Mapping[str, tf.Tensor],
      features: tf.Tensor,
      current_rois: tf.Tensor,
      matched_gt_indices: tf.Tensor,
      matched_gt_boxes: tf.Tensor,
      matched_gt_classes: tf.Tensor,
      gt_masks: tf.Tensor,
      training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
    """Implementation of Mask-RCNN mask prediction logic."""
    model_outputs = dict(model_box_outputs)
    if training:
      current_rois, roi_classes, roi_masks = self.mask_sampler(
          current_rois, matched_gt_boxes, matched_gt_classes,
          matched_gt_indices, gt_masks)
      # Mask targets are labels, not predictions; keep gradients out of them.
      roi_masks = tf.stop_gradient(roi_masks)

      model_outputs.update({
          'mask_class_targets': roi_classes,
          'mask_targets': roi_masks,
      })
    else:
      # At inference, predict masks for the final (post-NMS) detections.
      if self.outer_boxes_scale == 1.0:
        current_rois = model_outputs['detection_boxes']
      else:
        current_rois = model_outputs['detection_outer_boxes']

      roi_classes = model_outputs['detection_classes']

    mask_logits, mask_probs = self._features_to_mask_outputs(
        features, current_rois, roi_classes)

    if training:
      model_outputs.update({
          'mask_outputs': mask_logits,
      })
    else:
      model_outputs.update({
          'detection_masks': mask_probs,
      })
    return model_outputs

  def _run_frcnn_head(self,
                      features,
                      rois,
                      gt_boxes,
                      gt_classes,
                      training,
                      model_outputs,
                      cascade_num,
                      regression_weights,
                      gt_outer_boxes=None):
    """Runs the frcnn head that does both class and box prediction.

    Args:
      features: `list` of features from the feature extractor.
      rois: `list` of current rois that will be used to predict bbox refinement
        and classes from.
      gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
        This tensor might have paddings with a negative value.
      gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
        classes. It is padded with -1s to indicate the invalid classes.
      training: `bool`, if model is training or being evaluated.
      model_outputs: `dict`, used for storing outputs used for eval and losses.
      cascade_num: `int`, the current frcnn layer in the cascade.
      regression_weights: `list`, weights used for l1 loss in bounding box
        regression.
      gt_outer_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES,
        4]. This tensor might have paddings with a negative value. Default to
        None.

    Returns:
      class_outputs: Class predictions for rois.
      box_outputs: Box predictions for rois. These are formatted for the
        regression loss and need to be converted before being used as rois
        in the next stage.
      model_outputs: Updated dict with predictions used for losses and eval.
      matched_gt_boxes: If `is_training` is true, then these give the gt box
        location of its positive match.
      matched_gt_outer_boxes: If `is_training` is true, then these give the
        outer box location of its positive match. Only exist if
        outer_boxes_scale is greater than 1.0.
      matched_gt_classes: If `is_training` is true, then these give the gt class
        of the predicted box.
      matched_gt_indices: If `is_training` is true, then gives the index of
        the positive box match. Used for mask prediction.
      rois: The sampled rois used for this layer.
    """
    # Only used during training.
    matched_gt_boxes, matched_gt_classes, matched_gt_indices = None, None, None
    if self.outer_boxes_scale > 1.0:
      matched_gt_outer_boxes = None

    if training and gt_boxes is not None:
      # ROIs act as fixed proposals for this stage; no gradient flows back
      # through their coordinates.
      rois = tf.stop_gradient(rois)

      current_roi_sampler = self.roi_sampler[cascade_num]
      if self.outer_boxes_scale == 1.0:
        rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
            current_roi_sampler(rois, gt_boxes, gt_classes))
      else:
        (rois, matched_gt_boxes, matched_gt_outer_boxes, matched_gt_classes,
         matched_gt_indices) = current_roi_sampler(rois, gt_boxes, gt_classes,
                                                   gt_outer_boxes)
      # Create bounding box training targets.
      box_targets = box_ops.encode_boxes(
          matched_gt_boxes, rois, weights=regression_weights)
      # If the target is background, the box target is set to all 0s.
      box_targets = tf.where(
          tf.tile(
              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
              [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
      # Stage 0 uses the unsuffixed key names; later stages get a suffix.
      model_outputs.update({
          'class_targets_{}'.format(cascade_num)
          if cascade_num else 'class_targets':
              matched_gt_classes,
          'box_targets_{}'.format(cascade_num)
          if cascade_num else 'box_targets':
              box_targets,
      })

    # Get roi features.
    roi_features = self.roi_aligner(features, rois)

    # Run frcnn head to get class and bbox predictions.
    current_detection_head = self.detection_head[cascade_num]
    class_outputs, box_outputs = current_detection_head(roi_features)

    model_outputs.update({
        'class_outputs_{}'.format(cascade_num)
        if cascade_num else 'class_outputs':
            class_outputs,
        'box_outputs_{}'.format(cascade_num) if cascade_num else 'box_outputs':
            box_outputs,
    })
    if self.outer_boxes_scale == 1.0:
      return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
              matched_gt_classes, matched_gt_indices, rois)
    else:
      return (class_outputs, box_outputs, model_outputs,
              (matched_gt_boxes, matched_gt_outer_boxes), matched_gt_classes,
              matched_gt_indices, rois)

  def _features_to_mask_outputs(self, features, rois, roi_classes):
    """Returns (mask_logits, mask_probabilities) for the given ROIs."""
    # Mask RoI align.
    mask_roi_features = self.mask_roi_aligner(features, rois)

    # Mask head.
    raw_masks = self.mask_head([mask_roi_features, roi_classes])

    return raw_masks, tf.nn.sigmoid(raw_masks)

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf_keras.Model, tf_keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(
        backbone=self.backbone,
        rpn_head=self.rpn_head,
        detection_head=self.detection_head)
    if self.decoder is not None:
      items.update(decoder=self.decoder)
    if self._include_mask:
      items.update(mask_head=self.mask_head)
    return items

  def get_config(self) -> Mapping[str, Any]:
    """Returns the constructor arguments for Keras serialization."""
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    """Rebuilds the model from a `get_config` dict."""
    return cls(**config)