from typing import Dict, List, Optional

from torch import nn, Tensor

from .model.multimae import generate_smultimae_model as generate_smultimae_model_v1
from .configs.base_config import base_cfg


class RGBDModel(nn.Module):
    """Thin ``nn.Module`` wrapper around the model built from a ``base_cfg``.

    The wrapper packs the RGB image batch (and, when configured, the depth
    batch) into a dict keyed by modality name and hands it to the wrapped
    model.
    """

    def __init__(self, cfg: base_cfg):
        """Build the wrapped model selected by ``cfg.model_version``.

        Args:
            cfg: Project configuration. This constructor reads ``inputs``,
                ``outputs``, ``is_inference_with_no_depth`` and
                ``model_version`` from it, and passes the whole object to
                the model factory.

        Raises:
            ValueError: If ``cfg.model_version`` is not a supported version
                (only version 1 is handled here).
        """
        super().__init__()

        self.inputs = cfg.inputs
        self.outputs = cfg.outputs
        # Stored for callers; not consulted by any method of this class.
        self.is_no_depth = cfg.is_inference_with_no_depth

        if cfg.model_version == 1:
            # The factory returns the model plus a second value kept as
            # `opt_params` (presumably optimizer parameter groups -- confirm
            # against the factory).
            self.model, self.opt_params = generate_smultimae_model_v1(cfg)
        else:
            # ValueError is a subclass of Exception, so existing
            # `except Exception` handlers keep working.
            raise ValueError(f"Unsupported model version {cfg.model_version}")

    def encode_decode(
        self,
        images: Tensor,
        depths: Optional[Tensor],
        gt_index_lst: Optional[List[int]] = None,
        max_gts_lst: Optional[List[int]] = None,
    ) -> Dict[str, Tensor]:
        """Run the wrapped model on an RGB batch and optional depth batch.

        Args:
            images: RGB input, passed to the model under the key ``"rgb"``.
            depths: Depth input, passed under the key ``"depth"`` only when
                ``"depth"`` is listed in ``self.inputs``; ignored otherwise.
                It is forwarded as-is, even when it is ``None``.
            gt_index_lst: Forwarded unchanged to the wrapped model.
            max_gts_lst: Forwarded unchanged to the wrapped model.

        Returns:
            Whatever the wrapped model returns. The expected layout is::

                {
                    "sod": Tensor,
                    "depth": Optional[Tensor],
                    "rgb": Optional[Tensor],
                }
        """
        inputs = {"rgb": images}
        if "depth" in self.inputs:
            inputs["depth"] = depths
        # Call the module itself rather than `.forward` so that any hooks
        # registered on the wrapped model are executed.
        return self.model(inputs, gt_index_lst, max_gts_lst)

    def forward(
        self,
        images: Tensor,
        depths: Optional[Tensor],
        gt_index_lst: Optional[List[int]] = None,
        max_gts_lst: Optional[List[int]] = None,
    ) -> Dict[str, Tensor]:
        """Delegate to :meth:`encode_decode` with the same arguments."""
        return self.encode_decode(images, depths, gt_index_lst, max_gts_lst)

    def inference(
        self,
        images: Tensor,
        depths: Optional[Tensor],
        gt_index_lst: Optional[List[int]] = None,
        max_gts_lst: Optional[List[int]] = None,
    ) -> Dict[str, Tensor]:
        """Delegate to :meth:`encode_decode` with the same arguments.

        Currently identical to :meth:`forward`.
        """
        return self.encode_decode(images, depths, gt_index_lst, max_gts_lst)