dataset:
  use_epochs: false
  num_workers: 4
  batch_size: ${experiment.batch_size_per_gpu}
  _target_: ocl.datasets.WebdatasetDataModule
  train_shards: ${oc.env:DATASET_PREFIX}/vg/train/shard-{000000..000303}.tar
  train_size: 118287
  val_shards: ${oc.env:DATASET_PREFIX}/vg/val/shard-{000000..000037}.tar
  val_size: 5000
  test_shards: ${oc.env:DATASET_PREFIX}/vg/test/shard-{000000..000037}.tar
  test_size: 40670
  use_autopadding: true
  eval_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_bbox_centroids
      - all_names
      - references
      - tokens
      batch_transform: false
    03c_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
  train_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_names
      - references
      - tokens
      batch_transform: false
    03b_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        name_embedding:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda name_embedding: name_embedding.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda bbox_centroids: bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        all_bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda all_bbox_centroids: all_bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        selected_indices:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda selected_indices: selected_indices.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        contrastive_loss_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda contrastive_loss_mask: contrastive_loss_mask.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
models:
  feature_extractor:
    _target_: routed.ocl.feature_extractors.TimmFeatureExtractor
    model_name: ${experiment.timm_model}
    pretrained: ${when_testing:false,true}
    freeze: true
    feature_level: 12
    video_path: input.image
    dynamic_img_size: true
  mapping:
    _target_: routed.ocl.mapping.MLPMapping
    dim: ${experiment.feature_dim}
    x_path: feature_extractor
  conditioning:
    _target_: routed.ocl.conditioning.LangConditioning
    n_slots: ${experiment.num_slots}
    object_dim: ${experiment.slot_dim}
    dual_conditioning: false
    name_embedding_path: input.name_embedding
    batch_size_path: input.batch_size
    mask_path: input.contrastive_loss_mask
  perceptual_grouping:
    _target_: routed.ocl.perceptual_grouping.SlotAttentionGrouping
    feature_dim: ${.object_dim}
    object_dim: ${experiment.slot_dim}
    use_projection_bias: false
    positional_embedding:
      _target_: ocl.neural_networks.wrappers.Sequential
      _args_:
      - _target_: ocl.neural_networks.positional_embedding.DummyPositionEmbed
      - _target_: ocl.neural_networks.build_two_layer_mlp
        input_dim: ${experiment.feature_dim}
        output_dim: ${....feature_dim}
        hidden_dim: '${mul: ${experiment.feature_dim}, 2}'
        initial_layer_norm: true
    ff_mlp:
      _target_: ocl.neural_networks.build_two_layer_mlp
      input_dim: ${..object_dim}
      output_dim: ${..object_dim}
      hidden_dim: '${mul: ${..object_dim}, 4}'
      initial_layer_norm: true
      residual: true
    feature_path: mapping
    conditioning_path: conditioning
  attn_aggregation:
    _target_: routed.ocl.heads.AttentionAggregationHead
    dim: ${experiment.feature_dim}
    attn_path: perceptual_grouping.feature_attributions
    x_path: mapping.features
  projector_slots:
    _target_: routed.ocl.heads.SlotProjectorHead
    dim: ${experiment.feature_dim}
    embedding_dim: 4096
    slots_path: attn_aggregation
  lang_embedding:
    _target_: routed.ocl.heads.LangEmbeddingHead
    embedding_dim: 4096
    name_embedding_path: input.name_embedding
  point_embedding:
    _target_: routed.ocl.heads.PointEmbeddingHead
    embedding_dim: 4096
    point_embedding_path: input.bbox_centroids
  dec_conditioning:
    _target_: routed.ocl.decoder_conditioning.EncodeLangConditioning
    dim: ${experiment.slot_dim}
    language_path: input.name_embedding
    mask_path: input.contrastive_loss_mask
  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    decoder:
      _target_: ocl.neural_networks.build_mlp
      _partial_: true
      features:
      - 2048
      - 2048
      - 2048
    object_dim: ${experiment.slot_dim}
    output_dim: ${experiment.feature_dim}
    num_patches: ${experiment.num_patches}
    object_features_path: perceptual_grouping.objects
    image_path: input.image
    conditioned: true
    condition_info_path: dec_conditioning
optimizers:
  opt0:
    _target_: ocl.optimization.OptimizationWrapper
    optimizer:
      _target_: torch.optim.AdamW
      _partial_: true
      lr: ${experiment.total_lr}
    lr_scheduler:
      _target_: ocl.scheduling.exponential_decay_after_optional_warmup
      _partial_: true
      decay_rate: 0.5
      decay_steps: 100000
      warmup_steps: 10000
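    # Added note (assumption about the scheduler's semantics, not taken from the original
    # config): if exponential_decay_after_optional_warmup follows the usual TF-style rule
    # lr_mult = decay_rate ** (step / decay_steps), the learning rate halves every 100k
    # steps once the 10k-step warmup has finished.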
    parameter_groups:
      _target_: ocl.optimization.ParameterGroupCreator
      param_groups:
        grouping:
          params:
          - models.perceptual_grouping
          - models.conditioning
          - models.object_decoder
          - models.dec_conditioning
          lr: ${experiment.total_lr}
          weight_decay: 0.0
        encoder:
          params:
          - models.mapping
          - models.lang_embedding
          - models.point_embedding
          - models.attn_aggregation
          - models.projector_slots
          lr: ${experiment.mapping_lr}
          weight_decay: 0.0
losses:
  mse:
    _target_: routed.ocl.losses.ReconstructionLoss
    loss_type: mse
    input_path: object_decoder.reconstruction
    target_path: feature_extractor.features
  contrastive_loss_lang:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: lang_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
  contrastive_loss_point:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: point_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
visualizations:
  input:
    _target_: routed.ocl.visualizations.Image
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
  masks:
    _target_: routed.ocl.visualizations.Mask
    mask_path: object_decoder.masks_as_image
  pred_segmentation:
    _target_: routed.ocl.visualizations.Segmentation
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
  pred_segmentation_with_text:
    _target_: routed.ocl.visualizations.SegmentationWithText
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
    gt_masks_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    text_path: input.name
    bbox_centroids_path: input.all_bbox_centroids
trainer:
  _target_: pytorch_lightning.trainer.trainer.Trainer
  accelerator: auto
  strategy: auto
  devices: 1
  num_nodes: 1
  precision: null
  logger: null
  callbacks: ${oc.dict.values:experiment.callbacks}
  fast_dev_run: false
  max_epochs: -1
  min_epochs: null
  max_steps: 500000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 5000
  check_val_every_n_epoch: null
  num_sanity_val_steps: null
  log_every_n_steps: 100
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
training_vis_frequency: 10000
training_metrics:
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
evaluation_metrics:
  binding_hits:
    _target_: routed.ocl.metrics.BindingHits
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: false
  instance_ari:
    _target_: routed.ocl.metrics.ARIMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    foreground: false
    convert_target_one_hot: true
    ignore_overlaps: true
  instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  gt_matched_instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
load_checkpoint: null
load_checkpoint_partial: null
modules_to_load: null
trainable_models: null
seed: null
experiment:
  callbacks: {}
  checkpoint_every_n_steps: 1000
  image_size: 224
  mask_size: ${.image_size}
  batch_size_per_gpu: 128
  base_learning_rate: 0.0004
  max_num_binds: 7
  slot_dim: 256
  num_slots: 7
  timm_model: vit_small_patch14_dinov2.lvd142m
  feature_dim: '${timm_model_dim: ${.timm_model}}'
  num_patches: '${timm_model_num_patches: ${.timm_model}, ${.image_size}}'
  num_patches_per_side: '${isqrt: ${.num_patches}}'
  patch_size: '${timm_model_patch_size: ${.timm_model}}'
  total_batch_size: '${mul: ${trainer.devices}, ${.batch_size_per_gpu}}'
  total_lr: '${eval: ''a * (b / 64)**0.5'', ${.base_learning_rate}, ${.total_batch_size}}'
  mapping_lr: '${mul: 0.1, ${.total_lr}}'
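  # Added worked example (assumes the custom timm_model_* resolvers return the backbone's
  # embed dim and patch grid, and single-GPU training as configured above):
  # vit_small_patch14_dinov2 at image_size 224 -> patch_size 14, num_patches = (224 / 14)**2 = 256
  # (16 per side), feature_dim = 384; total_lr = 0.0004 * (128 / 64)**0.5 ~ 5.66e-4 and
  # mapping_lr = 0.1 * total_lr ~ 5.66e-5.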