andriizadaianchuk committed
Commit eb8e5cc
1 Parent(s): f4f79e3

Upload config.yaml with huggingface_hub
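For reference, this commit message matches the default that huggingface_hub's upload_file generates. A minimal sketch of such an upload; the repo_id below is a placeholder, not taken from this page:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token stored by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="config.yaml",   # local file to push
    path_in_repo="config.yaml",      # destination path inside the repo
    repo_id="username/repo-name",    # placeholder; the actual repo is not shown here
)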

Files changed (1)
  1. config.yaml +482 -0
config.yaml ADDED
@@ -0,0 +1,482 @@
dataset:
  use_epochs: false
  num_workers: 4
  batch_size: ${experiment.batch_size_per_gpu}
  _target_: ocl.datasets.WebdatasetDataModule
  train_shards: ${oc.env:DATASET_PREFIX}/vg/train/shard-{000000..000303}.tar
  train_size: 118287
  val_shards: ${oc.env:DATASET_PREFIX}/vg/val/shard-{000000..000037}.tar
  val_size: 5000
  test_shards: ${oc.env:DATASET_PREFIX}/vg/test/shard-{000000..000037}.tar
  test_size: 40670
  use_autopadding: true
  eval_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_bbox_centroids
      - all_names
      - references
      - tokens
      batch_transform: false
    03c_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
  train_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_names
      - references
      - tokens
      batch_transform: false
    03b_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        name_embedding:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda name_embedding: name_embedding.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda bbox_centroids: bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        all_bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda all_bbox_centroids: all_bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        selected_indices:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda selected_indices: selected_indices.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        contrastive_loss_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda contrastive_loss_mask: contrastive_loss_mask.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
models:
  feature_extractor:
    _target_: routed.ocl.feature_extractors.TimmFeatureExtractor
    model_name: ${experiment.timm_model}
    pretrained: ${when_testing:false,true}
    freeze: true
    feature_level: 12
    video_path: input.image
    dynamic_img_size: true
  mapping:
    _target_: routed.ocl.mapping.MLPMapping
    dim: ${experiment.feature_dim}
    x_path: feature_extractor
  conditioning:
    _target_: routed.ocl.conditioning.LangConditioning
    n_slots: ${experiment.num_slots}
    object_dim: ${experiment.slot_dim}
    dual_conditioning: false
    name_embedding_path: input.name_embedding
    batch_size_path: input.batch_size
    mask_path: input.contrastive_loss_mask
  perceptual_grouping:
    _target_: routed.ocl.perceptual_grouping.SlotAttentionGrouping
    feature_dim: ${.object_dim}
    object_dim: ${experiment.slot_dim}
    use_projection_bias: false
    positional_embedding:
      _target_: ocl.neural_networks.wrappers.Sequential
      _args_:
      - _target_: ocl.neural_networks.positional_embedding.DummyPositionEmbed
      - _target_: ocl.neural_networks.build_two_layer_mlp
        input_dim: ${experiment.feature_dim}
        output_dim: ${....feature_dim}
        hidden_dim: '${mul: ${experiment.feature_dim}, 2}'
        initial_layer_norm: true
    ff_mlp:
      _target_: ocl.neural_networks.build_two_layer_mlp
      input_dim: ${..object_dim}
      output_dim: ${..object_dim}
      hidden_dim: '${mul: ${..object_dim}, 4}'
      initial_layer_norm: true
      residual: true
    feature_path: mapping
    conditioning_path: conditioning
  attn_aggregation:
    _target_: routed.ocl.heads.AttentionAggregationHead
    dim: ${experiment.feature_dim}
    attn_path: perceptual_grouping.feature_attributions
    x_path: mapping.features
  projector_slots:
    _target_: routed.ocl.heads.SlotProjectorHead
    dim: ${experiment.feature_dim}
    embedding_dim: 4096
    slots_path: attn_aggregation
  lang_embedding:
    _target_: routed.ocl.heads.LangEmbeddingHead
    embedding_dim: 4096
    name_embedding_path: input.name_embedding
  point_embedding:
    _target_: routed.ocl.heads.PointEmbeddingHead
    embedding_dim: 4096
    point_embedding_path: input.bbox_centroids
  dec_conditioning:
    _target_: routed.ocl.decoder_conditioning.EncodeLangConditioning
    dim: ${experiment.slot_dim}
    language_path: input.name_embedding
    mask_path: input.contrastive_loss_mask
  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    decoder:
      _target_: ocl.neural_networks.build_mlp
      _partial_: true
      features:
      - 2048
      - 2048
      - 2048
    object_dim: ${experiment.slot_dim}
    output_dim: ${experiment.feature_dim}
    num_patches: ${experiment.num_patches}
    object_features_path: perceptual_grouping.objects
    image_path: input.image
    conditioned: true
    condition_info_path: dec_conditioning
optimizers:
  opt0:
    _target_: ocl.optimization.OptimizationWrapper
    optimizer:
      _target_: torch.optim.AdamW
      _partial_: true
      lr: ${experiment.total_lr}
    lr_scheduler:
      _target_: ocl.scheduling.exponential_decay_after_optional_warmup
      _partial_: true
      decay_rate: 0.5
      decay_steps: 100000
      warmup_steps: 10000
    parameter_groups:
      _target_: ocl.optimization.ParameterGroupCreator
      param_groups:
        grouping:
          params:
          - models.perceptual_grouping
          - models.conditioning
          - models.object_decoder
          - models.dec_conditioning
          lr: ${experiment.total_lr}
          weight_decay: 0.0
        encoder:
          params:
          - models.mapping
          - models.lang_embedding
          - models.point_embedding
          - models.attn_aggregation
          - models.projector_slots
          lr: ${experiment.mapping_lr}
          weight_decay: 0.0
losses:
  mse:
    _target_: routed.ocl.losses.ReconstructionLoss
    loss_type: mse
    input_path: object_decoder.reconstruction
    target_path: feature_extractor.features
  contrastive_loss_lang:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: lang_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
  contrastive_loss_point:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: point_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
visualizations:
  input:
    _target_: routed.ocl.visualizations.Image
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
  masks:
    _target_: routed.ocl.visualizations.Mask
    mask_path: object_decoder.masks_as_image
  pred_segmentation:
    _target_: routed.ocl.visualizations.Segmentation
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
  pred_segmentation_with_text:
    _target_: routed.ocl.visualizations.SegmentationWithText
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
    gt_masks_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    text_path: input.name
    bbox_centroids_path: input.all_bbox_centroids
trainer:
  _target_: pytorch_lightning.trainer.trainer.Trainer
  accelerator: auto
  strategy: auto
  devices: 1
  num_nodes: 1
  precision: null
  logger: null
  callbacks: ${oc.dict.values:experiment.callbacks}
  fast_dev_run: false
  max_epochs: -1
  min_epochs: null
  max_steps: 500000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 5000
  check_val_every_n_epoch: null
  num_sanity_val_steps: null
  log_every_n_steps: 100
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
training_vis_frequency: 10000
training_metrics:
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
evaluation_metrics:
  binding_hits:
    _target_: routed.ocl.metrics.BindingHits
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: false
  instance_ari:
    _target_: routed.ocl.metrics.ARIMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    foreground: false
    convert_target_one_hot: true
    ignore_overlaps: true
  instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  gt_matched_instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
load_checkpoint: null
load_checkpoint_partial: null
modules_to_load: null
trainable_models: null
seed: null
experiment:
  callbacks: {}
  checkpoint_every_n_steps: 1000
  image_size: 224
  mask_size: ${.image_size}
  batch_size_per_gpu: 128
  base_learning_rate: 0.0004
  max_num_binds: 7
  slot_dim: 256
  num_slots: 7
  timm_model: vit_small_patch14_dinov2.lvd142m
  feature_dim: '${timm_model_dim: ${.timm_model}}'
  num_patches: '${timm_model_num_patches: ${.timm_model}, ${.image_size}}'
  num_patches_per_side: '${isqrt: ${.num_patches}}'
  patch_size: '${timm_model_patch_size: ${.timm_model}}'
  total_batch_size: '${mul: ${trainer.devices}, ${.batch_size_per_gpu}}'
  total_lr: '${eval: ''a * (b / 64)**0.5'', ${.base_learning_rate}, ${.total_batch_size}}'
  mapping_lr: '${mul: 0.1, ${.total_lr}}'
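The experiment block ends with a square-root learning-rate scaling rule: total_lr = base_learning_rate * (total_batch_size / 64)**0.5, so with 1 device and 128 samples per GPU the run trains at 0.0004 * sqrt(2) ≈ 5.66e-4, and mapping_lr at one tenth of that. The mul, eval, isqrt, and timm_model_* resolvers are custom to the training codebase; the stand-ins below are assumptions inferred from how they are used above (in particular, that eval binds its extra arguments as a and b):

from omegaconf import OmegaConf

# Stand-in resolvers (assumed behavior, not the codebase's implementations).
OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
OmegaConf.register_new_resolver(
    "eval", lambda expr, a, b: eval(expr, {}, {"a": a, "b": b})
)

cfg = OmegaConf.create(
    {
        "trainer": {"devices": 1},
        "experiment": {
            "batch_size_per_gpu": 128,
            "base_learning_rate": 0.0004,
            "total_batch_size": "${mul: ${trainer.devices}, ${.batch_size_per_gpu}}",
            "total_lr": "${eval: 'a * (b / 64)**0.5', ${.base_learning_rate}, ${.total_batch_size}}",
            "mapping_lr": "${mul: 0.1, ${.total_lr}}",
        },
    }
)

print(cfg.experiment.total_batch_size)  # 128
print(cfg.experiment.total_lr)          # ≈ 0.000566 (0.0004 * sqrt(128 / 64))
print(cfg.experiment.mapping_lr)        # ≈ 0.0000566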
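The two DiagonalContrastiveLoss entries (weight 0.2, temperature 0.1) pull each projected slot toward its matched language or bbox-centroid embedding, with contrastive_loss_mask selecting the valid pairs. The ocl implementation is not shown in this file; as a purely hypothetical illustration, a "diagonal" objective of this shape is typically a symmetric InfoNCE with positives on the diagonal of the slot/control similarity matrix:

import torch
import torch.nn.functional as F

def diagonal_contrastive(x1, x2, mask, temp=0.1):
    """Generic sketch: x1, x2 are (N, D) matched embeddings (slots flattened
    across the batch); mask is an (N,) bool tensor marking valid pairs."""
    x1 = F.normalize(x1[mask], dim=-1)
    x2 = F.normalize(x2[mask], dim=-1)
    logits = x1 @ x2.t() / temp           # (M, M) similarity matrix
    targets = torch.arange(x1.shape[0])   # positives sit on the diagonal
    # Symmetric cross-entropy: slots -> controls and controls -> slots.
    return 0.5 * (F.cross_entropy(logits, targets)
                  + F.cross_entropy(logits.t(), targets))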