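# Hydra/OmegaConf experiment config for a language-conditioned slot-attention
# model (the `ocl` package) trained on Visual Genome ("vg") webdataset shards.
# `${...}` expressions are OmegaConf interpolations or custom resolvers
# (lambda_fn, torch_dtype, mul, eval, ...) resolved at instantiation time.
# The shard brace ranges use webdataset notation, e.g.
# shard-{000000..000303}.tar expands to 304 train shards.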
dataset:
  use_epochs: false
  num_workers: 4
  batch_size: ${experiment.batch_size_per_gpu}
  _target_: ocl.datasets.WebdatasetDataModule
  train_shards: ${oc.env:DATASET_PREFIX}/vg/train/shard-{000000..000303}.tar
  train_size: 118287
  val_shards: ${oc.env:DATASET_PREFIX}/vg/val/shard-{000000..000037}.tar
  val_size: 5000
  test_shards: ${oc.env:DATASET_PREFIX}/vg/test/shard-{000000..000037}.tar
  test_size: 40670
  use_autopadding: true
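
  # Evaluation-time preprocessing. 03a first duplicates `instance_mask` into
  # `instance_mask_v2` (CopyFields runs before SelectConditioningInfoVG, so
  # the `_v2` copy presumably keeps the full mask set for the GT-matched
  # metrics below), then selects at most `num_slots` conditioning targets.
  # 03c converts fields to tensors; images are normalized with the standard
  # ImageNet mean/std.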
  eval_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
          - _target_: ocl.preprocessing.CopyFields
            mapping:
              instance_mask: instance_mask_v2
          - _target_: ocl.preprocessing.SelectConditioningInfoVG
            num_max_binds: ${experiment.num_slots}
            num_slots: ${experiment.num_slots}
      fields:
        - image
        - instance_mask
        - instance_category
        - instance_iscrowd
        - name
        - bbox_centroids
        - name_embedding
        - selected_indices
        - contrastive_loss_mask
        - all_bbox_centroids
        - all_names
        - references
        - tokens
      batch_transform: false
    03c_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
            - '${lambda_fn:''lambda image: image.copy()''}'
            - _target_: torchvision.transforms.v2.ToImage
            - _target_: torchvision.transforms.v2.ToDtype
              dtype: ${torch_dtype:float32}
              scale: true
            - _target_: torchvision.transforms.v2.Normalize
              mean:
                - 0.485
                - 0.456
                - 0.406
              std:
                - 0.229
                - 0.224
                - 0.225
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
            - _target_: ocl.preprocessing.IntegerToOneHotMask
              output_axis: -3
            - _target_: ocl.preprocessing.AddEmptyMasksVG
            - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
            - _target_: ocl.preprocessing.IntegerToOneHotMask
              output_axis: -3
            - _target_: ocl.preprocessing.AddEmptyMasksVG
            - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
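
  # Training-time preprocessing mirrors the eval pipeline: 03a is identical
  # except that `all_bbox_centroids` is not kept, and 03b additionally
  # tensorizes the conditioning fields (name/point embeddings, selected
  # indices, contrastive mask). The `lambda ...: ....copy()` steps copy the
  # decoded arrays first, presumably to avoid torch warnings about
  # non-writable numpy buffers coming out of webdataset.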
  train_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
          - _target_: ocl.preprocessing.CopyFields
            mapping:
              instance_mask: instance_mask_v2
          - _target_: ocl.preprocessing.SelectConditioningInfoVG
            num_max_binds: ${experiment.num_slots}
            num_slots: ${experiment.num_slots}
      fields:
        - image
        - instance_mask
        - instance_category
        - instance_iscrowd
        - name
        - bbox_centroids
        - name_embedding
        - selected_indices
        - contrastive_loss_mask
        - all_names
        - references
        - tokens
      batch_transform: false
    03b_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
            - '${lambda_fn:''lambda image: image.copy()''}'
            - _target_: torchvision.transforms.v2.ToImage
            - _target_: torchvision.transforms.v2.ToDtype
              dtype: ${torch_dtype:float32}
              scale: true
            - _target_: torchvision.transforms.v2.Normalize
              mean:
                - 0.485
                - 0.456
                - 0.406
              std:
                - 0.229
                - 0.224
                - 0.225
        name_embedding:
          _target_: torchvision.transforms.Compose
          transforms:
            - '${lambda_fn:''lambda name_embedding: name_embedding.copy()''}'
            - _target_: ocl.preprocessing.ToTensor
        bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
            - '${lambda_fn:''lambda bbox_centroids: bbox_centroids.copy()''}'
            - _target_: ocl.preprocessing.ToTensor
        all_bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
            - '${lambda_fn:''lambda all_bbox_centroids: all_bbox_centroids.copy()''}'
            - _target_: ocl.preprocessing.ToTensor
        selected_indices:
          _target_: torchvision.transforms.Compose
          transforms:
            - '${lambda_fn:''lambda selected_indices: selected_indices.copy()''}'
            - _target_: ocl.preprocessing.ToTensor
        contrastive_loss_mask:
          _target_: torchvision.transforms.Compose
          transforms:
            - '${lambda_fn:''lambda contrastive_loss_mask: contrastive_loss_mask.copy()''}'
            - _target_: ocl.preprocessing.ToTensor
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
            - _target_: ocl.preprocessing.IntegerToOneHotMask
              output_axis: -3
            - _target_: ocl.preprocessing.AddEmptyMasksVG
            - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
            - _target_: ocl.preprocessing.IntegerToOneHotMask
              output_axis: -3
            - _target_: ocl.preprocessing.AddEmptyMasksVG
            - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
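
# Model graph. `routed.*` wrappers read their inputs from other modules'
# outputs via the `*_path` keys (`input.*` refers to batch fields; e.g.
# `feature_path: mapping` feeds the MLP-mapped ViT features to slot
# attention). The feature extractor is a frozen DINOv2 ViT; feature_level 12
# is the final block of the 12-layer ViT-S configured below.
# `${when_testing:false,true}` loads pretrained timm weights only during
# training; at test time they presumably come from the checkpoint anyway.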
models:
  feature_extractor:
    _target_: routed.ocl.feature_extractors.TimmFeatureExtractor
    model_name: ${experiment.timm_model}
    pretrained: ${when_testing:false,true}
    freeze: true
    feature_level: 12
    video_path: input.image
    dynamic_img_size: true
  mapping:
    _target_: routed.ocl.mapping.MLPMapping
    dim: ${experiment.feature_dim}
    x_path: feature_extractor
  conditioning:
    _target_: routed.ocl.conditioning.LangConditioning
    n_slots: ${experiment.num_slots}
    object_dim: ${experiment.slot_dim}
    dual_conditioning: false
    name_embedding_path: input.name_embedding
    batch_size_path: input.batch_size
    mask_path: input.contrastive_loss_mask
  perceptual_grouping:
    _target_: routed.ocl.perceptual_grouping.SlotAttentionGrouping
    feature_dim: ${.object_dim}
    object_dim: ${experiment.slot_dim}
    use_projection_bias: false
    positional_embedding:
      _target_: ocl.neural_networks.wrappers.Sequential
      _args_:
        - _target_: ocl.neural_networks.positional_embedding.DummyPositionEmbed
        - _target_: ocl.neural_networks.build_two_layer_mlp
          input_dim: ${experiment.feature_dim}
          output_dim: ${....feature_dim}
          hidden_dim: '${mul: ${experiment.feature_dim}, 2}'
          initial_layer_norm: true
    ff_mlp:
      _target_: ocl.neural_networks.build_two_layer_mlp
      input_dim: ${..object_dim}
      output_dim: ${..object_dim}
      hidden_dim: '${mul: ${..object_dim}, 4}'
      initial_layer_norm: true
      residual: true
    feature_path: mapping
    conditioning_path: conditioning
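
  # Contrastive heads: slot representations are attention-pooled over the
  # mapped features (attn_aggregation) and projected into the same 4096-d
  # space as the language and point (bbox-centroid) embeddings, so slots can
  # be matched against both kinds of control signal in the losses below.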
  attn_aggregation:
    _target_: routed.ocl.heads.AttentionAggregationHead
    dim: ${experiment.feature_dim}
    attn_path: perceptual_grouping.feature_attributions
    x_path: mapping.features
  projector_slots:
    _target_: routed.ocl.heads.SlotProjectorHead
    dim: ${experiment.feature_dim}
    embedding_dim: 4096
    slots_path: attn_aggregation
  lang_embedding:
    _target_: routed.ocl.heads.LangEmbeddingHead
    embedding_dim: 4096
    name_embedding_path: input.name_embedding
  point_embedding:
    _target_: routed.ocl.heads.PointEmbeddingHead
    embedding_dim: 4096
    point_embedding_path: input.bbox_centroids
  dec_conditioning:
    _target_: routed.ocl.decoder_conditioning.EncodeLangConditioning
    dim: ${experiment.slot_dim}
    language_path: input.name_embedding
    mask_path: input.contrastive_loss_mask
  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    decoder:
      _target_: ocl.neural_networks.build_mlp
      _partial_: true
      features:
        - 2048
        - 2048
        - 2048
    object_dim: ${experiment.slot_dim}
    output_dim: ${experiment.feature_dim}
    num_patches: ${experiment.num_patches}
    object_features_path: perceptual_grouping.objects
    image_path: input.image
    conditioned: true
    condition_info_path: dec_conditioning
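
# Optimization: a single AdamW optimizer with a 10k-step warmup followed by
# exponential decay (x0.5 every 100k steps). Two parameter groups: the
# grouping/conditioning/decoding modules train at `total_lr`, while the
# mapping and contrastive heads use `mapping_lr` (0.1x `total_lr`; see the
# experiment block at the bottom).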
optimizers:
  opt0:
    _target_: ocl.optimization.OptimizationWrapper
    optimizer:
      _target_: torch.optim.AdamW
      _partial_: true
      lr: ${experiment.total_lr}
    lr_scheduler:
      _target_: ocl.scheduling.exponential_decay_after_optional_warmup
      _partial_: true
      decay_rate: 0.5
      decay_steps: 100000
      warmup_steps: 10000
    parameter_groups:
      _target_: ocl.optimization.ParameterGroupCreator
      param_groups:
        grouping:
          params:
            - models.perceptual_grouping
            - models.conditioning
            - models.object_decoder
            - models.dec_conditioning
          lr: ${experiment.total_lr}
          weight_decay: 0.0
        encoder:
          params:
            - models.mapping
            - models.lang_embedding
            - models.point_embedding
            - models.attn_aggregation
            - models.projector_slots
          lr: ${experiment.mapping_lr}
          weight_decay: 0.0
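
# Losses: MSE reconstruction of the frozen ViT features from the decoded
# patches, plus two diagonal (positive pairs on the diagonal) contrastive
# losses with temperature 0.1 and weight 0.2 each, aligning slot projections
# with the language and point embeddings. `contrastive_loss_mask` presumably
# restricts the loss to slots that actually received a conditioning target.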
losses:
  mse:
    _target_: routed.ocl.losses.ReconstructionLoss
    loss_type: mse
    input_path: object_decoder.reconstruction
    target_path: feature_extractor.features
  contrastive_loss_lang:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: lang_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
  contrastive_loss_point:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: point_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
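
# Visualizations logged during training: the input image, predicted masks,
# and predicted segmentations (optionally overlaid with GT names and
# centroids). The Denormalize mean/std must match the Normalize statistics
# used in preprocessing above.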
visualizations:
  input:
    _target_: routed.ocl.visualizations.Image
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
        - 0.485
        - 0.456
        - 0.406
      std:
        - 0.229
        - 0.224
        - 0.225
    image_path: input.image
  masks:
    _target_: routed.ocl.visualizations.Mask
    mask_path: object_decoder.masks_as_image
  pred_segmentation:
    _target_: routed.ocl.visualizations.Segmentation
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
        - 0.485
        - 0.456
        - 0.406
      std:
        - 0.229
        - 0.224
        - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
  pred_segmentation_with_text:
    _target_: routed.ocl.visualizations.SegmentationWithText
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
        - 0.485
        - 0.456
        - 0.406
      std:
        - 0.229
        - 0.224
        - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
    gt_masks_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    text_path: input.name
    bbox_centroids_path: input.all_bbox_centroids
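
# Lightning trainer: step-based training (max_epochs: -1 defers entirely to
# max_steps = 500k), validation every 5000 steps, gradient clipping at 1.0.
# `null` values fall through to the PyTorch Lightning defaults.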
trainer:
  _target_: pytorch_lightning.trainer.trainer.Trainer
  accelerator: auto
  strategy: auto
  devices: 1
  num_nodes: 1
  precision: null
  logger: null
  callbacks: ${oc.dict.values:experiment.callbacks}
  fast_dev_run: false
  max_epochs: -1
  min_epochs: null
  max_steps: 500000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 5000
  check_val_every_n_epoch: null
  num_sanity_val_steps: null
  log_every_n_steps: 100
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
training_vis_frequency: 10000
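
# Online metrics: EmbAccMetric scores retrieval accuracy between the slot and
# language embeddings under `contrastive_loss_mask`; `sc`/`cs` presumably
# denote the slot->control and control->slot matching directions, with
# `average` their mean.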
training_metrics:
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
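
# Evaluation: unsupervised segmentation metrics (ARI and best-overlap mask
# IoU, i.e. mBO) plus BindingHits, which, judging by its inputs, checks
# whether each conditioned slot binds to the instance it was conditioned on
# (hence `selected_indices`). The embedding-accuracy metrics repeat the
# training_metrics so they are also reported at eval time.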
evaluation_metrics:
  binding_hits:
    _target_: routed.ocl.metrics.BindingHits
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: false
  instance_ari:
    _target_: routed.ocl.metrics.ARIMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    foreground: false
    convert_target_one_hot: true
    ignore_overlaps: true
  instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  gt_matched_instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
load_checkpoint: null
load_checkpoint_partial: null
modules_to_load: null
trainable_models: null
seed: null
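
# Experiment-level constants. The custom resolvers derive the model geometry
# from the timm model name: for vit_small_patch14_dinov2.lvd142m at 224 px
# this should resolve to feature_dim = 384, patch_size = 14,
# num_patches = (224/14)^2 = 256 and num_patches_per_side = 16. The learning
# rate uses square-root batch-size scaling: with 1 device,
# total_batch_size = 1 * 128 = 128 and
# total_lr = 0.0004 * (128/64)**0.5 ~= 5.66e-4, so mapping_lr ~= 5.66e-5.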
experiment:
  callbacks: {}
  checkpoint_every_n_steps: 1000
  image_size: 224
  mask_size: ${.image_size}
  batch_size_per_gpu: 128
  base_learning_rate: 0.0004
  max_num_binds: 7
  slot_dim: 256
  num_slots: 7
  timm_model: vit_small_patch14_dinov2.lvd142m
  feature_dim: '${timm_model_dim: ${.timm_model}}'
  num_patches: '${timm_model_num_patches: ${.timm_model}, ${.image_size}}'
  num_patches_per_side: '${isqrt: ${.num_patches}}'
  patch_size: '${timm_model_patch_size: ${.timm_model}}'
  total_batch_size: '${mul: ${trainer.devices}, ${.batch_size_per_gpu}}'
  total_lr: '${eval: ''a * (b / 64)**0.5'', ${.base_learning_rate}, ${.total_batch_size}}'
  mapping_lr: '${mul: 0.1, ${.total_lr}}'