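# OCL experiment configuration (Hydra/OmegaConf style): `_target_` entries are
# instantiated as Python objects and `${...}` expressions are resolved as
# interpolations; `lambda_fn`, `torch_dtype`, `mul`, `eval`, and the `timm_model_*`
# resolvers appear to be project-defined. Sections: dataset, models, optimizers,
# losses, visualizations, trainer, metrics, and experiment-level hyperparameters.
#
# Dataset: Visual Genome (vg) webdataset shards read from ${oc.env:DATASET_PREFIX},
# with auto-padding enabled (`use_autopadding: true`).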
dataset:
use_epochs: false
num_workers: 4
batch_size: ${experiment.batch_size_per_gpu}
_target_: ocl.datasets.WebdatasetDataModule
train_shards: ${oc.env:DATASET_PREFIX}/vg/train/shard-{000000..000303}.tar
train_size: 118287
val_shards: ${oc.env:DATASET_PREFIX}/vg/val/shard-{000000..000037}.tar
val_size: 5000
test_shards: ${oc.env:DATASET_PREFIX}/vg/test/shard-{000000..000037}.tar
test_size: 40670
use_autopadding: true
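# Evaluation-time transforms: copy `instance_mask` to `instance_mask_v2`, select
# conditioning info (names, bbox centroids, name embeddings, ...) for up to
# `num_slots` objects, normalize images with ImageNet statistics, and convert
# masks to one-hot tensors padded with empty masks.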
eval_transforms:
03a_preprocessing:
_target_: ocl.transforms.Map
transform:
_target_: torchvision.transforms.Compose
transforms:
- _target_: ocl.preprocessing.CopyFields
mapping:
instance_mask: instance_mask_v2
- _target_: ocl.preprocessing.SelectConditioningInfoVG
num_max_binds: ${experiment.num_slots}
num_slots: ${experiment.num_slots}
fields:
- image
- instance_mask
- instance_category
- instance_iscrowd
- name
- bbox_centroids
- name_embedding
- selected_indices
- contrastive_loss_mask
- all_bbox_centroids
- all_names
- references
- tokens
batch_transform: false
03c_preprocessing:
_target_: ocl.transforms.SimpleTransform
transforms:
image:
_target_: torchvision.transforms.Compose
transforms:
- '${lambda_fn:''lambda image: image.copy()''}'
- _target_: torchvision.transforms.v2.ToImage
- _target_: torchvision.transforms.v2.ToDtype
dtype: ${torch_dtype:float32}
scale: true
- _target_: torchvision.transforms.v2.Normalize
mean:
- 0.485
- 0.456
- 0.406
std:
- 0.229
- 0.224
- 0.225
instance_mask:
_target_: torchvision.transforms.Compose
transforms:
- _target_: ocl.preprocessing.IntegerToOneHotMask
output_axis: -3
- _target_: ocl.preprocessing.AddEmptyMasksVG
- _target_: ocl.preprocessing.DenseMaskToTensor
instance_mask_v2:
_target_: torchvision.transforms.Compose
transforms:
- _target_: ocl.preprocessing.IntegerToOneHotMask
output_axis: -3
- _target_: ocl.preprocessing.AddEmptyMasksVG
- _target_: ocl.preprocessing.DenseMaskToTensor
batch_transform: false
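# Training-time transforms mirror the evaluation pipeline; additionally, the
# conditioning arrays (name embeddings, bbox centroids, selected indices,
# contrastive-loss mask) are converted to tensors.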
train_transforms:
03a_preprocessing:
_target_: ocl.transforms.Map
transform:
_target_: torchvision.transforms.Compose
transforms:
- _target_: ocl.preprocessing.CopyFields
mapping:
instance_mask: instance_mask_v2
- _target_: ocl.preprocessing.SelectConditioningInfoVG
num_max_binds: ${experiment.num_slots}
num_slots: ${experiment.num_slots}
fields:
- image
- instance_mask
- instance_category
- instance_iscrowd
- name
- bbox_centroids
- name_embedding
- selected_indices
- contrastive_loss_mask
- all_names
- references
- tokens
batch_transform: false
03b_preprocessing:
_target_: ocl.transforms.SimpleTransform
transforms:
image:
_target_: torchvision.transforms.Compose
transforms:
- '${lambda_fn:''lambda image: image.copy()''}'
- _target_: torchvision.transforms.v2.ToImage
- _target_: torchvision.transforms.v2.ToDtype
dtype: ${torch_dtype:float32}
scale: true
- _target_: torchvision.transforms.v2.Normalize
mean:
- 0.485
- 0.456
- 0.406
std:
- 0.229
- 0.224
- 0.225
name_embedding:
_target_: torchvision.transforms.Compose
transforms:
- '${lambda_fn:''lambda name_embedding: name_embedding.copy()''}'
- _target_: ocl.preprocessing.ToTensor
bbox_centroids:
_target_: torchvision.transforms.Compose
transforms:
- '${lambda_fn:''lambda bbox_centroids: bbox_centroids.copy()''}'
- _target_: ocl.preprocessing.ToTensor
all_bbox_centroids:
_target_: torchvision.transforms.Compose
transforms:
- '${lambda_fn:''lambda all_bbox_centroids: all_bbox_centroids.copy()''}'
- _target_: ocl.preprocessing.ToTensor
selected_indices:
_target_: torchvision.transforms.Compose
transforms:
- '${lambda_fn:''lambda selected_indices: selected_indices.copy()''}'
- _target_: ocl.preprocessing.ToTensor
contrastive_loss_mask:
_target_: torchvision.transforms.Compose
transforms:
- '${lambda_fn:''lambda contrastive_loss_mask: contrastive_loss_mask.copy()''}'
- _target_: ocl.preprocessing.ToTensor
instance_mask:
_target_: torchvision.transforms.Compose
transforms:
- _target_: ocl.preprocessing.IntegerToOneHotMask
output_axis: -3
- _target_: ocl.preprocessing.AddEmptyMasksVG
- _target_: ocl.preprocessing.DenseMaskToTensor
instance_mask_v2:
_target_: torchvision.transforms.Compose
transforms:
- _target_: ocl.preprocessing.IntegerToOneHotMask
output_axis: -3
- _target_: ocl.preprocessing.AddEmptyMasksVG
- _target_: ocl.preprocessing.DenseMaskToTensor
batch_transform: false
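# Model components, wired as a routed pipeline: each `*_path` key routes a module's
# input to the output of another module or to an `input.*` field.
#   feature_extractor   - frozen DINOv2 ViT (timm) producing patch features
#   mapping             - MLP projection of those features
#   conditioning        - language-conditioned slot initialization
#   perceptual_grouping - slot attention over the mapped features
#   attn_aggregation, projector_slots, lang_embedding, point_embedding
#                       - heads projecting slots, name embeddings, and bbox
#                         centroids into a shared 4096-dim space for the
#                         contrastive losses
#   dec_conditioning, object_decoder
#                       - language-conditioned patch decoder reconstructing the
#                         backbone features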
models:
feature_extractor:
_target_: routed.ocl.feature_extractors.TimmFeatureExtractor
model_name: ${experiment.timm_model}
pretrained: ${when_testing:false,true}
freeze: true
feature_level: 12
video_path: input.image
dynamic_img_size: true
mapping:
_target_: routed.ocl.mapping.MLPMapping
dim: ${experiment.feature_dim}
x_path: feature_extractor
conditioning:
_target_: routed.ocl.conditioning.LangConditioning
n_slots: ${experiment.num_slots}
object_dim: ${experiment.slot_dim}
dual_conditioning: false
name_embedding_path: input.name_embedding
batch_size_path: input.batch_size
mask_path: input.contrastive_loss_mask
perceptual_grouping:
_target_: routed.ocl.perceptual_grouping.SlotAttentionGrouping
feature_dim: ${.object_dim}
object_dim: ${experiment.slot_dim}
use_projection_bias: false
positional_embedding:
_target_: ocl.neural_networks.wrappers.Sequential
_args_:
- _target_: ocl.neural_networks.positional_embedding.DummyPositionEmbed
- _target_: ocl.neural_networks.build_two_layer_mlp
input_dim: ${experiment.feature_dim}
output_dim: ${....feature_dim}
hidden_dim: '${mul: ${experiment.feature_dim}, 2}'
initial_layer_norm: true
ff_mlp:
_target_: ocl.neural_networks.build_two_layer_mlp
input_dim: ${..object_dim}
output_dim: ${..object_dim}
hidden_dim: '${mul: ${..object_dim}, 4}'
initial_layer_norm: true
residual: true
feature_path: mapping
conditioning_path: conditioning
attn_aggregation:
_target_: routed.ocl.heads.AttentionAggregationHead
dim: ${experiment.feature_dim}
attn_path: perceptual_grouping.feature_attributions
x_path: mapping.features
projector_slots:
_target_: routed.ocl.heads.SlotProjectorHead
dim: ${experiment.feature_dim}
embedding_dim: 4096
slots_path: attn_aggregation
lang_embedding:
_target_: routed.ocl.heads.LangEmbeddingHead
embedding_dim: 4096
name_embedding_path: input.name_embedding
point_embedding:
_target_: routed.ocl.heads.PointEmbeddingHead
embedding_dim: 4096
point_embedding_path: input.bbox_centroids
dec_conditioning:
_target_: routed.ocl.decoder_conditioning.EncodeLangConditioning
dim: ${experiment.slot_dim}
language_path: input.name_embedding
mask_path: input.contrastive_loss_mask
object_decoder:
_target_: routed.ocl.decoding.PatchDecoder
decoder:
_target_: ocl.neural_networks.build_mlp
_partial_: true
features:
- 2048
- 2048
- 2048
object_dim: ${experiment.slot_dim}
output_dim: ${experiment.feature_dim}
num_patches: ${experiment.num_patches}
object_features_path: perceptual_grouping.objects
image_path: input.image
conditioned: true
condition_info_path: dec_conditioning
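# Optimization: AdamW with exponential decay (0.5 every 100k steps) after a
# 10k-step warmup. Two parameter groups: grouping/conditioning/decoder modules at
# the full `total_lr`, mapping and contrastive heads at the smaller `mapping_lr`.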
optimizers:
opt0:
_target_: ocl.optimization.OptimizationWrapper
optimizer:
_target_: torch.optim.AdamW
_partial_: true
lr: ${experiment.total_lr}
lr_scheduler:
_target_: ocl.scheduling.exponential_decay_after_optional_warmup
_partial_: true
decay_rate: 0.5
decay_steps: 100000
warmup_steps: 10000
parameter_groups:
_target_: ocl.optimization.ParameterGroupCreator
param_groups:
grouping:
params:
- models.perceptual_grouping
- models.conditioning
- models.object_decoder
- models.dec_conditioning
lr: ${experiment.total_lr}
weight_decay: 0.0
encoder:
params:
- models.mapping
- models.lang_embedding
- models.point_embedding
- models.attn_aggregation
- models.projector_slots
lr: ${experiment.mapping_lr}
weight_decay: 0.0
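# Losses: MSE reconstruction of the frozen backbone features from the decoder
# output, plus two diagonal contrastive losses (temperature 0.1, weight 0.2 each)
# aligning projected slots with the language and point (bbox-centroid) embeddings.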
losses:
mse:
_target_: routed.ocl.losses.ReconstructionLoss
loss_type: mse
input_path: object_decoder.reconstruction
target_path: feature_extractor.features
contrastive_loss_lang:
_target_: routed.ocl.losses.DiagonalContrastiveLoss
x1_path: projector_slots
x2_path: lang_embedding
contrastive_loss_mask_path: input.contrastive_loss_mask
temp: 0.1
batch_contrastive: true
weight: 0.2
contrastive_loss_point:
_target_: routed.ocl.losses.DiagonalContrastiveLoss
x1_path: projector_slots
x2_path: point_embedding
contrastive_loss_mask_path: input.contrastive_loss_mask
temp: 0.1
batch_contrastive: true
weight: 0.2
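# Visualizations: denormalized input images, predicted decoder masks, and predicted
# segmentations, with and without text / ground-truth overlays.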
visualizations:
input:
_target_: routed.ocl.visualizations.Image
n_instances: 32
denormalization:
_target_: ocl.preprocessing.Denormalize
mean:
- 0.485
- 0.456
- 0.406
std:
- 0.229
- 0.224
- 0.225
image_path: input.image
masks:
_target_: routed.ocl.visualizations.Mask
mask_path: object_decoder.masks_as_image
pred_segmentation:
_target_: routed.ocl.visualizations.Segmentation
denormalization:
_target_: ocl.preprocessing.Denormalize
mean:
- 0.485
- 0.456
- 0.406
std:
- 0.229
- 0.224
- 0.225
image_path: input.image
mask_path: object_decoder.masks_as_image
pred_segmentation_with_text:
_target_: routed.ocl.visualizations.SegmentationWithText
n_instances: 32
denormalization:
_target_: ocl.preprocessing.Denormalize
mean:
- 0.485
- 0.456
- 0.406
std:
- 0.229
- 0.224
- 0.225
image_path: input.image
mask_path: object_decoder.masks_as_image
gt_masks_path: input.instance_mask_v2
selected_indices_path: input.selected_indices
text_path: input.name
bbox_centroids_path: input.all_bbox_centroids
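# PyTorch Lightning trainer: step-based training (max_steps 500k, max_epochs -1),
# validation every 5000 steps, gradient clipping at 1.0, and callbacks taken from
# `experiment.callbacks`.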
trainer:
_target_: pytorch_lightning.trainer.trainer.Trainer
accelerator: auto
strategy: auto
devices: 1
num_nodes: 1
precision: null
logger: null
callbacks: ${oc.dict.values:experiment.callbacks}
fast_dev_run: false
max_epochs: -1
min_epochs: null
max_steps: 500000
min_steps: null
max_time: null
limit_train_batches: null
limit_val_batches: null
limit_test_batches: null
limit_predict_batches: null
overfit_batches: 0.0
val_check_interval: 5000
check_val_every_n_epoch: null
num_sanity_val_steps: null
log_every_n_steps: 100
enable_checkpointing: null
enable_progress_bar: null
enable_model_summary: null
accumulate_grad_batches: 1
gradient_clip_val: 1.0
gradient_clip_algorithm: null
deterministic: null
benchmark: null
inference_mode: true
use_distributed_sampler: true
profiler: null
detect_anomaly: false
barebones: false
plugins: null
sync_batchnorm: false
reload_dataloaders_every_n_epochs: 0
default_root_dir: null
training_vis_frequency: 10000
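# Training metrics: embedding-matching accuracy between projected slots and
# language (control) embeddings, presumably slot-to-control (`sc`),
# control-to-slot (`cs`), and their average.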
training_metrics:
acc_sc:
_target_: routed.ocl.metrics.acc.EmbAccMetric
mode: sc
slot_emb_path: projector_slots
ctrl_emb_path: lang_embedding
mask_idx_path: input.contrastive_loss_mask
acc_cs:
_target_: routed.ocl.metrics.acc.EmbAccMetric
mode: cs
slot_emb_path: projector_slots
ctrl_emb_path: lang_embedding
mask_idx_path: input.contrastive_loss_mask
acc_avg:
_target_: routed.ocl.metrics.acc.EmbAccMetric
mode: average
slot_emb_path: projector_slots
ctrl_emb_path: lang_embedding
mask_idx_path: input.contrastive_loss_mask
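# Evaluation metrics: binding hits and mask quality of the decoder masks against
# the instance masks (ARI, mBO, ground-truth-matched mBO), plus the same
# embedding-matching accuracies as in training.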
evaluation_metrics:
binding_hits:
_target_: routed.ocl.metrics.BindingHits
prediction_path: object_decoder.masks_as_image
target_path: input.instance_mask_v2
selected_indices_path: input.selected_indices
use_threshold: false
matching: best_overlap
ignore_overlaps: false
instance_ari:
_target_: routed.ocl.metrics.ARIMetric
prediction_path: object_decoder.masks_as_image
target_path: input.instance_mask_v2
foreground: false
convert_target_one_hot: true
ignore_overlaps: true
instance_mbo:
_target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
prediction_path: object_decoder.masks_as_image
target_path: input.instance_mask
use_threshold: false
matching: best_overlap
ignore_overlaps: true
gt_matched_instance_mbo:
_target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
prediction_path: object_decoder.masks_as_image
target_path: input.instance_mask_v2
selected_indices_path: input.selected_indices
use_threshold: false
matching: best_overlap
ignore_overlaps: true
acc_sc:
_target_: routed.ocl.metrics.acc.EmbAccMetric
mode: sc
slot_emb_path: projector_slots
ctrl_emb_path: lang_embedding
mask_idx_path: input.contrastive_loss_mask
acc_cs:
_target_: routed.ocl.metrics.acc.EmbAccMetric
mode: cs
slot_emb_path: projector_slots
ctrl_emb_path: lang_embedding
mask_idx_path: input.contrastive_loss_mask
acc_avg:
_target_: routed.ocl.metrics.acc.EmbAccMetric
mode: average
slot_emb_path: projector_slots
ctrl_emb_path: lang_embedding
mask_idx_path: input.contrastive_loss_mask
load_checkpoint: null
load_checkpoint_partial: null
modules_to_load: null
trainable_models: null
seed: null
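# Experiment-level hyperparameters referenced above via ${experiment.*}: 224x224
# images, 7 slots of dim 256, a DINOv2 ViT-S/14 backbone, and a learning rate
# scaled with the square root of the total batch size
# (total_lr = base_learning_rate * sqrt(total_batch_size / 64)).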
experiment:
callbacks: {}
checkpoint_every_n_steps: 1000
image_size: 224
mask_size: ${.image_size}
batch_size_per_gpu: 128
base_learning_rate: 0.0004
max_num_binds: 7
slot_dim: 256
num_slots: 7
timm_model: vit_small_patch14_dinov2.lvd142m
feature_dim: '${timm_model_dim: ${.timm_model}}'
num_patches: '${timm_model_num_patches: ${.timm_model}, ${.image_size}}'
num_patches_per_side: '${isqrt: ${.num_patches}}'
patch_size: '${timm_model_patch_size: ${.timm_model}}'
total_batch_size: '${mul: ${trainer.devices}, ${.batch_size_per_gpu}}'
total_lr: '${eval: ''a * (b / 64)**0.5'', ${.base_learning_rate}, ${.total_batch_size}}'
mapping_lr: '${mul: 0.1, ${.total_lr}}'