andriizadaianchuk committed
Commit eb8e5cc
1 Parent(s): f4f79e3

Upload config.yaml with huggingface_hub
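For reference, this commit message matches the default that huggingface_hub's upload_file generates. A minimal sketch of such an upload; the repo_id below is a placeholder, not taken from this page:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token stored by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="config.yaml",   # local file to push
    path_in_repo="config.yaml",      # destination path inside the repo
    repo_id="username/repo-name",    # placeholder; the actual repo is not shown here
)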

Files changed (1)
  1. config.yaml +482 -0
config.yaml ADDED
@@ -0,0 +1,482 @@
dataset:
  use_epochs: false
  num_workers: 4
  batch_size: ${experiment.batch_size_per_gpu}
  _target_: ocl.datasets.WebdatasetDataModule
  train_shards: ${oc.env:DATASET_PREFIX}/vg/train/shard-{000000..000303}.tar
  train_size: 118287
  val_shards: ${oc.env:DATASET_PREFIX}/vg/val/shard-{000000..000037}.tar
  val_size: 5000
  test_shards: ${oc.env:DATASET_PREFIX}/vg/test/shard-{000000..000037}.tar
  test_size: 40670
  use_autopadding: true
  eval_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_bbox_centroids
      - all_names
      - references
      - tokens
      batch_transform: false
    03c_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
  train_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfoVG
          num_max_binds: ${experiment.num_slots}
          num_slots: ${experiment.num_slots}
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_names
      - references
      - tokens
      batch_transform: false
    03b_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        name_embedding:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda name_embedding: name_embedding.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda bbox_centroids: bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        all_bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda all_bbox_centroids: all_bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        selected_indices:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda selected_indices: selected_indices.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        contrastive_loss_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda contrastive_loss_mask: contrastive_loss_mask.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.IntegerToOneHotMask
            output_axis: -3
          - _target_: ocl.preprocessing.AddEmptyMasksVG
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
models:
  feature_extractor:
    _target_: routed.ocl.feature_extractors.TimmFeatureExtractor
    model_name: ${experiment.timm_model}
    pretrained: ${when_testing:false,true}
    freeze: true
    feature_level: 12
    video_path: input.image
    dynamic_img_size: true
  mapping:
    _target_: routed.ocl.mapping.MLPMapping
    dim: ${experiment.feature_dim}
    x_path: feature_extractor
  conditioning:
    _target_: routed.ocl.conditioning.LangConditioning
    n_slots: ${experiment.num_slots}
    object_dim: ${experiment.slot_dim}
    dual_conditioning: false
    name_embedding_path: input.name_embedding
    batch_size_path: input.batch_size
    mask_path: input.contrastive_loss_mask
  perceptual_grouping:
    _target_: routed.ocl.perceptual_grouping.SlotAttentionGrouping
    feature_dim: ${.object_dim}
    object_dim: ${experiment.slot_dim}
    use_projection_bias: false
    positional_embedding:
      _target_: ocl.neural_networks.wrappers.Sequential
      _args_:
      - _target_: ocl.neural_networks.positional_embedding.DummyPositionEmbed
      - _target_: ocl.neural_networks.build_two_layer_mlp
        input_dim: ${experiment.feature_dim}
        output_dim: ${....feature_dim}
        hidden_dim: '${mul: ${experiment.feature_dim}, 2}'
        initial_layer_norm: true
    ff_mlp:
      _target_: ocl.neural_networks.build_two_layer_mlp
      input_dim: ${..object_dim}
      output_dim: ${..object_dim}
      hidden_dim: '${mul: ${..object_dim}, 4}'
      initial_layer_norm: true
      residual: true
    feature_path: mapping
    conditioning_path: conditioning
  attn_aggregation:
    _target_: routed.ocl.heads.AttentionAggregationHead
    dim: ${experiment.feature_dim}
    attn_path: perceptual_grouping.feature_attributions
    x_path: mapping.features
  projector_slots:
    _target_: routed.ocl.heads.SlotProjectorHead
    dim: ${experiment.feature_dim}
    embedding_dim: 4096
    slots_path: attn_aggregation
  lang_embedding:
    _target_: routed.ocl.heads.LangEmbeddingHead
    embedding_dim: 4096
    name_embedding_path: input.name_embedding
  point_embedding:
    _target_: routed.ocl.heads.PointEmbeddingHead
    embedding_dim: 4096
    point_embedding_path: input.bbox_centroids
  dec_conditioning:
    _target_: routed.ocl.decoder_conditioning.EncodeLangConditioning
    dim: ${experiment.slot_dim}
    language_path: input.name_embedding
    mask_path: input.contrastive_loss_mask
  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    decoder:
      _target_: ocl.neural_networks.build_mlp
      _partial_: true
      features:
      - 2048
      - 2048
      - 2048
    object_dim: ${experiment.slot_dim}
    output_dim: ${experiment.feature_dim}
    num_patches: ${experiment.num_patches}
    object_features_path: perceptual_grouping.objects
    image_path: input.image
    conditioned: true
    condition_info_path: dec_conditioning
optimizers:
  opt0:
    _target_: ocl.optimization.OptimizationWrapper
    optimizer:
      _target_: torch.optim.AdamW
      _partial_: true
      lr: ${experiment.total_lr}
    lr_scheduler:
      _target_: ocl.scheduling.exponential_decay_after_optional_warmup
      _partial_: true
      decay_rate: 0.5
      decay_steps: 100000
      warmup_steps: 10000
    parameter_groups:
      _target_: ocl.optimization.ParameterGroupCreator
      param_groups:
        grouping:
          params:
          - models.perceptual_grouping
          - models.conditioning
          - models.object_decoder
          - models.dec_conditioning
          lr: ${experiment.total_lr}
          weight_decay: 0.0
        encoder:
          params:
          - models.mapping
          - models.lang_embedding
          - models.point_embedding
          - models.attn_aggregation
          - models.projector_slots
          lr: ${experiment.mapping_lr}
          weight_decay: 0.0
losses:
  mse:
    _target_: routed.ocl.losses.ReconstructionLoss
    loss_type: mse
    input_path: object_decoder.reconstruction
    target_path: feature_extractor.features
  contrastive_loss_lang:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: lang_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
  contrastive_loss_point:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: point_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    weight: 0.2
visualizations:
  input:
    _target_: routed.ocl.visualizations.Image
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
  masks:
    _target_: routed.ocl.visualizations.Mask
    mask_path: object_decoder.masks_as_image
  pred_segmentation:
    _target_: routed.ocl.visualizations.Segmentation
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
  pred_segmentation_with_text:
    _target_: routed.ocl.visualizations.SegmentationWithText
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
    gt_masks_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    text_path: input.name
    bbox_centroids_path: input.all_bbox_centroids
trainer:
  _target_: pytorch_lightning.trainer.trainer.Trainer
  accelerator: auto
  strategy: auto
  devices: 1
  num_nodes: 1
  precision: null
  logger: null
  callbacks: ${oc.dict.values:experiment.callbacks}
  fast_dev_run: false
  max_epochs: -1
  min_epochs: null
  max_steps: 500000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 5000
  check_val_every_n_epoch: null
  num_sanity_val_steps: null
  log_every_n_steps: 100
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
training_vis_frequency: 10000
training_metrics:
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
evaluation_metrics:
  binding_hits:
    _target_: routed.ocl.metrics.BindingHits
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: false
  instance_ari:
    _target_: routed.ocl.metrics.ARIMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    foreground: false
    convert_target_one_hot: true
    ignore_overlaps: true
  instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  gt_matched_instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: lang_embedding
    mask_idx_path: input.contrastive_loss_mask
load_checkpoint: null
load_checkpoint_partial: null
modules_to_load: null
trainable_models: null
seed: null
experiment:
  callbacks: {}
  checkpoint_every_n_steps: 1000
  image_size: 224
  mask_size: ${.image_size}
  batch_size_per_gpu: 128
  base_learning_rate: 0.0004
  max_num_binds: 7
  slot_dim: 256
  num_slots: 7
  timm_model: vit_small_patch14_dinov2.lvd142m
  feature_dim: '${timm_model_dim: ${.timm_model}}'
  num_patches: '${timm_model_num_patches: ${.timm_model}, ${.image_size}}'
  num_patches_per_side: '${isqrt: ${.num_patches}}'
  patch_size: '${timm_model_patch_size: ${.timm_model}}'
  total_batch_size: '${mul: ${trainer.devices}, ${.batch_size_per_gpu}}'
  total_lr: '${eval: ''a * (b / 64)**0.5'', ${.base_learning_rate}, ${.total_batch_size}}'
  mapping_lr: '${mul: 0.1, ${.total_lr}}'
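The experiment block ends with a square-root learning-rate scaling rule: total_lr = base_learning_rate * (total_batch_size / 64)**0.5, so with 1 device and 128 samples per GPU the run trains at 0.0004 * sqrt(2) ≈ 5.66e-4, and mapping_lr at one tenth of that. The mul, eval, isqrt, and timm_model_* resolvers are custom to the training codebase; the stand-ins below are assumptions inferred from how they are used above (in particular, that eval binds its extra arguments as a and b):

from omegaconf import OmegaConf

# Stand-in resolvers (assumed behavior, not the codebase's implementations).
OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
OmegaConf.register_new_resolver(
    "eval", lambda expr, a, b: eval(expr, {}, {"a": a, "b": b})
)

cfg = OmegaConf.create(
    {
        "trainer": {"devices": 1},
        "experiment": {
            "batch_size_per_gpu": 128,
            "base_learning_rate": 0.0004,
            "total_batch_size": "${mul: ${trainer.devices}, ${.batch_size_per_gpu}}",
            "total_lr": "${eval: 'a * (b / 64)**0.5', ${.base_learning_rate}, ${.total_batch_size}}",
            "mapping_lr": "${mul: 0.1, ${.total_lr}}",
        },
    }
)

print(cfg.experiment.total_batch_size)  # 128
print(cfg.experiment.total_lr)          # ≈ 0.000566 (0.0004 * sqrt(128 / 64))
print(cfg.experiment.mapping_lr)        # ≈ 0.0000566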
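The two DiagonalContrastiveLoss entries (weight 0.2, temperature 0.1) pull each projected slot toward its matched language or bbox-centroid embedding, with contrastive_loss_mask selecting the valid pairs. The ocl implementation is not shown in this file; as a purely hypothetical illustration, a "diagonal" objective of this shape is typically a symmetric InfoNCE with positives on the diagonal of the slot/control similarity matrix:

import torch
import torch.nn.functional as F

def diagonal_contrastive(x1, x2, mask, temp=0.1):
    """Generic sketch: x1, x2 are (N, D) matched embeddings (slots flattened
    across the batch); mask is an (N,) bool tensor marking valid pairs."""
    x1 = F.normalize(x1[mask], dim=-1)
    x2 = F.normalize(x2[mask], dim=-1)
    logits = x1 @ x2.t() / temp           # (M, M) similarity matrix
    targets = torch.arange(x1.shape[0])   # positives sit on the diagonal
    # Symmetric cross-entropy: slots -> controls and controls -> slots.
    return 0.5 * (F.cross_entropy(logits, targets)
                  + F.cross_entropy(logits.t(), targets))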