andriizadaianchuk committed
Commit 2f6fe44 · verified · 1 Parent(s): 76c0fe5

Upload config.yaml with huggingface_hub

Files changed (1):
config.yaml +505 -0
config.yaml ADDED
@@ -0,0 +1,505 @@
dataset:
  use_epochs: false
  num_workers: 4
  batch_size: ${experiment.batch_size_per_gpu}
  _target_: ocl.datasets.WebdatasetDataModule
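  # Shard lists below use brace expansion: {000001..000079} covers 79 training tar shards and
  # {000001..000014} covers 14 validation shards; ${oc.env:DATASET_PREFIX} is read from the
  # environment. Note that the test split reuses the validation shards.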
  train_shards: ${oc.env:DATASET_PREFIX}/refcocog/train/shard-{000001..000079}.tar
  train_size: 24698
  val_shards: ${oc.env:DATASET_PREFIX}/refcocog/val/shard-{000001..000014}.tar
  val_size: 4650
  test_shards: ${oc.env:DATASET_PREFIX}/refcocog/val/shard-{000001..000014}.tar
  test_size: 4650
  use_autopadding: true
  eval_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.InstanceMasksToDenseMasks
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfo
          num_max_binds: ${experiment.max_num_binds}
          num_slots: ${experiment.num_slots}
          is_refcocog: true
        - _target_: ocl.preprocessing.DropCrowdMasks
          mask_key: instance_mask
          crowd_key: instance_iscrowd
          missing_okay: true
        - _target_: ocl.preprocessing.AddSegmentationMaskFromInstanceMask
        - _target_: ocl.preprocessing.AddEmptyMasks
          mask_keys:
          - instance_mask
          - segmentation_mask
          - instance_mask_v2
        - _target_: ocl.preprocessing.DropEntries
          keys:
          - instance_category
          - instance_iscrowd
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - references
      - references_embedding
      - selected_indices
      - contrastive_loss_mask
      - all_bbox_centroids
      batch_transform: false
    03c_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
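          # Standard ImageNet channel statistics (also used in the training transforms and
          # for de-normalization in the visualizations below).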
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.DenseMaskToTensor
        segmentation_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
  train_transforms:
    03a_preprocessing:
      _target_: ocl.transforms.Map
      transform:
        _target_: torchvision.transforms.Compose
        transforms:
        - _target_: ocl.preprocessing.InstanceMasksToDenseMasks
        - _target_: ocl.preprocessing.CopyFields
          mapping:
            instance_mask: instance_mask_v2
        - _target_: ocl.preprocessing.SelectConditioningInfo
          num_max_binds: ${experiment.max_num_binds}
          num_slots: ${experiment.num_slots}
          is_refcocog: true
        - _target_: ocl.preprocessing.DropCrowdMasks
          mask_key: instance_mask
          crowd_key: instance_iscrowd
          missing_okay: true
        - _target_: ocl.preprocessing.AddSegmentationMaskFromInstanceMask
        - _target_: ocl.preprocessing.AddEmptyMasks
          mask_keys:
          - instance_mask
          - segmentation_mask
        - _target_: ocl.preprocessing.DropEntries
          keys:
          - instance_category
          - instance_iscrowd
      fields:
      - image
      - instance_mask
      - instance_category
      - instance_iscrowd
      - name
      - bbox_centroids
      - name_embedding
      - references
      - references_embedding
      - selected_indices
      - contrastive_loss_mask
      batch_transform: false
    03b_preprocessing:
      _target_: ocl.transforms.SimpleTransform
      transforms:
        image:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda image: image.copy()''}'
          - _target_: torchvision.transforms.v2.ToImage
          - _target_: torchvision.transforms.v2.ToDtype
            dtype: ${torch_dtype:float32}
            scale: true
          - _target_: torchvision.transforms.v2.Normalize
            mean:
            - 0.485
            - 0.456
            - 0.406
            std:
            - 0.229
            - 0.224
            - 0.225
        name_embedding:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda name_embedding: name_embedding.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        references_embedding:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda references_embedding: references_embedding.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda bbox_centroids: bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        all_bbox_centroids:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda all_bbox_centroids: all_bbox_centroids.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        selected_indices:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda selected_indices: selected_indices.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        contrastive_loss_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - '${lambda_fn:''lambda contrastive_loss_mask: contrastive_loss_mask.copy()''}'
          - _target_: ocl.preprocessing.ToTensor
        instance_mask:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.DenseMaskToTensor
        instance_mask_v2:
          _target_: torchvision.transforms.Compose
          transforms:
          - _target_: ocl.preprocessing.DenseMaskToTensor
      batch_transform: false
models:
  feature_extractor:
    _target_: routed.ocl.feature_extractors.TimmFeatureExtractor
    model_name: ${experiment.timm_model}
    pretrained: ${when_testing:false,true}
    freeze: true
    feature_level: 12
    video_path: input.image
    dynamic_img_size: true
  mapping:
    _target_: routed.ocl.mapping.MLPMapping
    dim: ${experiment.feature_dim}
    x_path: feature_extractor
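  # Slot initialization is conditioned on the referring-expression embeddings
  # (input.references_embedding); input.contrastive_loss_mask presumably marks which
  # entries carry a valid expression.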
  conditioning:
    _target_: routed.ocl.conditioning.LangConditioning
    n_slots: ${experiment.num_slots}
    object_dim: ${experiment.slot_dim}
    dual_conditioning: false
    name_embedding_path: input.references_embedding
    batch_size_path: input.batch_size
    mask_path: input.contrastive_loss_mask
  perceptual_grouping:
    _target_: routed.ocl.perceptual_grouping.SlotAttentionGrouping
    feature_dim: ${.object_dim}
    object_dim: ${experiment.slot_dim}
    use_projection_bias: false
    positional_embedding:
      _target_: ocl.neural_networks.wrappers.Sequential
      _args_:
      - _target_: ocl.neural_networks.positional_embedding.DummyPositionEmbed
      - _target_: ocl.neural_networks.build_two_layer_mlp
        input_dim: ${experiment.feature_dim}
        output_dim: ${....feature_dim}
        hidden_dim: '${mul: ${experiment.feature_dim}, 2}'
        initial_layer_norm: true
    ff_mlp:
      _target_: ocl.neural_networks.build_two_layer_mlp
      input_dim: ${..object_dim}
      output_dim: ${..object_dim}
      hidden_dim: '${mul: ${..object_dim}, 4}'
      initial_layer_norm: true
      residual: true
    feature_path: mapping
    conditioning_path: conditioning
  attn_aggregation:
    _target_: routed.ocl.heads.AttentionAggregationHead
    dim: ${experiment.feature_dim}
    attn_path: perceptual_grouping.feature_attributions
    x_path: mapping.features
  projector_slots:
    _target_: routed.ocl.heads.SlotProjectorHead
    dim: ${experiment.feature_dim}
    embedding_dim: 4096
    slots_path: attn_aggregation
  dual_embedding:
    _target_: routed.ocl.heads.LangEmbeddingHead
    embedding_dim: 4096
    name_embedding_path: input.references_embedding
  dec_conditioning:
    _target_: routed.ocl.decoder_conditioning.EncodeLangConditioning
    dim: ${experiment.slot_dim}
    language_path: input.references_embedding
    mask_path: input.contrastive_loss_mask
  object_decoder:
    _target_: routed.ocl.decoding.PatchDecoder
    decoder:
      _target_: ocl.neural_networks.build_mlp
      _partial_: true
      features:
      - 2048
      - 2048
      - 2048
    object_dim: ${experiment.slot_dim}
    output_dim: ${experiment.feature_dim}
    num_patches: ${experiment.num_patches}
    object_features_path: perceptual_grouping.objects
    image_path: input.image
    conditioned: true
    condition_info_path: dec_conditioning
optimizers:
  opt0:
    _target_: ocl.optimization.OptimizationWrapper
    optimizer:
      _target_: torch.optim.AdamW
      _partial_: true
      lr: ${experiment.total_lr}
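    # Assuming the usual semantics of this scheduler: linear warmup over warmup_steps,
    # then exponential decay lr * decay_rate ** (step / decay_steps), i.e. halved every 100k steps.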
    lr_scheduler:
      _target_: ocl.scheduling.exponential_decay_after_optional_warmup
      _partial_: true
      decay_rate: 0.5
      decay_steps: 100000
      warmup_steps: 10000
    parameter_groups:
      _target_: ocl.optimization.ParameterGroupCreator
      param_groups:
        grouping:
          params:
          - models.perceptual_grouping
          - models.conditioning
          - models.object_decoder
          - models.dec_conditioning
          lr: ${experiment.total_lr}
          weight_decay: 0.0
        encoder:
          params:
          - models.mapping
          - models.dual_embedding
          - models.attn_aggregation
          - models.projector_slots
          lr: ${experiment.mapping_lr}
          weight_decay: 0.0
losses:
  mse:
    _target_: routed.ocl.losses.ReconstructionLoss
    loss_type: mse
    input_path: object_decoder.reconstruction
    target_path: feature_extractor.features
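  # Presumably an InfoNCE-style objective that treats matched (slot projection, text embedding)
  # pairs on the diagonal as positives, masked by contrastive_loss_mask and weighted 0.2
  # relative to the feature-reconstruction MSE above.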
  contrastive_loss:
    _target_: routed.ocl.losses.DiagonalContrastiveLoss
    x1_path: projector_slots
    x2_path: dual_embedding
    contrastive_loss_mask_path: input.contrastive_loss_mask
    temp: 0.1
    batch_contrastive: true
    symmetric: false
    weight: 0.2
visualizations:
  input:
    _target_: routed.ocl.visualizations.Image
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
  masks:
    _target_: routed.ocl.visualizations.Mask
    mask_path: object_decoder.masks_as_image
  pred_segmentation:
    _target_: routed.ocl.visualizations.Segmentation
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
  pred_segmentation_with_text:
    _target_: routed.ocl.visualizations.SegmentationWithText
    n_instances: 32
    denormalization:
      _target_: ocl.preprocessing.Denormalize
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
    image_path: input.image
    mask_path: object_decoder.masks_as_image
    gt_masks_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    text_path: input.name
    bbox_centroids_path: input.all_bbox_centroids
trainer:
  _target_: pytorch_lightning.trainer.trainer.Trainer
  accelerator: auto
  strategy: auto
  devices: 1
  num_nodes: 1
  precision: null
  logger:
  - _target_: pytorch_lightning.loggers.TensorBoardLogger
    save_dir: .
    name: tb
    version: ''
  - _target_: pytorch_lightning.loggers.WandbLogger
    project: ${slice:${hydra:runtime.choices.experiment},"/", 1}_${slice:${hydra:runtime.choices.experiment},"/", 2}
    name: ${slice:${hydra:runtime.choices.experiment},"/","3:"}
    log_model: false
  callbacks: ${oc.dict.values:experiment.callbacks}
  fast_dev_run: false
  max_epochs: -1
  min_epochs: null
  max_steps: 200000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 2000
  check_val_every_n_epoch: null
  num_sanity_val_steps: null
  log_every_n_steps: 200
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: .
training_vis_frequency: 5000
training_metrics:
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
evaluation_metrics:
  binding_hits:
    _target_: routed.ocl.metrics.BindingHits
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: false
  instance_ari:
    _target_: routed.ocl.metrics.ARIMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    foreground: false
    convert_target_one_hot: true
    ignore_overlaps: true
  gt_matched_instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask_v2
    selected_indices_path: input.selected_indices
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  instance_mbo:
    _target_: routed.ocl.metrics.UnsupervisedMaskIoUMetric
    prediction_path: object_decoder.masks_as_image
    target_path: input.instance_mask
    use_threshold: false
    matching: best_overlap
    ignore_overlaps: true
  acc_sc:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: sc
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_cs:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: cs
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
  acc_avg:
    _target_: routed.ocl.metrics.acc.EmbAccMetric
    mode: average
    slot_emb_path: projector_slots
    ctrl_emb_path: dual_embedding
    mask_idx_path: input.contrastive_loss_mask
load_checkpoint: null
load_checkpoint_partial: null
modules_to_load: null
trainable_models: null
seed: null
experiment:
  callbacks: {}
  checkpoint_every_n_steps: 2000
  image_size: 224
  mask_size: ${.image_size}
  batch_size_per_gpu: 64
  base_learning_rate: 0.0004
  max_num_binds: 4
  slot_dim: 256
  num_slots: 4
  timm_model: vit_small_patch14_dinov2.lvd142m
  feature_dim: '${timm_model_dim: ${.timm_model}}'
  num_patches: '${timm_model_num_patches: ${.timm_model}, ${.image_size}}'
  num_patches_per_side: '${isqrt: ${.num_patches}}'
  patch_size: '${timm_model_patch_size: ${.timm_model}}'
  total_batch_size: '${mul: ${trainer.devices}, ${.batch_size_per_gpu}}'
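  # With trainer.devices = 1 and batch_size_per_gpu = 64, total_batch_size = 64, so
  # total_lr = 0.0004 * (64 / 64) ** 0.5 = 4.0e-4 and mapping_lr = 0.1 * total_lr = 4.0e-5.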
  total_lr: '${eval: ''a * (b / 64)**0.5'', ${.base_learning_rate}, ${.total_batch_size}}'
  mapping_lr: '${mul: 0.1, ${.total_lr}}'
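
Usage note: configs in this style are consumed through Hydra/OmegaConf, where each _target_ entry names a class or function to instantiate and _partial_: true defers construction. The snippet below is a minimal, self-contained sketch of that mechanism, using torch.optim.AdamW as in optimizers.opt0; it is not the project's actual training entry point, and the custom resolvers used above (lambda_fn, torch_dtype, mul, timm_model_dim, ...) must be registered by the ocl codebase before the full file can be resolved.

# Minimal sketch of Hydra's _target_ / _partial_ mechanism as used in this config.
# Assumes hydra-core and torch are installed; the project's real entry point may differ.
import torch
from hydra.utils import instantiate

# _partial_: true (as in optimizers.opt0.optimizer) yields a factory instead of an
# instance, so the training loop can pass in the parameters to optimize later.
optimizer_factory = instantiate(
    {"_target_": "torch.optim.AdamW", "_partial_": True, "lr": 4.0e-4}
)

model = torch.nn.Linear(8, 8)              # stand-in for the actual models
optimizer = optimizer_factory(model.parameters())
print(type(optimizer).__name__)            # -> AdamW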