---
# Training / evaluation settings for an audio captioning run on AudioCaps
# (see `dataset_name`, `modality`, and the `*_split` keys below).
#
# Formatting notes:
#   * One key per line, block style, keys kept in their original
#     (alphabetical) order so diffs stay stable.
#   * Exponent floats carry an explicit decimal point (`2.0e-05`, not
#     `2e-05`) so that YAML 1.1 loaders resolve them as floats rather
#     than as strings.

all_vis_tokens: false
append_eos_token: true
batch_size_test: 64
batch_size_train: 8
dataset_name: audiocaps
end_layer_idx: 31
freqm_p: 24
image_res: 224
injected_hidden_states: 6
lm_loss_weight: 0.1
melbins: 128
modality: audio
noise: false
# NOTE(review): presumably input normalization statistics — confirm
# against the data loader before changing.
norm_mean: -4.2677393
norm_std: 4.5689974
num_tries: 8
num_workers: 4

optimizer:
  lr: 2.0e-05
  opt: adamW
  prompt_lr: 1.0e-05
  weight_decay: 0.02

# Absolute path to a local checkpoint file; NOTE(review): looks
# environment-specific — adjust when running on another machine.
pretrained_model: /gpfswork/rech/dyf/ugz83ue/.cache/torch/hub/checkpoints/audioset_10_10_0.4593.pth
prompt_len: 10
prompt_tuning: true
replace_added_tokens: true

# NOTE(review): the key is spelled `schedular`; kept verbatim because the
# consuming code presumably looks it up under this exact name.
schedular:
  cooldown_epochs: 0
  decay_rate: 1
  epochs: 30
  # Same value as `optimizer.lr` above.
  lr: 2.0e-05
  min_lr: 1.0e-06
  sched: cosine
  scheduler_groups: 0
  warmup_epochs: 4
  warmup_lr: 1.0e-05

shift_labels: false
skip_norm: false
start_layer_idx: 19
target_length: 1024
test_split: audiocaps_caption_test
timem_p: 96
train_split: audiocaps_caption_train
unfreeze_text_layer_norm: false
unfreeze_vision_layer_norm: false
use_cache: false
use_vis_prefix: true
val_split: audiocaps_caption_val
vision_model_name: ast
warm_up: true