File size: 1,134 Bytes
3eb682b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Configuration for an audio-captioning run on AudioCaps (see `modality`,
# `dataset_name` and the `*_split` keys further down).
#
# Conventions used in this file:
#   * Booleans are written as canonical lowercase `true` / `false`.
#   * Floats with an exponent carry an explicit decimal point (`2.0e-5`, not
#     `2e-5`): YAML 1.1 loaders such as PyYAML resolve `2e-5` as a *string*,
#     while `2.0e-5` is a float under both YAML 1.1 and YAML 1.2.

# --- Input size and batching ---
# NOTE(review): `modality` below is 'audio'; confirm whether `image_res` is
# still read for this run.
image_res: 224
batch_size_train: 8
batch_size_test: 64

# NOTE(review): presumably enables the warm-up phase described by
# `warmup_lr` / `warmup_epochs` in `schedular` — confirm against the consumer.
warm_up: true

# --- Optimizer ---
# `prompt_lr` is a separate learning rate; presumably applied to the
# prompt-tuning parameters (see `prompt_tuning` below) — TODO confirm.
optimizer:
  opt: adamW
  lr: 2.0e-5
  weight_decay: 0.02
  prompt_lr: 1.0e-5

# --- Learning-rate schedule ---
# NOTE(review): the key is spelled `schedular` (not `scheduler`). It is kept
# as-is because the consuming code presumably looks it up by this exact name;
# do not rename without updating the reader.
schedular:
  sched: cosine
  scheduler_groups: 0
  lr: 2.0e-5
  epochs: 30
  min_lr: 1.0e-6
  decay_rate: 1
  warmup_lr: 1.0e-5
  warmup_epochs: 4
  cooldown_epochs: 0

# --- Perceptual prefix / layer injection ---
# NOTE(review): `start_layer_idx` / `end_layer_idx` look like a layer range
# and `injected_hidden_states` like a count of hidden states fed into it —
# confirm semantics (inclusive/exclusive bounds) against the model code.
use_vis_prefix: true
start_layer_idx: 19
end_layer_idx: 31

injected_hidden_states: 6

# Presumably the weight of the language-modeling loss term — TODO confirm.
lm_loss_weight: 0.1

unfreeze_text_layer_norm: false
unfreeze_vision_layer_norm: false

# --- Data loading ---
num_workers: 4

# --- Tokenization / label handling ---
replace_added_tokens: true

use_cache: false

shift_labels: false

append_eos_token: true

# --- Generation ---
# With `do_sample: false` and `num_beams: 3` this presumably selects
# deterministic beam search — confirm against the generation call.
num_beams: 3
do_sample: false

# Prompt tuning
prompt_tuning: true
prompt_len: 10

# --- Dataset ---
modality: 'audio'
dataset_name: 'audiocaps'

train_split: 'audiocaps_caption_train'
val_split: 'audiocaps_caption_val'
test_split: 'audiocaps_caption_test'

# --- Audio preprocessing ---
# NOTE(review): `melbins` / `target_length` presumably give the mel-filterbank
# size and the number of frames the input is padded or cropped to; the units
# are not visible from this file — TODO confirm.
melbins: 128
target_length: 1024
num_tries: 8

# `norm_mean` / `norm_std` are presumably the input normalization statistics,
# bypassed when `skip_norm` is true — TODO confirm.
skip_norm: false
norm_mean: -4.2677393
norm_std: 4.5689974
noise: false

# NOTE(review): presumably frequency- and time-masking augmentation
# parameters — confirm whether they are mask widths or probabilities.
freqm_p: 24
timem_p: 96

all_vis_tokens: false

# --- Encoder checkpoint ---
# `pretrained_model` is an absolute path on a specific cluster filesystem;
# it must be changed to a local checkpoint path when running elsewhere.
vision_model_name: 'ast'
pretrained_model: '/gpfswork/rech/dyf/ugz83ue/.cache/torch/hub/checkpoints/audioset_10_10_0.4593.pth'