ZhifengKong's picture
upload
92740f3
raw
history blame contribute delete
597 Bytes
# TEXT ENCODER CONFIG
text_model: 'gpt2'
transformer_embed_dim: 768
freeze_text_encoder_weights: True
# AUDIO ENCODER CONFIG
audioenc_name: 'HTSAT'
out_emb: 768
sampling_rate: 44100
duration: 7
fmin: 50
fmax: 8000
n_fft: 1024
hop_size: 320
mel_bins: 64
window_size: 1024
# PROJECTION SPACE CONFIG
d_proj: 1024
temperature: 0.003
# TRAINING AND EVALUATION CONFIG
batch_size: 128
num_classes: 527
# CLAPCAP CONFIG
clapcap_model: 'ClapCaption'
text_decoder: 'gpt2'
prefix_length: 40
prefix_length_clip: 40
mapping_type: 'transformer'
num_layers: 8
normalize_prefix: True
freeze_gpt_weights: True