---
# ========= system global ========== #
# NOTE(review): leading indentation was lost in extraction; nesting below is
# reconstructed — only model_name/agent_name sit under models.nextgpt, and the
# remaining keys are flat top-level settings. Confirm against the config loader.
models:
  nextgpt:
    model_name: NextGPTModel
    agent_name: DeepSpeedAgent
seed: 13
max_length: 512  # max length of the user input prompt
logging_step: 5
num_clip_tokens: 77
gen_emb_dim: 768
pretrained_ckpt_path: ../ckpt/pretrained_ckpt/
# ========= LLM ========== #
vicuna_version: 7b_v0  # [7b_v0, ]
# ========= multimodal encoder ========== #
imagebind_version: huge
# ========= text-to-image alignment tuning ========== #
n_img_tokens: 4
text_emb_to_img_layers: [-1]
num_gen_img_tokens: 4
text_fc_to_img_mode: transformer  # [qformer, transformer]
# ========= text-to-video alignment tuning ========== #
n_video_tokens: 24
text_emb_to_video_layers: [-1]
num_gen_video_tokens: 24
text_fc_to_video_mode: transformer  # [qformer, transformer]
# ========= text-to-audio alignment tuning ========== #
n_audio_tokens: 8
text_emb_to_audio_layers: [-1]
num_gen_audio_tokens: 8
text_fc_to_audio_mode: transformer  # [qformer, transformer]
# ========= image diffusion model ========== #
image_diffusion: runwayml/stable-diffusion-v1-5  # [runwayml/stable-diffusion-v1-5, stabilityai/stable-diffusion-2]
# ========= video diffusion model ========== #
video_diffusion: cerspense/zeroscope_v2_576w
# ========= audio diffusion model ========== #
audio_diffusion: cvssp/audioldm-l-full  # [cvssp/audioldm-l-full, cvssp/audioldm-s-full-v2]