{ "add_all_multimodal_tokens": true, "add_lm_head": true, "align_text_to_audio": false, "architectures": [ "LlavaModel" ], "audio_adapter": { "add_cross_attention": true, "attention_probs_dropout_prob": 0.1, "compress_factor": 2, "cross_attention_every_n_layers": 1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "hop_size": 0, "initializer_range": 0.02, "input_dim": 1024, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "model_type": "qformer", "num_attention_heads": 12, "num_hidden_layers": 4, "num_queries": 1, "output_dim": 8192, "transformers_version": "4.45.0", "triplet_loss": false, "window_size_in_seconds": 0.3333333333333 }, "audio_encoder": { "_name_or_path": "facebook/seamless-m4t-v2-large", "activation_dropout": 0.0, "activation_function": "relu", "adaptor_dropout": 0.1, "adaptor_kernel_size": 8, "adaptor_stride": 8, "add_adapter": true, "architectures": [ "SeamlessM4Tv2SpeechEncoder" ], "attention_dropout": 0.1, "bos_token_id": 2, "char_vocab_size": 10943, "conv_depthwise_kernel_size": 31, "decoder_attention_heads": 16, "decoder_ffn_dim": 8192, "decoder_layerdrop": 0.05, "decoder_layers": 24, "decoder_start_token_id": 3, "dropout": 0.1, "encoder_attention_heads": 16, "encoder_ffn_dim": 8192, "encoder_layerdrop": 0.05, "encoder_layers": 24, "eos_token_id": 3, "feature_projection_input_dim": 160, "hidden_size": 1024, "initializer_range": 0.02, "is_encoder_decoder": true, "lang_embed_dim": 256, "layer_norm_eps": 1e-05, "leaky_relu_slope": 0.1, "left_max_position_embeddings": 64, "max_new_tokens": 256, "max_position_embeddings": 4096, "model_type": "seamless_m4t_v2", "num_adapter_layers": 1, "num_attention_heads": 16, "num_hidden_layers": 24, "pad_token_id": 0, "position_embeddings_type": "relative_key", "resblock_dilation_sizes": [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ], "resblock_kernel_sizes": [ 3, 7, 11 ], "right_max_position_embeddings": 8, "sampling_rate": 16000, "scale_embedding": true, "speech_encoder_attention_heads": 16, "speech_encoder_chunk_size": 20000, "speech_encoder_dropout": 0.0, "speech_encoder_hidden_act": "swish", "speech_encoder_intermediate_size": 4096, "speech_encoder_layerdrop": 0.1, "speech_encoder_layers": 24, "speech_encoder_left_chunk_num": 128, "spkr_embed_dim": 256, "t2u_bos_token_id": 0, "t2u_decoder_attention_heads": 16, "t2u_decoder_ffn_dim": 8192, "t2u_decoder_layers": 6, "t2u_encoder_attention_heads": 16, "t2u_encoder_ffn_dim": 8192, "t2u_encoder_layers": 6, "t2u_eos_token_id": 2, "t2u_max_position_embeddings": 4096, "t2u_pad_token_id": 1, "t2u_variance_pred_dropout": 0.5, "t2u_variance_predictor_embed_dim": 1024, "t2u_variance_predictor_hidden_dim": 256, "t2u_variance_predictor_kernel_size": 3, "t2u_vocab_size": 10082, "torch_dtype": "float32", "transformers_version": "4.45.0", "unit_embed_dim": 1280, "unit_hifi_gan_vocab_size": 10000, "upsample_initial_channel": 512, "upsample_kernel_sizes": [ 11, 8, 8, 4, 4 ], "upsample_rates": [ 5, 4, 4, 2, 2 ], "use_cache": true, "var_pred_dropout": 0.5, "variance_predictor_kernel_size": 3, "vocab_size": 256102, "vocoder_num_langs": 36, "vocoder_num_spkrs": 200, "vocoder_offset": 4 }, "chunk_encoding_strategy": "loop", "chunk_overlap_in_seconds": 1, "chunk_size_in_seconds": 15, "codebook_weights": [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ], "conversation_version": null, "hidden_size": 8192, "mm_use_audio_start_end": false, "mm_use_im_start_end": false, "mm_use_video_start_end": false, "model_type": "llava", "text_decoder": { "_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", "architectures": [ "LlamaForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 128000, "conversation_version": "llama_3_1", "eos_token_id": [ 128001, 128008, 128009 ], "head_dim": 128, "hidden_act": "silu", "hidden_size": 8192, "initializer_range": 0.02, "intermediate_size": 28672, "max_position_embeddings": 131072, "mlp_bias": false, "model_type": "llama", "num_attention_heads": 64, "num_hidden_layers": 80, "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": { "factor": 8.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3" }, "rope_theta": 500000.0, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.45.0", "use_cache": true, "vocab_size": 128256 }, "tokenizer_padding_side": "right", "torch_dtype": "bfloat16", "transformers_version": "4.45.0", "video_adapter": { "force_input_projection": true, "force_output_projection": true, "hidden_layers": 4, "hidden_size": 4096, "input_dim": 768, "model_type": "mlp", "output_dim": 8192, "residual_type": "interpolation", "transformers_version": "4.45.0" }, "video_encoder": { "_name_or_path": "vsr_trlrs3vox2_base.pth", "a_upsample_ratio": 1, "adim": 768, "aheads": 12, "cnn_module_kernel": 31, "ctc_type": "builtin", "ddim": 768, "dheads": 12, "dlayers": 6, "dropout_rate": 0.1, "dunits": 3072, "elayers": 12, "eunits": 3072, "hidden_size": 768, "lsm_weight": 0.1, "macaron_style": true, "mtlalpha": 0.1, "rel_pos_type": "latest", "relu_type": "swish", "transformer_attn_dropout_rate": 0.1, "transformer_encoder_attn_layer_type": "rel_mha", "transformer_input_layer": "conv3d", "transformer_length_normalized_loss": false, "transformers_version": "4.45.0", "use_cnn_module": true, "zero_triu": false }, "vision_patch_merge_type": "flat", "vision_select_feature": "patch", "vision_select_layer": -1, "vision_use_patch_token": true }