cyrilvallez (HF staff) committed (verified)
Commit 9a490d7 · Parent: 6cf9696

Upload folder using huggingface_hub

chat_template.json ADDED
@@ -0,0 +1,3 @@
+{
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}"
+}
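As a quick check, the Jinja template above can be rendered with `apply_chat_template`. A minimal sketch, assuming a hypothetical local checkout of this repository at `./Phi-4-multimodal-instruct`:

```python
from transformers import AutoTokenizer

# Hypothetical local path to this repository; substitute the actual repo id.
tok = AutoTokenizer.from_pretrained("./Phi-4-multimodal-instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Transcribe the audio clip. <|audio_1|>"},
]

# Each message renders as <|role|>content<|end|>; add_generation_prompt appends <|assistant|>.
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```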
config.json CHANGED
@@ -1,82 +1,43 @@
 {
-  "_name_or_path": "Phi-4-multimodal-instruct",
   "architectures": [
-    "Phi4MMForCausalLM"
+    "Phi4MultimodalForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
-  "audio_processor": {
-    "config": {
-      "activation": "swish",
-      "activation_checkpointing": {
-        "interval": 1,
-        "module": "transformer",
-        "offload": false
-      },
-      "attention_dim": 1024,
-      "attention_heads": 16,
-      "batch_norm": false,
-      "bias_in_glu": true,
-      "causal": true,
-      "chunk_size": -1,
-      "cnn_layer_norm": true,
-      "conv_activation": "swish",
-      "conv_glu_type": "swish",
-      "depthwise_multiplier": 1,
-      "depthwise_seperable_out_channel": 1024,
-      "dropout_rate": 0.0,
-      "encoder_embedding_config": {
-        "input_size": 80
-      },
-      "ext_pw_kernel_size": 1,
-      "ext_pw_out_channel": 1024,
-      "input_layer": "nemo_conv",
-      "input_size": 80,
-      "kernel_size": 3,
-      "left_chunk": 18,
-      "linear_units": 1536,
-      "nemo_conv_settings": {
-        "conv_channels": 1024
-      },
-      "num_blocks": 24,
-      "relative_attention_bias_args": {
-        "t5_bias_max_distance": 500,
-        "type": "t5"
-      },
-      "time_reduction": 8
-    },
-    "name": "cascades"
-  },
-  "auto_map": {
-    "AutoConfig": "configuration_phi4mm.Phi4MMConfig",
-    "AutoModelForCausalLM": "modeling_phi4mm.Phi4MMForCausalLM",
-    "AutoTokenizer": "Xenova/gpt-4o"
+  "audio_config": {
+    "activation": "swish",
+    "audio_token_id": 200011,
+    "bias_max_distance": 500,
+    "bias_symmetric": false,
+    "chunk_size": -1,
+    "conv_activation": "swish",
+    "conv_glu_type": "swish",
+    "depthwise_multiplier": 1,
+    "depthwise_seperable_out_channel": 1024,
+    "downsample_rate": 1,
+    "dropout_rate": 0.0,
+    "ext_pw_out_channel": 1024,
+    "feature_layer": -2,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "input_size": 80,
+    "intermediate_size": 1536,
+    "kernel_size": 3,
+    "left_chunk": 18,
+    "model_type": "phi4_multimodal_audio",
+    "nemo_activation": "relu",
+    "nemo_conv_channels": 1024,
+    "nemo_final_size": 10,
+    "num_attention_heads": 16,
+    "num_blocks": 24,
+    "time_reduction": 8
   },
   "bos_token_id": 199999,
-  "embd_layer": {
-    "audio_embd_layer": {
-      "compression_rate": 8,
-      "downsample_rate": 1,
-      "embedding_cls": "audio",
-      "enable_gradient_checkpointing": true,
-      "projection_cls": "mlp",
-      "use_conv_downsample": false,
-      "use_qformer": false
-    },
-    "embedding_cls": "image_audio",
-    "image_embd_layer": {
-      "crop_size": 448,
-      "embedding_cls": "tune_image",
-      "enable_gradient_checkpointing": true,
-      "hd_transform_order": "sub_glb",
-      "image_token_compression_cls": "avg_pool_2d",
-      "projection_cls": "mlp",
-      "use_hd_transform": true,
-      "with_learnable_separator": true
-    }
-  },
   "embd_pdrop": 0.0,
-  "eos_token_id": 199999,
+  "eos_token_id": [
+    199999,
+    200020
+  ],
   "full_attn_mod": 1,
   "hidden_act": "silu",
   "hidden_size": 3072,
@@ -84,21 +45,9 @@
   "intermediate_size": 8192,
   "interpolate_factor": 1,
   "lm_head_bias": false,
-  "vision_lora": {
-    "dp": 0.0,
-    "layer": "layers.*((self_attn\\.(qkv_proj|o_proj))|(mlp\\.(gate_up|down)_proj))",
-    "lora_alpha": 512,
-    "r": 256
-  },
-  "speech_lora": {
-    "dp": 0.01,
-    "layer": "((layers.*self_attn\\.(qkv|o)_proj)|(layers.*mlp\\.(gate_up|down)_proj))",
-    "lora_alpha": 640,
-    "r": 320
-  },
   "max_position_embeddings": 131072,
   "mlp_bias": false,
-  "model_type": "phi4mm",
+  "model_type": "phi4_multimodal",
   "num_attention_heads": 24,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
@@ -214,8 +163,23 @@
   "sliding_window": 262144,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.46.1",
+  "transformers_version": "4.51.0.dev0",
   "use_cache": true,
-  "vocab_size": 200064,
-  "_attn_implementation": "flash_attention_2"
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "crop_size": 448,
+    "feature_layer": -2,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 448,
+    "image_token_id": 200010,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "phi4_multimodal_vision",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14
+  },
+  "vocab_size": 200064
 }
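The key change above is that the monolithic remote-code config (`auto_map`, `audio_processor`, `embd_layer`, `*_lora`) is replaced by the native `phi4_multimodal` layout with nested `audio_config` and `vision_config`. A minimal sketch of inspecting it, assuming a transformers release that ships `phi4_multimodal` and a hypothetical local checkout:

```python
from transformers import AutoConfig

# Hypothetical local path; substitute the actual repo id.
cfg = AutoConfig.from_pretrained("./Phi-4-multimodal-instruct")

print(cfg.model_type)                       # phi4_multimodal
print(cfg.audio_config.hidden_size)         # 1024
print(cfg.vision_config.num_hidden_layers)  # 27
```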
generation_config.json CHANGED
@@ -2,10 +2,9 @@
   "_from_model_config": true,
   "bos_token_id": 199999,
   "eos_token_id": [
-    200020,
-    199999
+    199999,
+    200020
   ],
   "pad_token_id": 199999,
-  "transformers_version": "4.46.1",
-  "use_cache": true
+  "transformers_version": "4.51.0.dev0"
 }
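A small sketch of reading the updated stopping criteria, again over a hypothetical local checkout:

```python
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("./Phi-4-multimodal-instruct")  # hypothetical path
print(gen.eos_token_id)  # [199999, 200020]; generation stops on either id
```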
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57b93f5d0c9422c0b76b68119660187989bd8bb47848994376be3ac53eb61a95
+size 4903637712
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967
+size 4584575136
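These entries are Git LFS pointers: the `oid` is the SHA-256 of the actual shard and `size` its byte length. A stdlib-only sketch for checking a downloaded shard against the recorded digest (the local file path is an assumption):

```python
import hashlib

def sha256_of(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):  # stream the file in 1 MiB chunks
            h.update(block)
    return h.hexdigest()

expected = "fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967"
print(sha256_of("model-00002-of-00002.safetensors") == expected)
```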
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -1,14 +1,18 @@
 {
-  "auto_map": {
-    "AutoProcessor": "processing_phi4mm.Phi4MMProcessor",
-    "AutoImageProcessor": "processing_phi4mm.Phi4MMImageProcessor",
-    "AutoFeatureExtractor": "processing_phi4mm.Phi4MMAudioFeatureExtractor"
-  },
-  "image_processor_type": "Phi4MMImageProcessor",
-  "processor_class": "Phi4MMProcessor",
-  "feature_extractor_type": "Phi4MMAudioFeatureExtractor",
   "audio_compression_rate": 8,
   "audio_downsample_rate": 1,
   "audio_feat_stride": 1,
-  "dynamic_hd": 36
+  "dynamic_hd": 36,
+  "feature_extractor_type": "Phi4MultimodalFeatureExtractor",
+  "feature_size": 80,
+  "hop_length": 160,
+  "image_processor_type": "Phi4MMImageProcessor",
+  "n_fft": 512,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "preemphasis": 0.97,
+  "processor_class": "Phi4MultimodalProcessor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000,
+  "win_length": 400
 }
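The audio front-end parameters (16 kHz input, 80 mel bins, `n_fft=512`, `hop_length=160`, `win_length=400`) now live in the feature extractor config rather than remote code. A minimal sketch, assuming a hypothetical local checkout and the usual feature-extractor call convention:

```python
import numpy as np
from transformers import AutoFeatureExtractor

fe = AutoFeatureExtractor.from_pretrained("./Phi-4-multimodal-instruct")  # hypothetical path
audio = np.zeros(16000, dtype=np.float32)  # one second of silence as a placeholder signal
features = fe(audio, sampling_rate=16000, return_tensors="pt")
print(list(features.keys()))
```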
processor_config.json CHANGED
@@ -1,6 +1,5 @@
 {
-  "auto_map": {
-    "AutoProcessor": "processing_phi4mm.Phi4MMProcessor"
-  },
-  "processor_class": "Phi4MMProcessor"
+  "fake_audio_token_pattern": "<\\|audio_\\d+\\|>",
+  "fake_image_token_pattern": "<\\|image_\\d+\\|>",
+  "processor_class": "Phi4MultimodalProcessor"
 }
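The two patterns above are plain regexes (the extra backslashes are JSON escaping) used to spot positional placeholders such as `<|image_1|>` or `<|audio_2|>` in prompts. A quick illustration:

```python
import re

fake_image = re.compile(r"<\|image_\d+\|>")  # decoded form of "<\\|image_\\d+\\|>"
fake_audio = re.compile(r"<\|audio_\d+\|>")

print(bool(fake_image.fullmatch("<|image_1|>")))   # True
print(bool(fake_audio.fullmatch("<|audio_12|>")))  # True
```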
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|endoftext|>",
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "<|endoftext|>",
     "lstrip": false,
speech-lora/adapter_config.json CHANGED
@@ -1,23 +1,29 @@
 {
+  "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "TBA",
+  "base_model_name_or_path": null,
   "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
   "fan_in_fan_out": false,
-  "inference_mode": true,
+  "inference_mode": false,
   "init_lora_weights": true,
+  "layer_replication": null,
   "layers_pattern": null,
   "layers_to_transform": null,
+  "loftq_config": {},
   "lora_alpha": 640,
+  "lora_bias": false,
   "lora_dropout": 0.01,
-  "modules_to_save": [],
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
   "peft_type": "LORA",
   "r": 320,
+  "rank_pattern": {},
   "revision": null,
-  "target_modules": [
-    "qkv_proj",
-    "o_proj",
-    "gate_up_proj",
-    "down_proj"
-  ],
-  "task_type": "CAUSAL_LM"
+  "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
 }
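The adapter now targets modules via a regex over full module names instead of a list of suffixes, and carries the newer PEFT fields. A sketch of the equivalent `LoraConfig` (PEFT treats a string `target_modules` as a regex); the vision-lora adapter further down follows the same pattern with `r=256`, `lora_alpha=512`, and `lora_dropout=0.0`:

```python
from peft import LoraConfig

speech_lora = LoraConfig(
    r=320,
    lora_alpha=640,
    lora_dropout=0.01,
    bias="none",
    task_type="CAUSAL_LM",
    # Matched as a regex against full module names, e.g. model.layers.0.self_attn.qkv_proj
    target_modules=r"model.layers.\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
)
```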
speech-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c2237461a4d1f9292cd128147bd3f0f70326a48d5d79c8e0f7583b26c095b30
-size 922782296
+oid sha256:16f70b0aba566f6c30e67a11e90033453e9375d102e031cec40956a2a0e9771e
+size 922777944
tokenizer_config.json CHANGED
@@ -1,24 +1,24 @@
 {
   "add_prefix_space": false,
   "added_tokens_decoder": {
-    "200010": {
-      "content": "<|endoftext10|>",
+    "199999": {
+      "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200011": {
-      "content": "<|endoftext11|>",
+    "200010": {
+      "content": "<|endoftext10|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "199999": {
-      "content": "<|endoftext|>",
+    "200011": {
+      "content": "<|endoftext11|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -114,12 +114,19 @@
       "special": true
     }
   },
+  "audio_token": "<|endoftext11|>",
   "bos_token": "<|endoftext|>",
   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {
+    "audio_token": "<|endoftext11|>",
+    "image_token": "<|endoftext10|>"
+  },
+  "image_token": "<|endoftext10|>",
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
-  "tokenizer_class": "GPT2TokenizerFast",
+  "processor_class": "Phi4MultimodalProcessor",
+  "tokenizer_class": "GPT2Tokenizer",
   "unk_token": "<|endoftext|>"
 }
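With `extra_special_tokens` in place, the tokenizer should expose the image and audio placeholder tokens as attributes (a convention of recent transformers releases; sketched here over a hypothetical local checkout):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./Phi-4-multimodal-instruct")  # hypothetical path
print(tok.image_token, tok.convert_tokens_to_ids(tok.image_token))  # <|endoftext10|> 200010
print(tok.audio_token, tok.convert_tokens_to_ids(tok.audio_token))  # <|endoftext11|> 200011
```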
vision-lora/adapter_config.json CHANGED
@@ -1,23 +1,29 @@
 {
+  "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "TBA",
+  "base_model_name_or_path": null,
   "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
   "fan_in_fan_out": false,
-  "inference_mode": true,
+  "inference_mode": false,
   "init_lora_weights": true,
+  "layer_replication": null,
   "layers_pattern": null,
   "layers_to_transform": null,
+  "loftq_config": {},
   "lora_alpha": 512,
+  "lora_bias": false,
   "lora_dropout": 0.0,
-  "modules_to_save": [],
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
   "peft_type": "LORA",
   "r": 256,
+  "rank_pattern": {},
   "revision": null,
-  "target_modules": [
-    "qkv_proj",
-    "o_proj",
-    "gate_up_proj",
-    "down_proj"
-  ],
-  "task_type": "CAUSAL_LM"
+  "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
 }
vision-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1620b16722edf701038bf66e3cd46412c7cc5458e58df89e9f92cedb71fcbde8
-size 738232904
+oid sha256:76facf464ca0246e9f5dc409520e83764e0b73fa66fdb561526e064133728f8a
+size 738228552