cyrilvallez (HF staff) committed (verified)
Commit 9a490d7 · Parent: 6cf9696

Upload folder using huggingface_hub

chat_template.json ADDED
@@ -0,0 +1,3 @@
+{
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}"
+}
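As a quick check, the Jinja template above can be rendered with `apply_chat_template`. A minimal sketch, assuming a hypothetical local checkout of this repository at `./Phi-4-multimodal-instruct`:

```python
from transformers import AutoTokenizer

# Hypothetical local path to this repository; substitute the actual repo id.
tok = AutoTokenizer.from_pretrained("./Phi-4-multimodal-instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Transcribe the audio clip. <|audio_1|>"},
]

# Each message renders as <|role|>content<|end|>; add_generation_prompt appends <|assistant|>.
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```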
config.json CHANGED
@@ -1,82 +1,43 @@
 {
-  "_name_or_path": "Phi-4-multimodal-instruct",
   "architectures": [
-    "Phi4MMForCausalLM"
+    "Phi4MultimodalForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
-  "audio_processor": {
-    "config": {
-      "activation": "swish",
-      "activation_checkpointing": {
-        "interval": 1,
-        "module": "transformer",
-        "offload": false
-      },
-      "attention_dim": 1024,
-      "attention_heads": 16,
-      "batch_norm": false,
-      "bias_in_glu": true,
-      "causal": true,
-      "chunk_size": -1,
-      "cnn_layer_norm": true,
-      "conv_activation": "swish",
-      "conv_glu_type": "swish",
-      "depthwise_multiplier": 1,
-      "depthwise_seperable_out_channel": 1024,
-      "dropout_rate": 0.0,
-      "encoder_embedding_config": {
-        "input_size": 80
-      },
-      "ext_pw_kernel_size": 1,
-      "ext_pw_out_channel": 1024,
-      "input_layer": "nemo_conv",
-      "input_size": 80,
-      "kernel_size": 3,
-      "left_chunk": 18,
-      "linear_units": 1536,
-      "nemo_conv_settings": {
-        "conv_channels": 1024
-      },
-      "num_blocks": 24,
-      "relative_attention_bias_args": {
-        "t5_bias_max_distance": 500,
-        "type": "t5"
-      },
-      "time_reduction": 8
-    },
-    "name": "cascades"
-  },
-  "auto_map": {
-    "AutoConfig": "configuration_phi4mm.Phi4MMConfig",
-    "AutoModelForCausalLM": "modeling_phi4mm.Phi4MMForCausalLM",
-    "AutoTokenizer": "Xenova/gpt-4o"
+  "audio_config": {
+    "activation": "swish",
+    "audio_token_id": 200011,
+    "bias_max_distance": 500,
+    "bias_symmetric": false,
+    "chunk_size": -1,
+    "conv_activation": "swish",
+    "conv_glu_type": "swish",
+    "depthwise_multiplier": 1,
+    "depthwise_seperable_out_channel": 1024,
+    "downsample_rate": 1,
+    "dropout_rate": 0.0,
+    "ext_pw_out_channel": 1024,
+    "feature_layer": -2,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "input_size": 80,
+    "intermediate_size": 1536,
+    "kernel_size": 3,
+    "left_chunk": 18,
+    "model_type": "phi4_multimodal_audio",
+    "nemo_activation": "relu",
+    "nemo_conv_channels": 1024,
+    "nemo_final_size": 10,
+    "num_attention_heads": 16,
+    "num_blocks": 24,
+    "time_reduction": 8
   },
   "bos_token_id": 199999,
-  "embd_layer": {
-    "audio_embd_layer": {
-      "compression_rate": 8,
-      "downsample_rate": 1,
-      "embedding_cls": "audio",
-      "enable_gradient_checkpointing": true,
-      "projection_cls": "mlp",
-      "use_conv_downsample": false,
-      "use_qformer": false
-    },
-    "embedding_cls": "image_audio",
-    "image_embd_layer": {
-      "crop_size": 448,
-      "embedding_cls": "tune_image",
-      "enable_gradient_checkpointing": true,
-      "hd_transform_order": "sub_glb",
-      "image_token_compression_cls": "avg_pool_2d",
-      "projection_cls": "mlp",
-      "use_hd_transform": true,
-      "with_learnable_separator": true
-    }
-  },
   "embd_pdrop": 0.0,
-  "eos_token_id": 199999,
+  "eos_token_id": [
+    199999,
+    200020
+  ],
   "full_attn_mod": 1,
   "hidden_act": "silu",
   "hidden_size": 3072,
@@ -84,21 +45,9 @@
   "intermediate_size": 8192,
   "interpolate_factor": 1,
   "lm_head_bias": false,
-  "vision_lora": {
-    "dp": 0.0,
-    "layer": "layers.*((self_attn\\.(qkv_proj|o_proj))|(mlp\\.(gate_up|down)_proj))",
-    "lora_alpha": 512,
-    "r": 256
-  },
-  "speech_lora": {
-    "dp": 0.01,
-    "layer": "((layers.*self_attn\\.(qkv|o)_proj)|(layers.*mlp\\.(gate_up|down)_proj))",
-    "lora_alpha": 640,
-    "r": 320
-  },
   "max_position_embeddings": 131072,
   "mlp_bias": false,
-  "model_type": "phi4mm",
+  "model_type": "phi4_multimodal",
   "num_attention_heads": 24,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
@@ -214,8 +163,23 @@
   "sliding_window": 262144,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.46.1",
+  "transformers_version": "4.51.0.dev0",
   "use_cache": true,
-  "vocab_size": 200064,
-  "_attn_implementation": "flash_attention_2"
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "crop_size": 448,
+    "feature_layer": -2,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 448,
+    "image_token_id": 200010,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "phi4_multimodal_vision",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14
+  },
+  "vocab_size": 200064
 }
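The key change above is that the monolithic remote-code config (`auto_map`, `audio_processor`, `embd_layer`, `*_lora`) is replaced by the native `phi4_multimodal` layout with nested `audio_config` and `vision_config`. A minimal sketch of inspecting it, assuming a transformers release that ships `phi4_multimodal` and a hypothetical local checkout:

```python
from transformers import AutoConfig

# Hypothetical local path; substitute the actual repo id.
cfg = AutoConfig.from_pretrained("./Phi-4-multimodal-instruct")

print(cfg.model_type)                       # phi4_multimodal
print(cfg.audio_config.hidden_size)         # 1024
print(cfg.vision_config.num_hidden_layers)  # 27
```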
generation_config.json CHANGED
@@ -2,10 +2,9 @@
   "_from_model_config": true,
   "bos_token_id": 199999,
   "eos_token_id": [
-    200020,
-    199999
+    199999,
+    200020
   ],
   "pad_token_id": 199999,
-  "transformers_version": "4.46.1",
-  "use_cache": true
+  "transformers_version": "4.51.0.dev0"
 }
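A small sketch of reading the updated stopping criteria, again over a hypothetical local checkout:

```python
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("./Phi-4-multimodal-instruct")  # hypothetical path
print(gen.eos_token_id)  # [199999, 200020]; generation stops on either id
```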
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57b93f5d0c9422c0b76b68119660187989bd8bb47848994376be3ac53eb61a95
+size 4903637712
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967
+size 4584575136
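These entries are Git LFS pointers: the `oid` is the SHA-256 of the actual shard and `size` its byte length. A stdlib-only sketch for checking a downloaded shard against the recorded digest (the local file path is an assumption):

```python
import hashlib

def sha256_of(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):  # stream the file in 1 MiB chunks
            h.update(block)
    return h.hexdigest()

expected = "fd6f60df08041b5c48afe7d7624d4de6e9d7d86162dec7a7e908a71d595e2967"
print(sha256_of("model-00002-of-00002.safetensors") == expected)
```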
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -1,14 +1,18 @@
 {
-  "auto_map": {
-    "AutoProcessor": "processing_phi4mm.Phi4MMProcessor",
-    "AutoImageProcessor": "processing_phi4mm.Phi4MMImageProcessor",
-    "AutoFeatureExtractor": "processing_phi4mm.Phi4MMAudioFeatureExtractor"
-  },
-  "image_processor_type": "Phi4MMImageProcessor",
-  "processor_class": "Phi4MMProcessor",
-  "feature_extractor_type": "Phi4MMAudioFeatureExtractor",
   "audio_compression_rate": 8,
   "audio_downsample_rate": 1,
   "audio_feat_stride": 1,
-  "dynamic_hd": 36
+  "dynamic_hd": 36,
+  "feature_extractor_type": "Phi4MultimodalFeatureExtractor",
+  "feature_size": 80,
+  "hop_length": 160,
+  "image_processor_type": "Phi4MMImageProcessor",
+  "n_fft": 512,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "preemphasis": 0.97,
+  "processor_class": "Phi4MultimodalProcessor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000,
+  "win_length": 400
 }
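The audio front-end parameters (16 kHz input, 80 mel bins, `n_fft=512`, `hop_length=160`, `win_length=400`) now live in the feature extractor config rather than remote code. A minimal sketch, assuming a hypothetical local checkout and the usual feature-extractor call convention:

```python
import numpy as np
from transformers import AutoFeatureExtractor

fe = AutoFeatureExtractor.from_pretrained("./Phi-4-multimodal-instruct")  # hypothetical path
audio = np.zeros(16000, dtype=np.float32)  # one second of silence as a placeholder signal
features = fe(audio, sampling_rate=16000, return_tensors="pt")
print(list(features.keys()))
```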
processor_config.json CHANGED
@@ -1,6 +1,5 @@
 {
-  "auto_map": {
-    "AutoProcessor": "processing_phi4mm.Phi4MMProcessor"
-  },
-  "processor_class": "Phi4MMProcessor"
+  "fake_audio_token_pattern": "<\\|audio_\\d+\\|>",
+  "fake_image_token_pattern": "<\\|image_\\d+\\|>",
+  "processor_class": "Phi4MultimodalProcessor"
 }
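The two patterns above are plain regexes (the extra backslashes are JSON escaping) used to spot positional placeholders such as `<|image_1|>` or `<|audio_2|>` in prompts. A quick illustration:

```python
import re

fake_image = re.compile(r"<\|image_\d+\|>")  # decoded form of "<\\|image_\\d+\\|>"
fake_audio = re.compile(r"<\|audio_\d+\|>")

print(bool(fake_image.fullmatch("<|image_1|>")))   # True
print(bool(fake_audio.fullmatch("<|audio_12|>")))  # True
```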
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|endoftext|>",
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "<|endoftext|>",
     "lstrip": false,
speech-lora/adapter_config.json CHANGED
@@ -1,23 +1,29 @@
 {
+  "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "TBA",
+  "base_model_name_or_path": null,
   "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
   "fan_in_fan_out": false,
-  "inference_mode": true,
+  "inference_mode": false,
   "init_lora_weights": true,
+  "layer_replication": null,
   "layers_pattern": null,
   "layers_to_transform": null,
+  "loftq_config": {},
   "lora_alpha": 640,
+  "lora_bias": false,
   "lora_dropout": 0.01,
-  "modules_to_save": [],
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
   "peft_type": "LORA",
   "r": 320,
+  "rank_pattern": {},
   "revision": null,
-  "target_modules": [
-    "qkv_proj",
-    "o_proj",
-    "gate_up_proj",
-    "down_proj"
-  ],
-  "task_type": "CAUSAL_LM"
+  "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
 }
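The adapter now targets modules via a regex over full module names instead of a list of suffixes, and carries the newer PEFT fields. A sketch of the equivalent `LoraConfig` (PEFT treats a string `target_modules` as a regex); the vision-lora adapter further down follows the same pattern with `r=256`, `lora_alpha=512`, and `lora_dropout=0.0`:

```python
from peft import LoraConfig

speech_lora = LoraConfig(
    r=320,
    lora_alpha=640,
    lora_dropout=0.01,
    bias="none",
    task_type="CAUSAL_LM",
    # Matched as a regex against full module names, e.g. model.layers.0.self_attn.qkv_proj
    target_modules=r"model.layers.\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
)
```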
speech-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c2237461a4d1f9292cd128147bd3f0f70326a48d5d79c8e0f7583b26c095b30
-size 922782296
+oid sha256:16f70b0aba566f6c30e67a11e90033453e9375d102e031cec40956a2a0e9771e
+size 922777944
tokenizer_config.json CHANGED
@@ -1,24 +1,24 @@
 {
   "add_prefix_space": false,
   "added_tokens_decoder": {
-    "200010": {
-      "content": "<|endoftext10|>",
+    "199999": {
+      "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200011": {
-      "content": "<|endoftext11|>",
+    "200010": {
+      "content": "<|endoftext10|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "199999": {
-      "content": "<|endoftext|>",
+    "200011": {
+      "content": "<|endoftext11|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -114,12 +114,19 @@
       "special": true
     }
   },
+  "audio_token": "<|endoftext11|>",
   "bos_token": "<|endoftext|>",
   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {
+    "audio_token": "<|endoftext11|>",
+    "image_token": "<|endoftext10|>"
+  },
+  "image_token": "<|endoftext10|>",
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
-  "tokenizer_class": "GPT2TokenizerFast",
+  "processor_class": "Phi4MultimodalProcessor",
+  "tokenizer_class": "GPT2Tokenizer",
   "unk_token": "<|endoftext|>"
 }
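With `extra_special_tokens` in place, the tokenizer should expose the image and audio placeholder tokens as attributes (a convention of recent transformers releases; sketched here over a hypothetical local checkout):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./Phi-4-multimodal-instruct")  # hypothetical path
print(tok.image_token, tok.convert_tokens_to_ids(tok.image_token))  # <|endoftext10|> 200010
print(tok.audio_token, tok.convert_tokens_to_ids(tok.audio_token))  # <|endoftext11|> 200011
```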
vision-lora/adapter_config.json CHANGED
@@ -1,23 +1,29 @@
 {
+  "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "TBA",
+  "base_model_name_or_path": null,
   "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
   "fan_in_fan_out": false,
-  "inference_mode": true,
+  "inference_mode": false,
   "init_lora_weights": true,
+  "layer_replication": null,
   "layers_pattern": null,
   "layers_to_transform": null,
+  "loftq_config": {},
   "lora_alpha": 512,
+  "lora_bias": false,
   "lora_dropout": 0.0,
-  "modules_to_save": [],
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
   "peft_type": "LORA",
   "r": 256,
+  "rank_pattern": {},
   "revision": null,
-  "target_modules": [
-    "qkv_proj",
-    "o_proj",
-    "gate_up_proj",
-    "down_proj"
-  ],
-  "task_type": "CAUSAL_LM"
+  "target_modules": "model.layers.\\d+.((self_attn.(qkv|o)_proj)|(mlp.(gate_up|down)_proj))",
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
 }
vision-lora/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1620b16722edf701038bf66e3cd46412c7cc5458e58df89e9f92cedb71fcbde8
-size 738232904
+oid sha256:76facf464ca0246e9f5dc409520e83764e0b73fa66fdb561526e064133728f8a
+size 738228552