WHB139426 committed on
Commit
db045e8
·
verified ·
1 Parent(s): a5ec89d

Upload folder using huggingface_hub

Browse files
Phi-3.5-vision-instruct/config.json ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Phi-3.5-vision-instruct",
3
+ "architectures": [
4
+ "Phi3VForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_phi3_v.Phi3VConfig",
9
+ "AutoModelForCausalLM": "modeling_phi3_v.Phi3VForCausalLM"
10
+ },
11
+ "bos_token_id": 1,
12
+ "embd_layer": {
13
+ "embedding_cls": "image",
14
+ "hd_transform_order": "sub_glb",
15
+ "projection_cls": "mlp",
16
+ "use_hd_transform": true,
17
+ "with_learnable_separator": true
18
+ },
19
+ "embd_pdrop": 0.0,
20
+ "eos_token_id": 2,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 3072,
23
+ "img_processor": {
24
+ "image_dim_out": 1024,
25
+ "model_name": "openai/clip-vit-large-patch14-336",
26
+ "name": "clip_vision_model",
27
+ "num_img_tokens": 144
28
+ },
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 8192,
31
+ "max_position_embeddings": 131072,
32
+ "model_type": "phi3_v",
33
+ "num_attention_heads": 32,
34
+ "num_hidden_layers": 32,
35
+ "num_key_value_heads": 32,
36
+ "original_max_position_embeddings": 4096,
37
+ "pad_token_id": 32000,
38
+ "resid_pdrop": 0.0,
39
+ "rms_norm_eps": 1e-05,
40
+ "rope_scaling": {
41
+ "long_factor": [
42
+ 1.0800000429153442,
43
+ 1.1100000143051147,
44
+ 1.1399999856948853,
45
+ 1.340000033378601,
46
+ 1.5899999141693115,
47
+ 1.600000023841858,
48
+ 1.6200000047683716,
49
+ 2.620000123977661,
50
+ 3.2300000190734863,
51
+ 3.2300000190734863,
52
+ 4.789999961853027,
53
+ 7.400000095367432,
54
+ 7.700000286102295,
55
+ 9.09000015258789,
56
+ 12.199999809265137,
57
+ 17.670000076293945,
58
+ 24.46000099182129,
59
+ 28.57000160217285,
60
+ 30.420001983642578,
61
+ 30.840002059936523,
62
+ 32.590003967285156,
63
+ 32.93000411987305,
64
+ 42.320003509521484,
65
+ 44.96000289916992,
66
+ 50.340003967285156,
67
+ 50.45000457763672,
68
+ 57.55000305175781,
69
+ 57.93000411987305,
70
+ 58.21000289916992,
71
+ 60.1400032043457,
72
+ 62.61000442504883,
73
+ 62.62000274658203,
74
+ 62.71000289916992,
75
+ 63.1400032043457,
76
+ 63.1400032043457,
77
+ 63.77000427246094,
78
+ 63.93000411987305,
79
+ 63.96000289916992,
80
+ 63.970001220703125,
81
+ 64.02999877929688,
82
+ 64.06999969482422,
83
+ 64.08000183105469,
84
+ 64.12000274658203,
85
+ 64.41000366210938,
86
+ 64.4800033569336,
87
+ 64.51000213623047,
88
+ 64.52999877929688,
89
+ 64.83999633789062
90
+ ],
91
+ "short_factor": [
92
+ 1.08,
93
+ 1.1,
94
+ 1.1300000000000001,
95
+ 1.2800000000000002,
96
+ 1.3100000000000003,
97
+ 1.4500000000000004,
98
+ 1.4500000000000004,
99
+ 1.9500000000000008,
100
+ 2.030000000000001,
101
+ 2.4299999999999926,
102
+ 2.5699999999999896,
103
+ 2.9499999999999815,
104
+ 3.729999999999965,
105
+ 3.869999999999962,
106
+ 4.189999999999955,
107
+ 4.43999999999995,
108
+ 4.6399999999999455,
109
+ 4.979999999999938,
110
+ 5.159999999999934,
111
+ 5.279999999999932,
112
+ 5.759999999999922,
113
+ 5.889999999999919,
114
+ 5.889999999999919,
115
+ 5.969999999999917,
116
+ 6.089999999999915,
117
+ 6.2799999999999105,
118
+ 6.7699999999999,
119
+ 6.8899999999998975,
120
+ 7.109999999999893,
121
+ 7.129999999999892,
122
+ 7.179999999999891,
123
+ 7.289999999999889,
124
+ 7.339999999999888,
125
+ 7.559999999999883,
126
+ 7.619999999999882,
127
+ 7.69999999999988,
128
+ 7.879999999999876,
129
+ 7.879999999999876,
130
+ 7.879999999999876,
131
+ 7.939999999999875,
132
+ 7.949999999999875,
133
+ 7.979999999999874,
134
+ 8.19999999999987,
135
+ 8.439999999999864,
136
+ 8.469999999999864,
137
+ 8.589999999999861,
138
+ 8.809999999999857,
139
+ 8.999999999999853
140
+ ],
141
+ "type": "su"
142
+ },
143
+ "rope_theta": 10000.0,
144
+ "sliding_window": 262144,
145
+ "tie_word_embeddings": false,
146
+ "torch_dtype": "bfloat16",
147
+ "transformers_version": "4.38.1",
148
+ "use_cache": true,
149
+ "vocab_size": 32064,
150
+ "_attn_implementation": "flash_attention_2"
151
+ }
Phi-3.5-vision-instruct/configuration_phi3_v.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ Phi-3-V model configuration"""
17
+
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+ PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26
+ "microsoft/Phi-3-vision-128k-instruct": "https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/config.json",
27
+ "microsoft/Phi-3.5-vision-instruct": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct/resolve/main/config.json",
28
+ }
29
+
30
+
31
+ class Phi3VConfig(PretrainedConfig):
32
+ r"""
33
+ This is the configuration class to store the configuration of a [`Phi3VModel`]. It is used to instantiate a Phi-3
34
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
35
+ defaults will yield a similar configuration to that of the
36
+ [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct).
37
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
38
+ documentation from [`PretrainedConfig`] for more information.
39
+ Args:
40
+ vocab_size (`int`, *optional*, defaults to 32064):
41
+ Vocabulary size of the Phi-3-V model. Defines the number of different tokens that can be represented by the
42
+ `input_ids` passed when calling [`Phi3VModel`].
43
+ hidden_size (`int`, *optional*, defaults to 3072):
44
+ Dimension of the hidden representations.
45
+ intermediate_size (`int`, *optional*, defaults to 8192):
46
+ Dimension of the MLP representations.
47
+ num_hidden_layers (`int`, *optional*, defaults to 32):
48
+ Number of hidden layers in the Transformer decoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 32):
50
+ Number of attention heads for each attention layer in the Transformer decoder.
51
+ num_key_value_heads (`int`, *optional*):
52
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
53
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
54
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
55
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
56
+ by meanpooling all the original heads within that group. For more details check out [this
57
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
58
+ `num_attention_heads`.
59
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
60
+ Dropout probability for mlp outputs.
61
+ embd_pdrop (`float`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the embeddings.
63
+ attention_dropout (`float`, *optional*, defaults to 0.0):
64
+ The dropout ratio after computing the attention scores.
65
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
66
+ The non-linear activation function (function or string) in the decoder.
67
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
68
+ The maximum sequence length that this model might ever be used with.
69
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
70
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
71
+ original RoPE embeddings when using long scaling.
72
+ initializer_range (`float`, *optional*, defaults to 0.02):
73
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
74
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
75
+ The epsilon value used for the RMSNorm.
76
+ use_cache (`bool`, *optional*, defaults to `True`):
77
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
78
+ relevant if `config.is_decoder=True`.
79
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
80
+ Whether to tie weight embeddings
81
+ rope_theta (`float`, *optional*, defaults to 10000.0):
82
+ The base period of the RoPE embeddings.
83
+ rope_scaling (`dict`, *optional*):
84
+ The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
85
+ contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
86
+ the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
87
+ divided by the number of attention heads divided by 2.
88
+ bos_token_id (`int`, *optional*, defaults to 1):
89
+ The id of the "beginning-of-sequence" token.
90
+ eos_token_id (`int`, *optional*, defaults to 32000):
91
+ The id of the "end-of-sequence" token.
92
+ pad_token_id (`int`, *optional*, defaults to 32000):
93
+ The id of the padding token.
94
+ sliding_window (`int`, *optional*):
95
+ Sliding window attention window size. If `None`, no sliding window is applied.
96
+ embd_layer (`str`, *optional*, defaults to `"default"`):
97
+ The embedding layer to use. Can be either `"default"` or `"image"`. "default" uses the standard embedding for text.
98
+ Example:
99
+ ```python
100
+ >>> from transformers import Phi3VModel, Phi3VConfig
101
+ >>> # Initializing a Phi-3-V style configuration
102
+ >>> configuration = Phi3VConfig.from_pretrained("microsoft/Phi-3-vision-128k-instruct")
103
+ >>> # Initializing a model from the configuration
104
+ >>> model = Phi3VModel(configuration)
105
+ >>> # Accessing the model configuration
106
+ >>> configuration = model.config
107
+ ```"""
108
+
109
+ model_type = "phi3_v"
110
+ keys_to_ignore_at_inference = ["past_key_values"]
111
+
112
+ def __init__(
113
+ self,
114
+ vocab_size=32064,
115
+ hidden_size=3072,
116
+ intermediate_size=8192,
117
+ num_hidden_layers=32,
118
+ num_attention_heads=32,
119
+ num_key_value_heads=None,
120
+ resid_pdrop=0.0,
121
+ embd_pdrop=0.0,
122
+ attention_dropout=0.0,
123
+ hidden_act="silu",
124
+ max_position_embeddings=4096,
125
+ original_max_position_embeddings=4096,
126
+ initializer_range=0.02,
127
+ rms_norm_eps=1e-5,
128
+ use_cache=True,
129
+ tie_word_embeddings=False,
130
+ rope_theta=10000.0,
131
+ rope_scaling=None,
132
+ bos_token_id=1,
133
+ eos_token_id=32000,
134
+ pad_token_id=32000,
135
+ sliding_window=None,
136
+ embd_layer: str = "default",
137
+ **kwargs,
138
+ ):
139
+ self.vocab_size = vocab_size
140
+ self.hidden_size = hidden_size
141
+ self.intermediate_size = intermediate_size
142
+ self.num_hidden_layers = num_hidden_layers
143
+ self.num_attention_heads = num_attention_heads
144
+
145
+ if num_key_value_heads is None:
146
+ num_key_value_heads = num_attention_heads
147
+
148
+ self.num_key_value_heads = num_key_value_heads
149
+ self.resid_pdrop = resid_pdrop
150
+ self.embd_pdrop = embd_pdrop
151
+ self.attention_dropout = attention_dropout
152
+ self.hidden_act = hidden_act
153
+ self.max_position_embeddings = max_position_embeddings
154
+ self.original_max_position_embeddings = original_max_position_embeddings
155
+ self.initializer_range = initializer_range
156
+ self.rms_norm_eps = rms_norm_eps
157
+ self.use_cache = use_cache
158
+ self.rope_theta = rope_theta
159
+ self.rope_scaling = rope_scaling
160
+ self._rope_scaling_validation()
161
+ self.sliding_window = sliding_window
162
+ self.embd_layer = embd_layer
163
+
164
+
165
+ super().__init__(
166
+ bos_token_id=bos_token_id,
167
+ eos_token_id=eos_token_id,
168
+ pad_token_id=pad_token_id,
169
+ tie_word_embeddings=tie_word_embeddings,
170
+ **kwargs,
171
+ )
172
+
173
+ def _rope_scaling_validation(self):
174
+ """
175
+ Validate the `rope_scaling` configuration.
176
+ """
177
+ if self.rope_scaling is None:
178
+ return
179
+
180
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
181
+ raise ValueError(
182
+ "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
183
+ f"got {self.rope_scaling}"
184
+ )
185
+ rope_scaling_type = self.rope_scaling.get("type", None)
186
+ rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
187
+ rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
188
+ if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
189
+ raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
190
+ if not (
191
+ isinstance(rope_scaling_short_factor, list)
192
+ and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
193
+ ):
194
+ raise ValueError(
195
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
196
+ )
197
+ if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
198
+ raise ValueError(
199
+ f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
200
+ )
201
+ if not (
202
+ isinstance(rope_scaling_long_factor, list)
203
+ and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
204
+ ):
205
+ raise ValueError(
206
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
207
+ )
208
+ if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
209
+ raise ValueError(
210
+ f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
211
+ )