mmaaz60 committed on
Commit
8312df0
1 Parent(s): c278255

Upload folder using huggingface_hub

Browse files
mlp2x_gelu_clip_l14_336px/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
mlp2x_gelu_clip_l14_336px/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ [![CODE](https://img.shields.io/badge/GitHub-Repository-blue)](https://github.com/mbzuai-oryx/LLaVA-pp)
6
+
7
+ # Phi-3-V: Extending the Visual Capabilities of LLaVA with Phi-3
8
+
9
+ ## Repository Overview
10
+
11
+ This repository features LLaVA v1.5 trained with the Phi-3-mini-3.8B LLM. This integration aims to leverage the strengths of both models to offer advanced vision-language understanding.
12
+
13
+ ## Training Strategy
14
+ - Only the vision-to-language projector is trained. The rest of the model is frozen.
15
+ - **Note:** The repository contains only the projector weights.
16
+
17
+ ## Key Components
18
+
19
+ - **Base Large Language Model (LLM):** [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
20
+ - **Base Large Multimodal Model (LMM):** [LLaVA-v1.5](https://github.com/haotian-liu/LLaVA)
21
+
22
+ ## Training Data
23
+
24
+ - **Pretraining Dataset:** [LCS-558K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)
25
+
26
+ ## How to Download
27
+
28
+ ```
29
+ git lfs install
30
+ git clone https://huggingface.co/MBZUAI/LLaVA-Phi-3-mini-4k-instruct-pretrain
31
+ ```
32
+
33
+ ---
34
+
35
+ ## License
36
+
37
+ This project is available under the MIT License.
38
+
39
+ ## Contributions
40
+
41
+ Contributions are welcome! Please 🌟 our repository [LLaVA++](https://github.com/mbzuai-oryx/LLaVA-pp) if you find this model useful.
42
+
43
+ ---
mlp2x_gelu_clip_l14_336px/config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
3
+ "architectures": [
4
+ "Phi3ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_phi3.Phi3Config",
9
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
10
+ },
11
+ "bos_token_id": 1,
12
+ "embd_pdrop": 0.0,
13
+ "eos_token_id": 32000,
14
+ "freeze_mm_mlp_adapter": false,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 3072,
17
+ "image_aspect_ratio": "square",
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 8192,
20
+ "max_position_embeddings": 4096,
21
+ "mm_hidden_size": 1024,
22
+ "mm_patch_merge_type": "flat",
23
+ "mm_projector_lr": null,
24
+ "mm_projector_type": "mlp2x_gelu",
25
+ "mm_use_im_patch_token": false,
26
+ "mm_use_im_start_end": false,
27
+ "mm_vision_select_feature": "patch",
28
+ "mm_vision_select_layer": -2,
29
+ "mm_vision_tower": "openai/clip-vit-large-patch14-336",
30
+ "model_type": "llava_phi",
31
+ "num_attention_heads": 32,
32
+ "num_hidden_layers": 32,
33
+ "num_key_value_heads": 32,
34
+ "original_max_position_embeddings": 4096,
35
+ "pad_token_id": 32000,
36
+ "resid_pdrop": 0.0,
37
+ "rms_norm_eps": 1e-05,
38
+ "rope_scaling": null,
39
+ "rope_theta": 10000.0,
40
+ "sliding_window": 2048,
41
+ "tie_word_embeddings": false,
42
+ "tokenizer_model_max_length": 2048,
43
+ "tokenizer_padding_side": "right",
44
+ "torch_dtype": "bfloat16",
45
+ "transformers_version": "4.41.0.dev0",
46
+ "tune_mm_mlp_adapter": true,
47
+ "use_cache": true,
48
+ "use_mm_proj": true,
49
+ "vocab_size": 32064
50
+ }
mlp2x_gelu_clip_l14_336px/mm_projector.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a20ebf4e815600a640fece57d8ec658d6323188d6a5273a8c1e10063a762f3e
3
+ size 25180216
mlp2x_gelu_clip_l14_336px/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
mlp2x_gelu_internvideo2/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
3
+ "architectures": "Phi3ForCausalLM",
4
+ "attention_dropout": 0.0,
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_phi3.Phi3Config",
7
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
8
+ },
9
+ "bos_token_id": 1,
10
+ "config": {
11
+ "freeze": false,
12
+ "spatial_cluster_rate0": 64,
13
+ "spatial_cluster_rate1": 32,
14
+ "spatial_cluster_rate2": 16,
15
+ "temporal_cluster_rate": 0.0625,
16
+ "use_cluster": 0,
17
+ "vision_tune": false
18
+ },
19
+ "embd_pdrop": 0.0,
20
+ "eos_token_id": 32000,
21
+ "freeze_mm_mlp_adapter": false,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 3072,
24
+ "image_aspect_ratio": "square",
25
+ "image_grid_pinpoints": null,
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 8192,
28
+ "max_position_embeddings": 4096,
29
+ "mm_hidden_size": 1408,
30
+ "mm_projector_lr": null,
31
+ "mm_projector_type": "mlp2x_gelu",
32
+ "mm_use_box_start_end": false,
33
+ "mm_use_im_patch_token": false,
34
+ "mm_use_im_start_end": false,
35
+ "mm_vision_select_feature": "patch",
36
+ "mm_vision_select_layer": -2,
37
+ "mm_vision_tower": "OpenGVLab/InternVideo2-Stage2_1B-224p-f4/InternVideo2-stage2_1b-224p-f4.pt",
38
+ "model_type": "ChatUniVi",
39
+ "num_attention_heads": 32,
40
+ "num_hidden_layers": 32,
41
+ "num_key_value_heads": 32,
42
+ "original_max_position_embeddings": 4096,
43
+ "pad_token_id": 32000,
44
+ "resid_pdrop": 0.0,
45
+ "rms_norm_eps": 1e-05,
46
+ "rope_scaling": null,
47
+ "rope_theta": 10000.0,
48
+ "s2": false,
49
+ "s2_scales": "336,672,1008",
50
+ "sliding_window": 2048,
51
+ "tie_word_embeddings": false,
52
+ "torch_dtype": "bfloat16",
53
+ "transformers_version": "4.41.0.dev0",
54
+ "tune_mm_mlp_adapter": true,
55
+ "use_cache": true,
56
+ "use_mm_proj": true,
57
+ "vocab_size": 32064
58
+ }
mlp2x_gelu_internvideo2/mm_projector.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0decccbfce057197c5cee36bb9ea6a33b18d0c934353673b2d1fbc46d75e8305
3
+ size 2047616170