yonigozlan HF Staff committed on
Commit
4e99a94
·
verified ·
1 Parent(s): f3dedac

Upload InternVLForConditionalGeneration

Browse files
config.json CHANGED
@@ -37,6 +37,7 @@
37
  "use_sliding_window": false,
38
  "vocab_size": 151674
39
  },
 
40
  "transformers_version": "4.52.0.dev0",
41
  "vision_config": {
42
  "architectures": [
 
37
  "use_sliding_window": false,
38
  "vocab_size": 151674
39
  },
40
+ "torch_dtype": "bfloat16",
41
  "transformers_version": "4.52.0.dev0",
42
  "vision_config": {
43
  "architectures": [
model-00001-of-00016.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a77dfda0aafb9258be7fc631b56049d57b5f953ebd35196a6aa5b70061bdc510
3
- size 4988693808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c15466cb27d9e0f4b8cbe985e743cc30a648694ed904c965565f17ec82b14045
3
+ size 4988563328
model-00002-of-00016.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e55689b5e2a76724462140ab0c96b4b1e29f20940b56b36a4f3e590c9c5e0008
3
- size 4937410320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7840a85f648d99c34a38a7a0f94e093e6cb4fdc62901be146a5518000560ab2
3
+ size 4937273312
model-00003-of-00016.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:906dc1df4cf30022138b2cabb2cf371a58aae3eda040ff4b725ca06309036779
3
- size 4960249904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a83f8196f754901bb00a50a29deb2279cfe49db8a0d601468d1cf5bc18994a
3
+ size 4960223800
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 76776617984
4
  },
5
  "weight_map": {
6
  "language_model.lm_head.weight": "model-00016-of-00016.safetensors",
@@ -784,7 +784,6 @@
784
  "vision_tower.embeddings.patch_embeddings.projection.bias": "model-00001-of-00016.safetensors",
785
  "vision_tower.embeddings.patch_embeddings.projection.weight": "model-00001-of-00016.safetensors",
786
  "vision_tower.embeddings.position_embeddings": "model-00001-of-00016.safetensors",
787
- "vision_tower.encoder.layer.0.attention.k_norm.bias": "model-00001-of-00016.safetensors",
788
  "vision_tower.encoder.layer.0.attention.k_norm.weight": "model-00001-of-00016.safetensors",
789
  "vision_tower.encoder.layer.0.attention.k_proj.weight": "model-00001-of-00016.safetensors",
790
  "vision_tower.encoder.layer.0.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -800,7 +799,6 @@
800
  "vision_tower.encoder.layer.0.mlp.fc1.weight": "model-00001-of-00016.safetensors",
801
  "vision_tower.encoder.layer.0.mlp.fc2.bias": "model-00001-of-00016.safetensors",
802
  "vision_tower.encoder.layer.0.mlp.fc2.weight": "model-00001-of-00016.safetensors",
803
- "vision_tower.encoder.layer.1.attention.k_norm.bias": "model-00001-of-00016.safetensors",
804
  "vision_tower.encoder.layer.1.attention.k_norm.weight": "model-00001-of-00016.safetensors",
805
  "vision_tower.encoder.layer.1.attention.k_proj.weight": "model-00001-of-00016.safetensors",
806
  "vision_tower.encoder.layer.1.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -816,7 +814,6 @@
816
  "vision_tower.encoder.layer.1.mlp.fc1.weight": "model-00001-of-00016.safetensors",
817
  "vision_tower.encoder.layer.1.mlp.fc2.bias": "model-00001-of-00016.safetensors",
818
  "vision_tower.encoder.layer.1.mlp.fc2.weight": "model-00001-of-00016.safetensors",
819
- "vision_tower.encoder.layer.10.attention.k_norm.bias": "model-00001-of-00016.safetensors",
820
  "vision_tower.encoder.layer.10.attention.k_norm.weight": "model-00001-of-00016.safetensors",
821
  "vision_tower.encoder.layer.10.attention.k_proj.weight": "model-00001-of-00016.safetensors",
822
  "vision_tower.encoder.layer.10.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -832,7 +829,6 @@
832
  "vision_tower.encoder.layer.10.mlp.fc1.weight": "model-00001-of-00016.safetensors",
833
  "vision_tower.encoder.layer.10.mlp.fc2.bias": "model-00001-of-00016.safetensors",
834
  "vision_tower.encoder.layer.10.mlp.fc2.weight": "model-00001-of-00016.safetensors",
835
- "vision_tower.encoder.layer.11.attention.k_norm.bias": "model-00001-of-00016.safetensors",
836
  "vision_tower.encoder.layer.11.attention.k_norm.weight": "model-00001-of-00016.safetensors",
837
  "vision_tower.encoder.layer.11.attention.k_proj.weight": "model-00001-of-00016.safetensors",
838
  "vision_tower.encoder.layer.11.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -848,7 +844,6 @@
848
  "vision_tower.encoder.layer.11.mlp.fc1.weight": "model-00001-of-00016.safetensors",
849
  "vision_tower.encoder.layer.11.mlp.fc2.bias": "model-00001-of-00016.safetensors",
850
  "vision_tower.encoder.layer.11.mlp.fc2.weight": "model-00001-of-00016.safetensors",
851
- "vision_tower.encoder.layer.12.attention.k_norm.bias": "model-00001-of-00016.safetensors",
852
  "vision_tower.encoder.layer.12.attention.k_norm.weight": "model-00001-of-00016.safetensors",
853
  "vision_tower.encoder.layer.12.attention.k_proj.weight": "model-00001-of-00016.safetensors",
854
  "vision_tower.encoder.layer.12.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -864,7 +859,6 @@
864
  "vision_tower.encoder.layer.12.mlp.fc1.weight": "model-00001-of-00016.safetensors",
865
  "vision_tower.encoder.layer.12.mlp.fc2.bias": "model-00001-of-00016.safetensors",
866
  "vision_tower.encoder.layer.12.mlp.fc2.weight": "model-00001-of-00016.safetensors",
867
- "vision_tower.encoder.layer.13.attention.k_norm.bias": "model-00001-of-00016.safetensors",
868
  "vision_tower.encoder.layer.13.attention.k_norm.weight": "model-00001-of-00016.safetensors",
869
  "vision_tower.encoder.layer.13.attention.k_proj.weight": "model-00001-of-00016.safetensors",
870
  "vision_tower.encoder.layer.13.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -880,7 +874,6 @@
880
  "vision_tower.encoder.layer.13.mlp.fc1.weight": "model-00001-of-00016.safetensors",
881
  "vision_tower.encoder.layer.13.mlp.fc2.bias": "model-00001-of-00016.safetensors",
882
  "vision_tower.encoder.layer.13.mlp.fc2.weight": "model-00001-of-00016.safetensors",
883
- "vision_tower.encoder.layer.14.attention.k_norm.bias": "model-00001-of-00016.safetensors",
884
  "vision_tower.encoder.layer.14.attention.k_norm.weight": "model-00001-of-00016.safetensors",
885
  "vision_tower.encoder.layer.14.attention.k_proj.weight": "model-00001-of-00016.safetensors",
886
  "vision_tower.encoder.layer.14.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -896,7 +889,6 @@
896
  "vision_tower.encoder.layer.14.mlp.fc1.weight": "model-00001-of-00016.safetensors",
897
  "vision_tower.encoder.layer.14.mlp.fc2.bias": "model-00001-of-00016.safetensors",
898
  "vision_tower.encoder.layer.14.mlp.fc2.weight": "model-00001-of-00016.safetensors",
899
- "vision_tower.encoder.layer.15.attention.k_norm.bias": "model-00001-of-00016.safetensors",
900
  "vision_tower.encoder.layer.15.attention.k_norm.weight": "model-00001-of-00016.safetensors",
901
  "vision_tower.encoder.layer.15.attention.k_proj.weight": "model-00001-of-00016.safetensors",
902
  "vision_tower.encoder.layer.15.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -912,7 +904,6 @@
912
  "vision_tower.encoder.layer.15.mlp.fc1.weight": "model-00001-of-00016.safetensors",
913
  "vision_tower.encoder.layer.15.mlp.fc2.bias": "model-00001-of-00016.safetensors",
914
  "vision_tower.encoder.layer.15.mlp.fc2.weight": "model-00001-of-00016.safetensors",
915
- "vision_tower.encoder.layer.16.attention.k_norm.bias": "model-00001-of-00016.safetensors",
916
  "vision_tower.encoder.layer.16.attention.k_norm.weight": "model-00001-of-00016.safetensors",
917
  "vision_tower.encoder.layer.16.attention.k_proj.weight": "model-00001-of-00016.safetensors",
918
  "vision_tower.encoder.layer.16.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -928,7 +919,6 @@
928
  "vision_tower.encoder.layer.16.mlp.fc1.weight": "model-00001-of-00016.safetensors",
929
  "vision_tower.encoder.layer.16.mlp.fc2.bias": "model-00001-of-00016.safetensors",
930
  "vision_tower.encoder.layer.16.mlp.fc2.weight": "model-00001-of-00016.safetensors",
931
- "vision_tower.encoder.layer.17.attention.k_norm.bias": "model-00001-of-00016.safetensors",
932
  "vision_tower.encoder.layer.17.attention.k_norm.weight": "model-00001-of-00016.safetensors",
933
  "vision_tower.encoder.layer.17.attention.k_proj.weight": "model-00001-of-00016.safetensors",
934
  "vision_tower.encoder.layer.17.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -944,7 +934,6 @@
944
  "vision_tower.encoder.layer.17.mlp.fc1.weight": "model-00001-of-00016.safetensors",
945
  "vision_tower.encoder.layer.17.mlp.fc2.bias": "model-00001-of-00016.safetensors",
946
  "vision_tower.encoder.layer.17.mlp.fc2.weight": "model-00001-of-00016.safetensors",
947
- "vision_tower.encoder.layer.18.attention.k_norm.bias": "model-00001-of-00016.safetensors",
948
  "vision_tower.encoder.layer.18.attention.k_norm.weight": "model-00001-of-00016.safetensors",
949
  "vision_tower.encoder.layer.18.attention.k_proj.weight": "model-00001-of-00016.safetensors",
950
  "vision_tower.encoder.layer.18.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -960,7 +949,6 @@
960
  "vision_tower.encoder.layer.18.mlp.fc1.weight": "model-00001-of-00016.safetensors",
961
  "vision_tower.encoder.layer.18.mlp.fc2.bias": "model-00001-of-00016.safetensors",
962
  "vision_tower.encoder.layer.18.mlp.fc2.weight": "model-00001-of-00016.safetensors",
963
- "vision_tower.encoder.layer.19.attention.k_norm.bias": "model-00001-of-00016.safetensors",
964
  "vision_tower.encoder.layer.19.attention.k_norm.weight": "model-00001-of-00016.safetensors",
965
  "vision_tower.encoder.layer.19.attention.k_proj.weight": "model-00001-of-00016.safetensors",
966
  "vision_tower.encoder.layer.19.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -976,7 +964,6 @@
976
  "vision_tower.encoder.layer.19.mlp.fc1.weight": "model-00001-of-00016.safetensors",
977
  "vision_tower.encoder.layer.19.mlp.fc2.bias": "model-00001-of-00016.safetensors",
978
  "vision_tower.encoder.layer.19.mlp.fc2.weight": "model-00001-of-00016.safetensors",
979
- "vision_tower.encoder.layer.2.attention.k_norm.bias": "model-00001-of-00016.safetensors",
980
  "vision_tower.encoder.layer.2.attention.k_norm.weight": "model-00001-of-00016.safetensors",
981
  "vision_tower.encoder.layer.2.attention.k_proj.weight": "model-00001-of-00016.safetensors",
982
  "vision_tower.encoder.layer.2.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -992,7 +979,6 @@
992
  "vision_tower.encoder.layer.2.mlp.fc1.weight": "model-00001-of-00016.safetensors",
993
  "vision_tower.encoder.layer.2.mlp.fc2.bias": "model-00001-of-00016.safetensors",
994
  "vision_tower.encoder.layer.2.mlp.fc2.weight": "model-00001-of-00016.safetensors",
995
- "vision_tower.encoder.layer.20.attention.k_norm.bias": "model-00002-of-00016.safetensors",
996
  "vision_tower.encoder.layer.20.attention.k_norm.weight": "model-00002-of-00016.safetensors",
997
  "vision_tower.encoder.layer.20.attention.k_proj.weight": "model-00001-of-00016.safetensors",
998
  "vision_tower.encoder.layer.20.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1008,7 +994,6 @@
1008
  "vision_tower.encoder.layer.20.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1009
  "vision_tower.encoder.layer.20.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1010
  "vision_tower.encoder.layer.20.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1011
- "vision_tower.encoder.layer.21.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1012
  "vision_tower.encoder.layer.21.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1013
  "vision_tower.encoder.layer.21.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1014
  "vision_tower.encoder.layer.21.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1024,7 +1009,6 @@
1024
  "vision_tower.encoder.layer.21.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1025
  "vision_tower.encoder.layer.21.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1026
  "vision_tower.encoder.layer.21.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1027
- "vision_tower.encoder.layer.22.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1028
  "vision_tower.encoder.layer.22.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1029
  "vision_tower.encoder.layer.22.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1030
  "vision_tower.encoder.layer.22.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1040,7 +1024,6 @@
1040
  "vision_tower.encoder.layer.22.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1041
  "vision_tower.encoder.layer.22.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1042
  "vision_tower.encoder.layer.22.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1043
- "vision_tower.encoder.layer.23.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1044
  "vision_tower.encoder.layer.23.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1045
  "vision_tower.encoder.layer.23.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1046
  "vision_tower.encoder.layer.23.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1056,7 +1039,6 @@
1056
  "vision_tower.encoder.layer.23.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1057
  "vision_tower.encoder.layer.23.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1058
  "vision_tower.encoder.layer.23.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1059
- "vision_tower.encoder.layer.24.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1060
  "vision_tower.encoder.layer.24.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1061
  "vision_tower.encoder.layer.24.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1062
  "vision_tower.encoder.layer.24.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1072,7 +1054,6 @@
1072
  "vision_tower.encoder.layer.24.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1073
  "vision_tower.encoder.layer.24.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1074
  "vision_tower.encoder.layer.24.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1075
- "vision_tower.encoder.layer.25.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1076
  "vision_tower.encoder.layer.25.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1077
  "vision_tower.encoder.layer.25.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1078
  "vision_tower.encoder.layer.25.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1088,7 +1069,6 @@
1088
  "vision_tower.encoder.layer.25.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1089
  "vision_tower.encoder.layer.25.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1090
  "vision_tower.encoder.layer.25.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1091
- "vision_tower.encoder.layer.26.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1092
  "vision_tower.encoder.layer.26.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1093
  "vision_tower.encoder.layer.26.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1094
  "vision_tower.encoder.layer.26.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1104,7 +1084,6 @@
1104
  "vision_tower.encoder.layer.26.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1105
  "vision_tower.encoder.layer.26.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1106
  "vision_tower.encoder.layer.26.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1107
- "vision_tower.encoder.layer.27.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1108
  "vision_tower.encoder.layer.27.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1109
  "vision_tower.encoder.layer.27.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1110
  "vision_tower.encoder.layer.27.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1120,7 +1099,6 @@
1120
  "vision_tower.encoder.layer.27.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1121
  "vision_tower.encoder.layer.27.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1122
  "vision_tower.encoder.layer.27.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1123
- "vision_tower.encoder.layer.28.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1124
  "vision_tower.encoder.layer.28.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1125
  "vision_tower.encoder.layer.28.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1126
  "vision_tower.encoder.layer.28.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1136,7 +1114,6 @@
1136
  "vision_tower.encoder.layer.28.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1137
  "vision_tower.encoder.layer.28.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1138
  "vision_tower.encoder.layer.28.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1139
- "vision_tower.encoder.layer.29.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1140
  "vision_tower.encoder.layer.29.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1141
  "vision_tower.encoder.layer.29.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1142
  "vision_tower.encoder.layer.29.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1152,7 +1129,6 @@
1152
  "vision_tower.encoder.layer.29.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1153
  "vision_tower.encoder.layer.29.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1154
  "vision_tower.encoder.layer.29.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1155
- "vision_tower.encoder.layer.3.attention.k_norm.bias": "model-00001-of-00016.safetensors",
1156
  "vision_tower.encoder.layer.3.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1157
  "vision_tower.encoder.layer.3.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1158
  "vision_tower.encoder.layer.3.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -1168,7 +1144,6 @@
1168
  "vision_tower.encoder.layer.3.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1169
  "vision_tower.encoder.layer.3.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1170
  "vision_tower.encoder.layer.3.mlp.fc2.weight": "model-00001-of-00016.safetensors",
1171
- "vision_tower.encoder.layer.30.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1172
  "vision_tower.encoder.layer.30.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1173
  "vision_tower.encoder.layer.30.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1174
  "vision_tower.encoder.layer.30.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1184,7 +1159,6 @@
1184
  "vision_tower.encoder.layer.30.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1185
  "vision_tower.encoder.layer.30.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1186
  "vision_tower.encoder.layer.30.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1187
- "vision_tower.encoder.layer.31.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1188
  "vision_tower.encoder.layer.31.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1189
  "vision_tower.encoder.layer.31.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1190
  "vision_tower.encoder.layer.31.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1200,7 +1174,6 @@
1200
  "vision_tower.encoder.layer.31.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1201
  "vision_tower.encoder.layer.31.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1202
  "vision_tower.encoder.layer.31.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1203
- "vision_tower.encoder.layer.32.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1204
  "vision_tower.encoder.layer.32.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1205
  "vision_tower.encoder.layer.32.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1206
  "vision_tower.encoder.layer.32.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1216,7 +1189,6 @@
1216
  "vision_tower.encoder.layer.32.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1217
  "vision_tower.encoder.layer.32.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1218
  "vision_tower.encoder.layer.32.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1219
- "vision_tower.encoder.layer.33.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1220
  "vision_tower.encoder.layer.33.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1221
  "vision_tower.encoder.layer.33.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1222
  "vision_tower.encoder.layer.33.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1232,7 +1204,6 @@
1232
  "vision_tower.encoder.layer.33.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1233
  "vision_tower.encoder.layer.33.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1234
  "vision_tower.encoder.layer.33.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1235
- "vision_tower.encoder.layer.34.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1236
  "vision_tower.encoder.layer.34.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1237
  "vision_tower.encoder.layer.34.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1238
  "vision_tower.encoder.layer.34.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1248,7 +1219,6 @@
1248
  "vision_tower.encoder.layer.34.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1249
  "vision_tower.encoder.layer.34.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1250
  "vision_tower.encoder.layer.34.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1251
- "vision_tower.encoder.layer.35.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1252
  "vision_tower.encoder.layer.35.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1253
  "vision_tower.encoder.layer.35.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1254
  "vision_tower.encoder.layer.35.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1264,7 +1234,6 @@
1264
  "vision_tower.encoder.layer.35.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1265
  "vision_tower.encoder.layer.35.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1266
  "vision_tower.encoder.layer.35.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1267
- "vision_tower.encoder.layer.36.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1268
  "vision_tower.encoder.layer.36.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1269
  "vision_tower.encoder.layer.36.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1270
  "vision_tower.encoder.layer.36.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1280,7 +1249,6 @@
1280
  "vision_tower.encoder.layer.36.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1281
  "vision_tower.encoder.layer.36.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1282
  "vision_tower.encoder.layer.36.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1283
- "vision_tower.encoder.layer.37.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1284
  "vision_tower.encoder.layer.37.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1285
  "vision_tower.encoder.layer.37.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1286
  "vision_tower.encoder.layer.37.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1296,7 +1264,6 @@
1296
  "vision_tower.encoder.layer.37.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1297
  "vision_tower.encoder.layer.37.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1298
  "vision_tower.encoder.layer.37.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1299
- "vision_tower.encoder.layer.38.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1300
  "vision_tower.encoder.layer.38.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1301
  "vision_tower.encoder.layer.38.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1302
  "vision_tower.encoder.layer.38.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1312,7 +1279,6 @@
1312
  "vision_tower.encoder.layer.38.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1313
  "vision_tower.encoder.layer.38.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1314
  "vision_tower.encoder.layer.38.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1315
- "vision_tower.encoder.layer.39.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1316
  "vision_tower.encoder.layer.39.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1317
  "vision_tower.encoder.layer.39.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1318
  "vision_tower.encoder.layer.39.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1328,7 +1294,6 @@
1328
  "vision_tower.encoder.layer.39.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1329
  "vision_tower.encoder.layer.39.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1330
  "vision_tower.encoder.layer.39.mlp.fc2.weight": "model-00002-of-00016.safetensors",
1331
- "vision_tower.encoder.layer.4.attention.k_norm.bias": "model-00001-of-00016.safetensors",
1332
  "vision_tower.encoder.layer.4.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1333
  "vision_tower.encoder.layer.4.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1334
  "vision_tower.encoder.layer.4.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -1344,7 +1309,6 @@
1344
  "vision_tower.encoder.layer.4.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1345
  "vision_tower.encoder.layer.4.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1346
  "vision_tower.encoder.layer.4.mlp.fc2.weight": "model-00001-of-00016.safetensors",
1347
- "vision_tower.encoder.layer.40.attention.k_norm.bias": "model-00002-of-00016.safetensors",
1348
  "vision_tower.encoder.layer.40.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1349
  "vision_tower.encoder.layer.40.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1350
  "vision_tower.encoder.layer.40.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
@@ -1360,7 +1324,6 @@
1360
  "vision_tower.encoder.layer.40.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1361
  "vision_tower.encoder.layer.40.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1362
  "vision_tower.encoder.layer.40.mlp.fc2.weight": "model-00003-of-00016.safetensors",
1363
- "vision_tower.encoder.layer.41.attention.k_norm.bias": "model-00003-of-00016.safetensors",
1364
  "vision_tower.encoder.layer.41.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1365
  "vision_tower.encoder.layer.41.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1366
  "vision_tower.encoder.layer.41.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
@@ -1376,7 +1339,6 @@
1376
  "vision_tower.encoder.layer.41.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1377
  "vision_tower.encoder.layer.41.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1378
  "vision_tower.encoder.layer.41.mlp.fc2.weight": "model-00003-of-00016.safetensors",
1379
- "vision_tower.encoder.layer.42.attention.k_norm.bias": "model-00003-of-00016.safetensors",
1380
  "vision_tower.encoder.layer.42.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1381
  "vision_tower.encoder.layer.42.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1382
  "vision_tower.encoder.layer.42.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
@@ -1392,7 +1354,6 @@
1392
  "vision_tower.encoder.layer.42.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1393
  "vision_tower.encoder.layer.42.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1394
  "vision_tower.encoder.layer.42.mlp.fc2.weight": "model-00003-of-00016.safetensors",
1395
- "vision_tower.encoder.layer.43.attention.k_norm.bias": "model-00003-of-00016.safetensors",
1396
  "vision_tower.encoder.layer.43.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1397
  "vision_tower.encoder.layer.43.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1398
  "vision_tower.encoder.layer.43.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
@@ -1408,7 +1369,6 @@
1408
  "vision_tower.encoder.layer.43.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1409
  "vision_tower.encoder.layer.43.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1410
  "vision_tower.encoder.layer.43.mlp.fc2.weight": "model-00003-of-00016.safetensors",
1411
- "vision_tower.encoder.layer.44.attention.k_norm.bias": "model-00003-of-00016.safetensors",
1412
  "vision_tower.encoder.layer.44.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1413
  "vision_tower.encoder.layer.44.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1414
  "vision_tower.encoder.layer.44.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
@@ -1424,7 +1384,6 @@
1424
  "vision_tower.encoder.layer.44.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1425
  "vision_tower.encoder.layer.44.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1426
  "vision_tower.encoder.layer.44.mlp.fc2.weight": "model-00003-of-00016.safetensors",
1427
- "vision_tower.encoder.layer.5.attention.k_norm.bias": "model-00001-of-00016.safetensors",
1428
  "vision_tower.encoder.layer.5.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1429
  "vision_tower.encoder.layer.5.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1430
  "vision_tower.encoder.layer.5.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -1440,7 +1399,6 @@
1440
  "vision_tower.encoder.layer.5.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1441
  "vision_tower.encoder.layer.5.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1442
  "vision_tower.encoder.layer.5.mlp.fc2.weight": "model-00001-of-00016.safetensors",
1443
- "vision_tower.encoder.layer.6.attention.k_norm.bias": "model-00001-of-00016.safetensors",
1444
  "vision_tower.encoder.layer.6.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1445
  "vision_tower.encoder.layer.6.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1446
  "vision_tower.encoder.layer.6.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -1456,7 +1414,6 @@
1456
  "vision_tower.encoder.layer.6.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1457
  "vision_tower.encoder.layer.6.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1458
  "vision_tower.encoder.layer.6.mlp.fc2.weight": "model-00001-of-00016.safetensors",
1459
- "vision_tower.encoder.layer.7.attention.k_norm.bias": "model-00001-of-00016.safetensors",
1460
  "vision_tower.encoder.layer.7.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1461
  "vision_tower.encoder.layer.7.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1462
  "vision_tower.encoder.layer.7.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -1472,7 +1429,6 @@
1472
  "vision_tower.encoder.layer.7.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1473
  "vision_tower.encoder.layer.7.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1474
  "vision_tower.encoder.layer.7.mlp.fc2.weight": "model-00001-of-00016.safetensors",
1475
- "vision_tower.encoder.layer.8.attention.k_norm.bias": "model-00001-of-00016.safetensors",
1476
  "vision_tower.encoder.layer.8.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1477
  "vision_tower.encoder.layer.8.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1478
  "vision_tower.encoder.layer.8.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
@@ -1488,7 +1444,6 @@
1488
  "vision_tower.encoder.layer.8.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1489
  "vision_tower.encoder.layer.8.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1490
  "vision_tower.encoder.layer.8.mlp.fc2.weight": "model-00001-of-00016.safetensors",
1491
- "vision_tower.encoder.layer.9.attention.k_norm.bias": "model-00001-of-00016.safetensors",
1492
  "vision_tower.encoder.layer.9.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1493
  "vision_tower.encoder.layer.9.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1494
  "vision_tower.encoder.layer.9.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 76776329984
4
  },
5
  "weight_map": {
6
  "language_model.lm_head.weight": "model-00016-of-00016.safetensors",
 
784
  "vision_tower.embeddings.patch_embeddings.projection.bias": "model-00001-of-00016.safetensors",
785
  "vision_tower.embeddings.patch_embeddings.projection.weight": "model-00001-of-00016.safetensors",
786
  "vision_tower.embeddings.position_embeddings": "model-00001-of-00016.safetensors",
 
787
  "vision_tower.encoder.layer.0.attention.k_norm.weight": "model-00001-of-00016.safetensors",
788
  "vision_tower.encoder.layer.0.attention.k_proj.weight": "model-00001-of-00016.safetensors",
789
  "vision_tower.encoder.layer.0.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
799
  "vision_tower.encoder.layer.0.mlp.fc1.weight": "model-00001-of-00016.safetensors",
800
  "vision_tower.encoder.layer.0.mlp.fc2.bias": "model-00001-of-00016.safetensors",
801
  "vision_tower.encoder.layer.0.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
802
  "vision_tower.encoder.layer.1.attention.k_norm.weight": "model-00001-of-00016.safetensors",
803
  "vision_tower.encoder.layer.1.attention.k_proj.weight": "model-00001-of-00016.safetensors",
804
  "vision_tower.encoder.layer.1.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
814
  "vision_tower.encoder.layer.1.mlp.fc1.weight": "model-00001-of-00016.safetensors",
815
  "vision_tower.encoder.layer.1.mlp.fc2.bias": "model-00001-of-00016.safetensors",
816
  "vision_tower.encoder.layer.1.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
817
  "vision_tower.encoder.layer.10.attention.k_norm.weight": "model-00001-of-00016.safetensors",
818
  "vision_tower.encoder.layer.10.attention.k_proj.weight": "model-00001-of-00016.safetensors",
819
  "vision_tower.encoder.layer.10.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
829
  "vision_tower.encoder.layer.10.mlp.fc1.weight": "model-00001-of-00016.safetensors",
830
  "vision_tower.encoder.layer.10.mlp.fc2.bias": "model-00001-of-00016.safetensors",
831
  "vision_tower.encoder.layer.10.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
832
  "vision_tower.encoder.layer.11.attention.k_norm.weight": "model-00001-of-00016.safetensors",
833
  "vision_tower.encoder.layer.11.attention.k_proj.weight": "model-00001-of-00016.safetensors",
834
  "vision_tower.encoder.layer.11.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
844
  "vision_tower.encoder.layer.11.mlp.fc1.weight": "model-00001-of-00016.safetensors",
845
  "vision_tower.encoder.layer.11.mlp.fc2.bias": "model-00001-of-00016.safetensors",
846
  "vision_tower.encoder.layer.11.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
847
  "vision_tower.encoder.layer.12.attention.k_norm.weight": "model-00001-of-00016.safetensors",
848
  "vision_tower.encoder.layer.12.attention.k_proj.weight": "model-00001-of-00016.safetensors",
849
  "vision_tower.encoder.layer.12.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
859
  "vision_tower.encoder.layer.12.mlp.fc1.weight": "model-00001-of-00016.safetensors",
860
  "vision_tower.encoder.layer.12.mlp.fc2.bias": "model-00001-of-00016.safetensors",
861
  "vision_tower.encoder.layer.12.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
862
  "vision_tower.encoder.layer.13.attention.k_norm.weight": "model-00001-of-00016.safetensors",
863
  "vision_tower.encoder.layer.13.attention.k_proj.weight": "model-00001-of-00016.safetensors",
864
  "vision_tower.encoder.layer.13.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
874
  "vision_tower.encoder.layer.13.mlp.fc1.weight": "model-00001-of-00016.safetensors",
875
  "vision_tower.encoder.layer.13.mlp.fc2.bias": "model-00001-of-00016.safetensors",
876
  "vision_tower.encoder.layer.13.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
877
  "vision_tower.encoder.layer.14.attention.k_norm.weight": "model-00001-of-00016.safetensors",
878
  "vision_tower.encoder.layer.14.attention.k_proj.weight": "model-00001-of-00016.safetensors",
879
  "vision_tower.encoder.layer.14.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
889
  "vision_tower.encoder.layer.14.mlp.fc1.weight": "model-00001-of-00016.safetensors",
890
  "vision_tower.encoder.layer.14.mlp.fc2.bias": "model-00001-of-00016.safetensors",
891
  "vision_tower.encoder.layer.14.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
892
  "vision_tower.encoder.layer.15.attention.k_norm.weight": "model-00001-of-00016.safetensors",
893
  "vision_tower.encoder.layer.15.attention.k_proj.weight": "model-00001-of-00016.safetensors",
894
  "vision_tower.encoder.layer.15.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
904
  "vision_tower.encoder.layer.15.mlp.fc1.weight": "model-00001-of-00016.safetensors",
905
  "vision_tower.encoder.layer.15.mlp.fc2.bias": "model-00001-of-00016.safetensors",
906
  "vision_tower.encoder.layer.15.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
907
  "vision_tower.encoder.layer.16.attention.k_norm.weight": "model-00001-of-00016.safetensors",
908
  "vision_tower.encoder.layer.16.attention.k_proj.weight": "model-00001-of-00016.safetensors",
909
  "vision_tower.encoder.layer.16.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
919
  "vision_tower.encoder.layer.16.mlp.fc1.weight": "model-00001-of-00016.safetensors",
920
  "vision_tower.encoder.layer.16.mlp.fc2.bias": "model-00001-of-00016.safetensors",
921
  "vision_tower.encoder.layer.16.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
922
  "vision_tower.encoder.layer.17.attention.k_norm.weight": "model-00001-of-00016.safetensors",
923
  "vision_tower.encoder.layer.17.attention.k_proj.weight": "model-00001-of-00016.safetensors",
924
  "vision_tower.encoder.layer.17.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
934
  "vision_tower.encoder.layer.17.mlp.fc1.weight": "model-00001-of-00016.safetensors",
935
  "vision_tower.encoder.layer.17.mlp.fc2.bias": "model-00001-of-00016.safetensors",
936
  "vision_tower.encoder.layer.17.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
937
  "vision_tower.encoder.layer.18.attention.k_norm.weight": "model-00001-of-00016.safetensors",
938
  "vision_tower.encoder.layer.18.attention.k_proj.weight": "model-00001-of-00016.safetensors",
939
  "vision_tower.encoder.layer.18.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
949
  "vision_tower.encoder.layer.18.mlp.fc1.weight": "model-00001-of-00016.safetensors",
950
  "vision_tower.encoder.layer.18.mlp.fc2.bias": "model-00001-of-00016.safetensors",
951
  "vision_tower.encoder.layer.18.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
952
  "vision_tower.encoder.layer.19.attention.k_norm.weight": "model-00001-of-00016.safetensors",
953
  "vision_tower.encoder.layer.19.attention.k_proj.weight": "model-00001-of-00016.safetensors",
954
  "vision_tower.encoder.layer.19.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
964
  "vision_tower.encoder.layer.19.mlp.fc1.weight": "model-00001-of-00016.safetensors",
965
  "vision_tower.encoder.layer.19.mlp.fc2.bias": "model-00001-of-00016.safetensors",
966
  "vision_tower.encoder.layer.19.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
967
  "vision_tower.encoder.layer.2.attention.k_norm.weight": "model-00001-of-00016.safetensors",
968
  "vision_tower.encoder.layer.2.attention.k_proj.weight": "model-00001-of-00016.safetensors",
969
  "vision_tower.encoder.layer.2.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
979
  "vision_tower.encoder.layer.2.mlp.fc1.weight": "model-00001-of-00016.safetensors",
980
  "vision_tower.encoder.layer.2.mlp.fc2.bias": "model-00001-of-00016.safetensors",
981
  "vision_tower.encoder.layer.2.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
982
  "vision_tower.encoder.layer.20.attention.k_norm.weight": "model-00002-of-00016.safetensors",
983
  "vision_tower.encoder.layer.20.attention.k_proj.weight": "model-00001-of-00016.safetensors",
984
  "vision_tower.encoder.layer.20.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
994
  "vision_tower.encoder.layer.20.mlp.fc1.weight": "model-00002-of-00016.safetensors",
995
  "vision_tower.encoder.layer.20.mlp.fc2.bias": "model-00002-of-00016.safetensors",
996
  "vision_tower.encoder.layer.20.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
997
  "vision_tower.encoder.layer.21.attention.k_norm.weight": "model-00002-of-00016.safetensors",
998
  "vision_tower.encoder.layer.21.attention.k_proj.weight": "model-00002-of-00016.safetensors",
999
  "vision_tower.encoder.layer.21.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1009
  "vision_tower.encoder.layer.21.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1010
  "vision_tower.encoder.layer.21.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1011
  "vision_tower.encoder.layer.21.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1012
  "vision_tower.encoder.layer.22.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1013
  "vision_tower.encoder.layer.22.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1014
  "vision_tower.encoder.layer.22.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1024
  "vision_tower.encoder.layer.22.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1025
  "vision_tower.encoder.layer.22.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1026
  "vision_tower.encoder.layer.22.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1027
  "vision_tower.encoder.layer.23.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1028
  "vision_tower.encoder.layer.23.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1029
  "vision_tower.encoder.layer.23.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1039
  "vision_tower.encoder.layer.23.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1040
  "vision_tower.encoder.layer.23.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1041
  "vision_tower.encoder.layer.23.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1042
  "vision_tower.encoder.layer.24.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1043
  "vision_tower.encoder.layer.24.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1044
  "vision_tower.encoder.layer.24.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1054
  "vision_tower.encoder.layer.24.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1055
  "vision_tower.encoder.layer.24.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1056
  "vision_tower.encoder.layer.24.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1057
  "vision_tower.encoder.layer.25.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1058
  "vision_tower.encoder.layer.25.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1059
  "vision_tower.encoder.layer.25.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1069
  "vision_tower.encoder.layer.25.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1070
  "vision_tower.encoder.layer.25.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1071
  "vision_tower.encoder.layer.25.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1072
  "vision_tower.encoder.layer.26.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1073
  "vision_tower.encoder.layer.26.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1074
  "vision_tower.encoder.layer.26.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1084
  "vision_tower.encoder.layer.26.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1085
  "vision_tower.encoder.layer.26.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1086
  "vision_tower.encoder.layer.26.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1087
  "vision_tower.encoder.layer.27.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1088
  "vision_tower.encoder.layer.27.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1089
  "vision_tower.encoder.layer.27.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1099
  "vision_tower.encoder.layer.27.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1100
  "vision_tower.encoder.layer.27.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1101
  "vision_tower.encoder.layer.27.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1102
  "vision_tower.encoder.layer.28.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1103
  "vision_tower.encoder.layer.28.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1104
  "vision_tower.encoder.layer.28.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1114
  "vision_tower.encoder.layer.28.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1115
  "vision_tower.encoder.layer.28.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1116
  "vision_tower.encoder.layer.28.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1117
  "vision_tower.encoder.layer.29.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1118
  "vision_tower.encoder.layer.29.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1119
  "vision_tower.encoder.layer.29.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1129
  "vision_tower.encoder.layer.29.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1130
  "vision_tower.encoder.layer.29.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1131
  "vision_tower.encoder.layer.29.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1132
  "vision_tower.encoder.layer.3.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1133
  "vision_tower.encoder.layer.3.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1134
  "vision_tower.encoder.layer.3.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
1144
  "vision_tower.encoder.layer.3.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1145
  "vision_tower.encoder.layer.3.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1146
  "vision_tower.encoder.layer.3.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
1147
  "vision_tower.encoder.layer.30.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1148
  "vision_tower.encoder.layer.30.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1149
  "vision_tower.encoder.layer.30.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1159
  "vision_tower.encoder.layer.30.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1160
  "vision_tower.encoder.layer.30.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1161
  "vision_tower.encoder.layer.30.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1162
  "vision_tower.encoder.layer.31.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1163
  "vision_tower.encoder.layer.31.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1164
  "vision_tower.encoder.layer.31.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1174
  "vision_tower.encoder.layer.31.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1175
  "vision_tower.encoder.layer.31.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1176
  "vision_tower.encoder.layer.31.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1177
  "vision_tower.encoder.layer.32.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1178
  "vision_tower.encoder.layer.32.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1179
  "vision_tower.encoder.layer.32.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1189
  "vision_tower.encoder.layer.32.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1190
  "vision_tower.encoder.layer.32.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1191
  "vision_tower.encoder.layer.32.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1192
  "vision_tower.encoder.layer.33.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1193
  "vision_tower.encoder.layer.33.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1194
  "vision_tower.encoder.layer.33.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1204
  "vision_tower.encoder.layer.33.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1205
  "vision_tower.encoder.layer.33.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1206
  "vision_tower.encoder.layer.33.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1207
  "vision_tower.encoder.layer.34.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1208
  "vision_tower.encoder.layer.34.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1209
  "vision_tower.encoder.layer.34.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1219
  "vision_tower.encoder.layer.34.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1220
  "vision_tower.encoder.layer.34.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1221
  "vision_tower.encoder.layer.34.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1222
  "vision_tower.encoder.layer.35.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1223
  "vision_tower.encoder.layer.35.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1224
  "vision_tower.encoder.layer.35.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1234
  "vision_tower.encoder.layer.35.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1235
  "vision_tower.encoder.layer.35.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1236
  "vision_tower.encoder.layer.35.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1237
  "vision_tower.encoder.layer.36.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1238
  "vision_tower.encoder.layer.36.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1239
  "vision_tower.encoder.layer.36.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1249
  "vision_tower.encoder.layer.36.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1250
  "vision_tower.encoder.layer.36.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1251
  "vision_tower.encoder.layer.36.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1252
  "vision_tower.encoder.layer.37.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1253
  "vision_tower.encoder.layer.37.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1254
  "vision_tower.encoder.layer.37.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1264
  "vision_tower.encoder.layer.37.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1265
  "vision_tower.encoder.layer.37.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1266
  "vision_tower.encoder.layer.37.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1267
  "vision_tower.encoder.layer.38.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1268
  "vision_tower.encoder.layer.38.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1269
  "vision_tower.encoder.layer.38.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1279
  "vision_tower.encoder.layer.38.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1280
  "vision_tower.encoder.layer.38.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1281
  "vision_tower.encoder.layer.38.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1282
  "vision_tower.encoder.layer.39.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1283
  "vision_tower.encoder.layer.39.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1284
  "vision_tower.encoder.layer.39.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1294
  "vision_tower.encoder.layer.39.mlp.fc1.weight": "model-00002-of-00016.safetensors",
1295
  "vision_tower.encoder.layer.39.mlp.fc2.bias": "model-00002-of-00016.safetensors",
1296
  "vision_tower.encoder.layer.39.mlp.fc2.weight": "model-00002-of-00016.safetensors",
 
1297
  "vision_tower.encoder.layer.4.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1298
  "vision_tower.encoder.layer.4.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1299
  "vision_tower.encoder.layer.4.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
1309
  "vision_tower.encoder.layer.4.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1310
  "vision_tower.encoder.layer.4.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1311
  "vision_tower.encoder.layer.4.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
1312
  "vision_tower.encoder.layer.40.attention.k_norm.weight": "model-00002-of-00016.safetensors",
1313
  "vision_tower.encoder.layer.40.attention.k_proj.weight": "model-00002-of-00016.safetensors",
1314
  "vision_tower.encoder.layer.40.attention.projection_layer.bias": "model-00002-of-00016.safetensors",
 
1324
  "vision_tower.encoder.layer.40.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1325
  "vision_tower.encoder.layer.40.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1326
  "vision_tower.encoder.layer.40.mlp.fc2.weight": "model-00003-of-00016.safetensors",
 
1327
  "vision_tower.encoder.layer.41.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1328
  "vision_tower.encoder.layer.41.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1329
  "vision_tower.encoder.layer.41.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
 
1339
  "vision_tower.encoder.layer.41.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1340
  "vision_tower.encoder.layer.41.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1341
  "vision_tower.encoder.layer.41.mlp.fc2.weight": "model-00003-of-00016.safetensors",
 
1342
  "vision_tower.encoder.layer.42.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1343
  "vision_tower.encoder.layer.42.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1344
  "vision_tower.encoder.layer.42.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
 
1354
  "vision_tower.encoder.layer.42.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1355
  "vision_tower.encoder.layer.42.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1356
  "vision_tower.encoder.layer.42.mlp.fc2.weight": "model-00003-of-00016.safetensors",
 
1357
  "vision_tower.encoder.layer.43.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1358
  "vision_tower.encoder.layer.43.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1359
  "vision_tower.encoder.layer.43.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
 
1369
  "vision_tower.encoder.layer.43.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1370
  "vision_tower.encoder.layer.43.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1371
  "vision_tower.encoder.layer.43.mlp.fc2.weight": "model-00003-of-00016.safetensors",
 
1372
  "vision_tower.encoder.layer.44.attention.k_norm.weight": "model-00003-of-00016.safetensors",
1373
  "vision_tower.encoder.layer.44.attention.k_proj.weight": "model-00003-of-00016.safetensors",
1374
  "vision_tower.encoder.layer.44.attention.projection_layer.bias": "model-00003-of-00016.safetensors",
 
1384
  "vision_tower.encoder.layer.44.mlp.fc1.weight": "model-00003-of-00016.safetensors",
1385
  "vision_tower.encoder.layer.44.mlp.fc2.bias": "model-00003-of-00016.safetensors",
1386
  "vision_tower.encoder.layer.44.mlp.fc2.weight": "model-00003-of-00016.safetensors",
 
1387
  "vision_tower.encoder.layer.5.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1388
  "vision_tower.encoder.layer.5.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1389
  "vision_tower.encoder.layer.5.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
1399
  "vision_tower.encoder.layer.5.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1400
  "vision_tower.encoder.layer.5.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1401
  "vision_tower.encoder.layer.5.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
1402
  "vision_tower.encoder.layer.6.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1403
  "vision_tower.encoder.layer.6.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1404
  "vision_tower.encoder.layer.6.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
1414
  "vision_tower.encoder.layer.6.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1415
  "vision_tower.encoder.layer.6.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1416
  "vision_tower.encoder.layer.6.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
1417
  "vision_tower.encoder.layer.7.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1418
  "vision_tower.encoder.layer.7.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1419
  "vision_tower.encoder.layer.7.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
1429
  "vision_tower.encoder.layer.7.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1430
  "vision_tower.encoder.layer.7.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1431
  "vision_tower.encoder.layer.7.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
1432
  "vision_tower.encoder.layer.8.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1433
  "vision_tower.encoder.layer.8.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1434
  "vision_tower.encoder.layer.8.attention.projection_layer.bias": "model-00001-of-00016.safetensors",
 
1444
  "vision_tower.encoder.layer.8.mlp.fc1.weight": "model-00001-of-00016.safetensors",
1445
  "vision_tower.encoder.layer.8.mlp.fc2.bias": "model-00001-of-00016.safetensors",
1446
  "vision_tower.encoder.layer.8.mlp.fc2.weight": "model-00001-of-00016.safetensors",
 
1447
  "vision_tower.encoder.layer.9.attention.k_norm.weight": "model-00001-of-00016.safetensors",
1448
  "vision_tower.encoder.layer.9.attention.k_proj.weight": "model-00001-of-00016.safetensors",
1449
  "vision_tower.encoder.layer.9.attention.projection_layer.bias": "model-00001-of-00016.safetensors",