UncleFish committed on
Commit
c2becfa
1 Parent(s): bc78887

Initial release of the instruct version

.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ __pycache__/**
2
+ debug.py
3
+ sanity_check.ipynb
LICENSE.txt ADDED
@@ -0,0 +1,437 @@
1
+ Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58
+ Public License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
63
+ ("Public License"). To the extent this Public License may be
64
+ interpreted as a contract, You are granted the Licensed Rights in
65
+ consideration of Your acceptance of these terms and conditions, and the
66
+ Licensor grants You such rights in consideration of benefits the
67
+ Licensor receives from making the Licensed Material available under
68
+ these terms and conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-NC-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution, NonCommercial, and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. NonCommercial means not primarily intended for or directed towards
126
+ commercial advantage or monetary compensation. For purposes of
127
+ this Public License, the exchange of the Licensed Material for
128
+ other material subject to Copyright and Similar Rights by digital
129
+ file-sharing or similar means is NonCommercial provided there is
130
+ no payment of monetary compensation in connection with the
131
+ exchange.
132
+
133
+ l. Share means to provide material to the public by any means or
134
+ process that requires permission under the Licensed Rights, such
135
+ as reproduction, public display, public performance, distribution,
136
+ dissemination, communication, or importation, and to make material
137
+ available to the public including in ways that members of the
138
+ public may access the material from a place and at a time
139
+ individually chosen by them.
140
+
141
+ m. Sui Generis Database Rights means rights other than copyright
142
+ resulting from Directive 96/9/EC of the European Parliament and of
143
+ the Council of 11 March 1996 on the legal protection of databases,
144
+ as amended and/or succeeded, as well as other essentially
145
+ equivalent rights anywhere in the world.
146
+
147
+ n. You means the individual or entity exercising the Licensed Rights
148
+ under this Public License. Your has a corresponding meaning.
149
+
150
+
151
+ Section 2 -- Scope.
152
+
153
+ a. License grant.
154
+
155
+ 1. Subject to the terms and conditions of this Public License,
156
+ the Licensor hereby grants You a worldwide, royalty-free,
157
+ non-sublicensable, non-exclusive, irrevocable license to
158
+ exercise the Licensed Rights in the Licensed Material to:
159
+
160
+ a. reproduce and Share the Licensed Material, in whole or
161
+ in part, for NonCommercial purposes only; and
162
+
163
+ b. produce, reproduce, and Share Adapted Material for
164
+ NonCommercial purposes only.
165
+
166
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
167
+ Exceptions and Limitations apply to Your use, this Public
168
+ License does not apply, and You do not need to comply with
169
+ its terms and conditions.
170
+
171
+ 3. Term. The term of this Public License is specified in Section
172
+ 6(a).
173
+
174
+ 4. Media and formats; technical modifications allowed. The
175
+ Licensor authorizes You to exercise the Licensed Rights in
176
+ all media and formats whether now known or hereafter created,
177
+ and to make technical modifications necessary to do so. The
178
+ Licensor waives and/or agrees not to assert any right or
179
+ authority to forbid You from making technical modifications
180
+ necessary to exercise the Licensed Rights, including
181
+ technical modifications necessary to circumvent Effective
182
+ Technological Measures. For purposes of this Public License,
183
+ simply making modifications authorized by this Section 2(a)
184
+ (4) never produces Adapted Material.
185
+
186
+ 5. Downstream recipients.
187
+
188
+ a. Offer from the Licensor -- Licensed Material. Every
189
+ recipient of the Licensed Material automatically
190
+ receives an offer from the Licensor to exercise the
191
+ Licensed Rights under the terms and conditions of this
192
+ Public License.
193
+
194
+ b. Additional offer from the Licensor -- Adapted Material.
195
+ Every recipient of Adapted Material from You
196
+ automatically receives an offer from the Licensor to
197
+ exercise the Licensed Rights in the Adapted Material
198
+ under the conditions of the Adapter's License You apply.
199
+
200
+ c. No downstream restrictions. You may not offer or impose
201
+ any additional or different terms or conditions on, or
202
+ apply any Effective Technological Measures to, the
203
+ Licensed Material if doing so restricts exercise of the
204
+ Licensed Rights by any recipient of the Licensed
205
+ Material.
206
+
207
+ 6. No endorsement. Nothing in this Public License constitutes or
208
+ may be construed as permission to assert or imply that You
209
+ are, or that Your use of the Licensed Material is, connected
210
+ with, or sponsored, endorsed, or granted official status by,
211
+ the Licensor or others designated to receive attribution as
212
+ provided in Section 3(a)(1)(A)(i).
213
+
214
+ b. Other rights.
215
+
216
+ 1. Moral rights, such as the right of integrity, are not
217
+ licensed under this Public License, nor are publicity,
218
+ privacy, and/or other similar personality rights; however, to
219
+ the extent possible, the Licensor waives and/or agrees not to
220
+ assert any such rights held by the Licensor to the limited
221
+ extent necessary to allow You to exercise the Licensed
222
+ Rights, but not otherwise.
223
+
224
+ 2. Patent and trademark rights are not licensed under this
225
+ Public License.
226
+
227
+ 3. To the extent possible, the Licensor waives any right to
228
+ collect royalties from You for the exercise of the Licensed
229
+ Rights, whether directly or through a collecting society
230
+ under any voluntary or waivable statutory or compulsory
231
+ licensing scheme. In all other cases the Licensor expressly
232
+ reserves any right to collect such royalties, including when
233
+ the Licensed Material is used other than for NonCommercial
234
+ purposes.
235
+
236
+
237
+ Section 3 -- License Conditions.
238
+
239
+ Your exercise of the Licensed Rights is expressly made subject to the
240
+ following conditions.
241
+
242
+ a. Attribution.
243
+
244
+ 1. If You Share the Licensed Material (including in modified
245
+ form), You must:
246
+
247
+ a. retain the following if it is supplied by the Licensor
248
+ with the Licensed Material:
249
+
250
+ i. identification of the creator(s) of the Licensed
251
+ Material and any others designated to receive
252
+ attribution, in any reasonable manner requested by
253
+ the Licensor (including by pseudonym if
254
+ designated);
255
+
256
+ ii. a copyright notice;
257
+
258
+ iii. a notice that refers to this Public License;
259
+
260
+ iv. a notice that refers to the disclaimer of
261
+ warranties;
262
+
263
+ v. a URI or hyperlink to the Licensed Material to the
264
+ extent reasonably practicable;
265
+
266
+ b. indicate if You modified the Licensed Material and
267
+ retain an indication of any previous modifications; and
268
+
269
+ c. indicate the Licensed Material is licensed under this
270
+ Public License, and include the text of, or the URI or
271
+ hyperlink to, this Public License.
272
+
273
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
274
+ reasonable manner based on the medium, means, and context in
275
+ which You Share the Licensed Material. For example, it may be
276
+ reasonable to satisfy the conditions by providing a URI or
277
+ hyperlink to a resource that includes the required
278
+ information.
279
+ 3. If requested by the Licensor, You must remove any of the
280
+ information required by Section 3(a)(1)(A) to the extent
281
+ reasonably practicable.
282
+
283
+ b. ShareAlike.
284
+
285
+ In addition to the conditions in Section 3(a), if You Share
286
+ Adapted Material You produce, the following conditions also apply.
287
+
288
+ 1. The Adapter's License You apply must be a Creative Commons
289
+ license with the same License Elements, this version or
290
+ later, or a BY-NC-SA Compatible License.
291
+
292
+ 2. You must include the text of, or the URI or hyperlink to, the
293
+ Adapter's License You apply. You may satisfy this condition
294
+ in any reasonable manner based on the medium, means, and
295
+ context in which You Share Adapted Material.
296
+
297
+ 3. You may not offer or impose any additional or different terms
298
+ or conditions on, or apply any Effective Technological
299
+ Measures to, Adapted Material that restrict exercise of the
300
+ rights granted under the Adapter's License You apply.
301
+
302
+
303
+ Section 4 -- Sui Generis Database Rights.
304
+
305
+ Where the Licensed Rights include Sui Generis Database Rights that
306
+ apply to Your use of the Licensed Material:
307
+
308
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309
+ to extract, reuse, reproduce, and Share all or a substantial
310
+ portion of the contents of the database for NonCommercial purposes
311
+ only;
312
+
313
+ b. if You include all or a substantial portion of the database
314
+ contents in a database in which You have Sui Generis Database
315
+ Rights, then the database in which You have Sui Generis Database
316
+ Rights (but not its individual contents) is Adapted Material,
317
+ including for purposes of Section 3(b); and
318
+
319
+ c. You must comply with the conditions in Section 3(a) if You Share
320
+ all or a substantial portion of the contents of the database.
321
+
322
+ For the avoidance of doubt, this Section 4 supplements and does not
323
+ replace Your obligations under this Public License where the Licensed
324
+ Rights include other Copyright and Similar Rights.
325
+
326
+
327
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328
+
329
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339
+
340
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349
+
350
+ c. The disclaimer of warranties and limitation of liability provided
351
+ above shall be interpreted in a manner that, to the extent
352
+ possible, most closely approximates an absolute disclaimer and
353
+ waiver of all liability.
354
+
355
+
356
+ Section 6 -- Term and Termination.
357
+
358
+ a. This Public License applies for the term of the Copyright and
359
+ Similar Rights licensed here. However, if You fail to comply with
360
+ this Public License, then Your rights under this Public License
361
+ terminate automatically.
362
+
363
+ b. Where Your right to use the Licensed Material has terminated under
364
+ Section 6(a), it reinstates:
365
+
366
+ 1. automatically as of the date the violation is cured, provided
367
+ it is cured within 30 days of Your discovery of the
368
+ violation; or
369
+
370
+ 2. upon express reinstatement by the Licensor.
371
+
372
+ For the avoidance of doubt, this Section 6(b) does not affect any
373
+ right the Licensor may have to seek remedies for Your violations
374
+ of this Public License.
375
+
376
+ c. For the avoidance of doubt, the Licensor may also offer the
377
+ Licensed Material under separate terms or conditions or stop
378
+ distributing the Licensed Material at any time; however, doing so
379
+ will not terminate this Public License.
380
+
381
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382
+ License.
383
+
384
+
385
+ Section 7 -- Other Terms and Conditions.
386
+
387
+ a. The Licensor shall not be bound by any additional or different
388
+ terms or conditions communicated by You unless expressly agreed.
389
+
390
+ b. Any arrangements, understandings, or agreements regarding the
391
+ Licensed Material not stated herein are separate from and
392
+ independent of the terms and conditions of this Public License.
393
+
394
+
395
+ Section 8 -- Interpretation.
396
+
397
+ a. For the avoidance of doubt, this Public License does not, and
398
+ shall not be interpreted to, reduce, limit, restrict, or impose
399
+ conditions on any use of the Licensed Material that could lawfully
400
+ be made without permission under this Public License.
401
+
402
+ b. To the extent possible, if any provision of this Public License is
403
+ deemed unenforceable, it shall be automatically reformed to the
404
+ minimum extent necessary to make it enforceable. If the provision
405
+ cannot be reformed, it shall be severed from this Public License
406
+ without affecting the enforceability of the remaining terms and
407
+ conditions.
408
+
409
+ c. No term or condition of this Public License will be waived and no
410
+ failure to comply consented to unless expressly agreed to by the
411
+ Licensor.
412
+
413
+ d. Nothing in this Public License constitutes or may be interpreted
414
+ as a limitation upon, or waiver of, any privileges and immunities
415
+ that apply to the Licensor or You, including from the legal
416
+ processes of any jurisdiction or authority.
417
+
418
+ =======================================================================
419
+
420
+ Creative Commons is not a party to its public
421
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
422
+ its public licenses to material it publishes and in those instances
423
+ will be considered the “Licensor.” The text of the Creative Commons
424
+ public licenses is dedicated to the public domain under the CC0 Public
425
+ Domain Dedication. Except for the limited purpose of indicating that
426
+ material is shared under a Creative Commons public license or as
427
+ otherwise permitted by the Creative Commons policies published at
428
+ creativecommons.org/policies, Creative Commons does not authorize the
429
+ use of the trademark "Creative Commons" or any other trademark or logo
430
+ of Creative Commons without its prior written consent including,
431
+ without limitation, in connection with any unauthorized modifications
432
+ to any of its public licenses or any other arrangements,
433
+ understandings, or agreements concerning use of licensed material. For
434
+ the avoidance of doubt, this paragraph does not form part of the
435
+ public licenses.
436
+
437
+ Creative Commons may be contacted at creativecommons.org.
README.md ADDED
@@ -0,0 +1,84 @@
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ language:
4
+ - en
5
+ pipeline_tag: image-text-to-text
6
+ ---
7
+
8
+
9
+ # Model description
10
+
11
+ BLIP-3 consists of three components: a CLIP-like image encoder (ViT-H-14 at 378 px resolution), a vision-language (VL) connector that maps image features into the language model's embedding space, and a large language model (Phi-3).
12
+
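+ The three parts are wired together by the composite `Blip3Config` shipped in this repo (see `configuration_blip_3.py`). A minimal inspection sketch, assuming the checkpoint id used in the usage example below:
+
+ ```python
+ from transformers import AutoConfig
+
+ # trust_remote_code resolves AutoConfig to Blip3Config via the auto_map in config.json
+ config = AutoConfig.from_pretrained("Salesforce/blip3-phi2-r", trust_remote_code=True)
+ print(config.vision_encoder_config.model_name)        # ViT-H-14-378-quickgelu (image encoder)
+ print(config.vision_tokenizer_config.num_vis_tokens)  # 128 (visual tokens produced by the VL connector)
+ print(config.text_config.model_type)                  # phi3 (language model)
+ ```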
13
+ # Direct Use and Downstream Use
14
+
15
+
16
+ # Bias, Risks, Limitations, and Ethical Considerations
17
+
18
+ # How to use
19
+
20
+ ```python
21
+ from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
22
+ import torch
23
+ import requests
24
+ from PIL import Image
25
+
26
+ # define the prompt template
27
+ def apply_prompt_template(prompt):
28
+ s = (
29
+ '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
30
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
31
+ f'<|user|>\n<image>\n{prompt}<|end|>\n<|assistant|>\n'
32
+ )
33
+ return s
34
+ class EosListStoppingCriteria(StoppingCriteria):
35
+ def __init__(self, eos_sequence = [32007]):
36
+ self.eos_sequence = eos_sequence
37
+
38
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
39
+ last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
40
+ return self.eos_sequence in last_ids
41
+
42
+ # load models
43
+ model_name_or_path = "Salesforce/blip3-phi2-r"
44
+ model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)
45
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=True, legacy=False)
46
+ image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
47
+ tokenizer = model.update_special_tokens(tokenizer)
48
+
49
+ # craft a test sample
50
+ img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
51
+ raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
52
+ query = "how many dogs are in the picture?"
53
+
54
+ model = model.cuda()
55
+ inputs = image_processor([raw_image], return_tensors="pt", image_aspect_ratio='anyres')
56
+ prompt = apply_prompt_template(query)
57
+ language_inputs = tokenizer([prompt], return_tensors="pt")
58
+ inputs.update(language_inputs)
59
+ inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
60
+ generated_text = model.generate(**inputs, image_size=[raw_image.size],
61
+ pad_token_id=tokenizer.pad_token_id,
62
+ do_sample=False, max_new_tokens=768, top_p=None, num_beams=1,
63
+ stopping_criteria = [EosListStoppingCriteria()],
64
+ )
65
+ prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True)
66
+ print("==> prediciton: ", prediction)
67
+ # output: ==> prediction: There is one dog in the picture.
68
+ ```
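+
+ The stopping criterion above hard-codes token id 32007, which is the id of `<|end|>` in this repo's `added_tokens.json`. A more robust sketch (assuming the tokenizer loaded above) looks the id up at runtime:
+
+ ```python
+ # Derive the <|end|> turn terminator from the tokenizer instead of hard-coding 32007.
+ end_token_id = tokenizer.convert_tokens_to_ids("<|end|>")
+ stopping_criteria = [EosListStoppingCriteria(eos_sequence=[end_token_id])]
+ ```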
69
+
70
+ # License
71
+
72
+ Our code and weights are released under the Creative Commons Attribution Non Commercial 4.0 [LICENSE](LICENSE.txt).
73
+
74
+ # Troubleshooting
75
+
76
+ 1. If you are missing any packages, install the following:
77
+
78
+ ```
79
+ pip install -U "transformers==4.40.0" --upgrade
80
+ pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
81
+ pip install open_clip_torch==2.24.0
82
+ pip install einops
83
+ pip install einops-exts
84
+ ```
added_tokens.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "<pad>": 32011,
3
+ "<|assistant|>": 32001,
4
+ "<|endoftext|>": 32000,
5
+ "<|end|>": 32007,
6
+ "<|placeholder1|>": 32002,
7
+ "<|placeholder2|>": 32003,
8
+ "<|placeholder3|>": 32004,
9
+ "<|placeholder4|>": 32005,
10
+ "<|placeholder5|>": 32008,
11
+ "<|placeholder6|>": 32009,
12
+ "<|system|>": 32006,
13
+ "<|user|>": 32010
14
+ }
config.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "architectures": [
3
+ "Blip3ModelForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_blip_3.Blip3Config",
7
+ "AutoModelForVision2Seq": "modeling_blip_3.Blip3ModelForConditionalGeneration"
8
+ },
9
+ "model_type": "blip_3",
10
+ "text_config": {
11
+ "initial_tokenizer_len": 32012,
12
+ "model_type": "phi3",
13
+ "sliding_window": 2047,
14
+ "torch_dtype": "bfloat16"
15
+ },
16
+ "torch_dtype": "float32",
17
+ "transformers_version": "4.41.0.dev0",
18
+ "vision_encoder_config": {
19
+ "anyres_patch_sampling": true,
20
+ "image_aspect_ratio": "anyres",
21
+ "model_type": "blip_3_vision_encoder"
22
+ },
23
+ "vision_tokenizer_config": {
24
+ "model_type": "blip_3_vision_tokenizer"
25
+ }
26
+ }
configuration_blip_3.py ADDED
@@ -0,0 +1,161 @@
1
+ from transformers import PretrainedConfig
2
+ from transformers import logging
3
+ from transformers import CONFIG_MAPPING
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+ class Blip3VisionEncoderConfig(PretrainedConfig):
8
+ model_type = "blip_3_vision_encoder"
9
+
10
+ def __init__(self,
11
+ model_name: str = 'ViT-H-14-378-quickgelu',
12
+ force_image_size: int = 378,
13
+ **kwargs):
14
+ self.model_name = model_name
15
+ self.force_image_size = force_image_size
16
+ super().__init__(**kwargs)
17
+
18
+
19
+ class Blip3VisionTokenizerConfig(PretrainedConfig):
20
+ model_type = "blip_3_vision_tokenizer"
21
+
22
+ def __init__(self,
23
+ vis_feature_dim: int = 1280,
24
+ lang_embedding_dim: int = 3072,
25
+ num_vis_tokens: int = 128,
26
+ image_aspect_ratio: str = 'anyres',
27
+ repeat_latents: bool = False,
28
+ **kwargs):
29
+ self.vis_feature_dim = vis_feature_dim
30
+ self.lang_embedding_dim = lang_embedding_dim
31
+ self.num_vis_tokens = num_vis_tokens
32
+ self.image_aspect_ratio = image_aspect_ratio
33
+ self.repeat_latents = repeat_latents
34
+ super().__init__(**kwargs)
35
+
36
+
37
+ class Blip3Config(PretrainedConfig):
38
+ model_type = "blip_3"
39
+
40
+ def __init__(self,
41
+ vision_encoder_config: dict = None,
42
+ vision_tokenizer_config: dict = None,
43
+ text_config: dict = None,
44
+ **kwargs):
45
+
46
+ if vision_encoder_config is None:
47
+ vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}
48
+ logger.info("vision_encoder_config is None. initializing the Blip3VisionEncoderConfig with default values.")
49
+
50
+ if vision_tokenizer_config is None:
51
+ vision_tokenizer_config = {}
52
+ logger.info("vision_tokenizer_config is None. Initializing the Blip3VisionTokenizerConfig with default values.")
53
+
54
+ if text_config is None:
55
+ text_config = {
56
+ 'initial_tokenizer_len':32012,
57
+ 'pad_token_id':32011,
58
+ 'bos_token_id':1,
59
+ 'eos_token_id':32000,
60
+ 'vocab_size': 32064,
61
+ 'hidden_size': 3072,
62
+ 'intermediate_size': 8192,
63
+ 'num_hidden_layers': 32,
64
+ 'num_attention_heads': 32,
65
+ 'num_key_value_heads': 32,
66
+ 'resid_pdrop': 0.0,
67
+ 'embd_pdrop': 0.0,
68
+ 'attention_dropout': 0.0,
69
+ 'hidden_act': 'silu',
70
+ 'max_position_embeddings': 4096,
71
+ 'original_max_position_embeddings': 4096,
72
+ 'initializer_range': 0.02,
73
+ 'rms_norm_eps': 1e-05,
74
+ 'use_cache': True,
75
+ 'rope_theta': 10000.0,
76
+ 'rope_scaling': None,
77
+ 'sliding_window': 2047,
78
+ 'return_dict': True,
79
+ 'output_hidden_states': False,
80
+ 'output_attentions': False,
81
+ 'torchscript': False,
82
+ 'torch_dtype': 'bfloat16',
83
+ 'use_bfloat16': False,
84
+ 'tf_legacy_loss': False,
85
+ 'pruned_heads': {},
86
+ 'tie_word_embeddings': False,
87
+ 'chunk_size_feed_forward': 0,
88
+ 'is_encoder_decoder': False,
89
+ 'is_decoder': False,
90
+ 'cross_attention_hidden_size': None,
91
+ 'add_cross_attention': False,
92
+ 'tie_encoder_decoder': False,
93
+ 'max_length': 20,
94
+ 'min_length': 0,
95
+ 'do_sample': False,
96
+ 'early_stopping': False,
97
+ 'num_beams': 1,
98
+ 'num_beam_groups': 1,
99
+ 'diversity_penalty': 0.0,
100
+ 'temperature': 1.0,
101
+ 'top_k': 50,
102
+ 'top_p': 1.0,
103
+ 'typical_p': 1.0,
104
+ 'repetition_penalty': 1.0,
105
+ 'length_penalty': 1.0,
106
+ 'no_repeat_ngram_size': 0,
107
+ 'encoder_no_repeat_ngram_size': 0,
108
+ 'bad_words_ids': None,
109
+ 'num_return_sequences': 1,
110
+ 'output_scores': False,
111
+ 'return_dict_in_generate': False,
112
+ 'forced_bos_token_id': None,
113
+ 'forced_eos_token_id': None,
114
+ 'remove_invalid_values': False,
115
+ 'exponential_decay_length_penalty': None,
116
+ 'suppress_tokens': None,
117
+ 'begin_suppress_tokens': None,
118
+ 'finetuning_task': None,
119
+ 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'},
120
+ 'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
121
+ 'tokenizer_class': None,
122
+ 'prefix': None,
123
+ 'bos_token_id': 1,
124
+ 'pad_token_id': 32000,
125
+ 'eos_token_id': 32000,
126
+ 'sep_token_id': None,
127
+ 'decoder_start_token_id': None,
128
+ 'task_specific_params': None,
129
+ 'problem_type': None,
130
+ 'model_type': 'phi3'
131
+ }
132
+ logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).")
133
+
134
+ self.vision_encoder_config = Blip3VisionEncoderConfig(**vision_encoder_config)
135
+
136
+ self.vision_tokenizer_config = Blip3VisionTokenizerConfig(**vision_tokenizer_config)
137
+
138
+ text_model_type = text_config["model_type"] if "model_type" in text_config else "phi3"
139
+ self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
140
+
141
+ for key in ['initial_tokenizer_len', 'pad_token_id']:
142
+ if key not in self.text_config.to_dict():
143
+ raise ValueError(f"The key `{key}` is missing in the text_config.")
144
+
145
+ super().__init__(**kwargs)
146
+
147
+ @classmethod
148
+ def from_vision_encoder_vision_tokenizer_text_configs(
149
+ cls,
150
+ vision_encoder_config: Blip3VisionEncoderConfig,
151
+ vision_tokenizer_config: Blip3VisionTokenizerConfig,
152
+ text_config: PretrainedConfig,
153
+ **kwargs):
154
+
155
+ return cls(
156
+ vision_encoder_config=vision_encoder_config.to_dict(),
157
+ vision_tokenizer_config=vision_tokenizer_config.to_dict(),
158
+ text_config=text_config.to_dict(),
159
+ **kwargs,
160
+ )
161
+
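+ # Usage sketch (illustrative): composing a Blip3Config from typed sub-configs via the
+ # classmethod above instead of passing raw dicts to __init__. The text config must carry
+ # `initial_tokenizer_len` and `pad_token_id`, which Blip3Config validates.
+ #
+ #   config = Blip3Config.from_vision_encoder_vision_tokenizer_text_configs(
+ #       Blip3VisionEncoderConfig(),            # ViT-H-14-378-quickgelu at 378 px
+ #       Blip3VisionTokenizerConfig(),          # 1280-d vision features -> 3072-d LM embeddings, 128 tokens
+ #       CONFIG_MAPPING["phi3"](initial_tokenizer_len=32012, pad_token_id=32011),
+ #   )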
demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 32000,
5
+ "pad_token_id": 32000,
6
+ "transformers_version": "4.41.0.dev0"
7
+ }
image_processing_blip_3.py ADDED
@@ -0,0 +1,409 @@
1
+ import random
2
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
3
+ import torchvision.transforms.functional as F
4
+ from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
5
+ CenterCrop, ColorJitter, Grayscale
6
+ import numbers
7
+ import torch
8
+ import ast
9
+ import math
10
+ from PIL import Image
11
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
12
+ from transformers.image_utils import ImageInput
13
+ from transformers.utils import TensorType
14
+
15
+
16
+ class Blip3ImageProcessor(BaseImageProcessor):
17
+
18
+ def __init__(
19
+ self,
20
+ do_resize: bool = True,
21
+ resize_mode: str = "squash",
22
+ interpolation_mode: str = "bicubic",
23
+ size: Union[Tuple[int, int], List[int]] = None,
24
+ image_mean: Optional[Union[float, List[float]]] = None,
25
+ image_std: Optional[Union[float, List[float]]] = None,
26
+ **kwargs,
27
+ ) -> None:
28
+ super().__init__(**kwargs)
29
+ self.do_resize = do_resize
30
+ self.resize_mode = resize_mode
31
+ self.interpolation_mode = interpolation_mode
32
+ self.size = size if size is not None else (378, 378)
33
+ self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
34
+ self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
35
+
36
+
37
+ @classmethod
38
+ def resize(cls, image_size, resize_mode, interpolation='bicubic', fill_color=0):
39
+ interpolation_mode = InterpolationMode.BILINEAR if interpolation == 'bilinear' else InterpolationMode.BICUBIC
40
+ if resize_mode == 'longest':
41
+ transforms = [
42
+ ResizeKeepRatio(image_size, interpolation=interpolation_mode, longest=1),
43
+ CenterCropOrPad(image_size, fill=fill_color)
44
+ ]
45
+ elif resize_mode == 'squash':
46
+ if isinstance(image_size, int):
47
+ image_size = (image_size, image_size)
48
+ transforms = [
49
+ Resize(image_size, interpolation=interpolation_mode),
50
+ ]
51
+ else:
52
+ assert resize_mode == 'shortest'
53
+ if not isinstance(image_size, (tuple, list)):
54
+ image_size = (image_size, image_size)
55
+ if image_size[0] == image_size[1]:
56
+ # simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg)
57
+ transforms = [
58
+ Resize(image_size[0], interpolation=interpolation_mode)
59
+ ]
60
+ else:
61
+ # resize shortest edge to matching target dim for non-square target
62
+ transforms = [ResizeKeepRatio(image_size)]
63
+ transforms += [CenterCrop(image_size)]
64
+ return transforms
65
+
66
+ @classmethod
67
+ def convert_rgb(cls, image):
68
+ return image.convert("RGB")
69
+
70
+
71
+ def _preprocess(self,
72
+ images: ImageInput
73
+ ) -> torch.Tensor:
74
+ transforms = self.resize(self.size, self.resize_mode, self.interpolation_mode)
75
+ transforms.extend([
76
+ self.convert_rgb,
77
+ ToTensor(),
78
+ Normalize(mean=self.image_mean, std=self.image_std)
79
+ ])
80
+ composed_transforms = Compose(transforms)
81
+ images_tensor = composed_transforms(images)
82
+ return images_tensor
83
+
84
+ def preprocess(self,
85
+ images: ImageInput,
86
+ return_tensors: Optional[Union[str, TensorType]] = None,
87
+ **kwargs) -> BatchFeature:
88
+ if 'image_aspect_ratio' in kwargs:
89
+ image_aspect_ratio = kwargs['image_aspect_ratio']
90
+ else:
91
+ image_aspect_ratio = 'pad'
92
+ new_images = []
93
+ if image_aspect_ratio == 'pad':
94
+ for image in images:
95
+ image = self._preprocess(image)
96
+ new_images.append(image)
97
+ else:
98
+ if isinstance(self.size, (tuple, list)):
99
+ base_img_size = self.size[0]
100
+ else:
101
+ raise ValueError("size should be list or tuple")
102
+ for image in images:
103
+ image = process_anyres_image(image, self._preprocess, self.size,
104
+ [
105
+ [base_img_size,base_img_size*2],
106
+ [base_img_size*2,base_img_size],
107
+ [base_img_size*2,base_img_size*2],
108
+ [base_img_size*3,base_img_size],
109
+ [base_img_size,base_img_size*3]
110
+ ])
111
+ new_images.append(image)
112
+
113
+ if all(x.shape == new_images[0].shape for x in new_images):
114
+ new_images = torch.stack(new_images, dim=0)
115
+ if image_aspect_ratio == 'pad':
116
+ new_images = BatchFeature(data={"pixel_values": new_images.unsqueeze(0).unsqueeze(0)}, tensor_type=return_tensors)
117
+ else:
118
+ new_images = BatchFeature(data={"pixel_values": new_images.unsqueeze(0)}, tensor_type=return_tensors)
119
+ return new_images
120
+ # def preprocess(self,
121
+ # images: ImageInput,
122
+ # return_tensors: Optional[Union[str, TensorType]] = None,
123
+ # **kwargs) -> BatchFeature:
124
+ # transforms = self.resize(self.size, self.resize_mode, self.interpolation_mode)
125
+ # transforms.extend([
126
+ # self.convert_rgb,
127
+ # ToTensor(),
128
+ # Normalize(mean=self.image_mean, std=self.image_std)
129
+ # ])
130
+ # composed_transforms = Compose(transforms)
131
+ # images_tensor = composed_transforms(images).unsqueeze(0).unsqueeze(1).unsqueeze(0)
132
+ # encoded_outputs = BatchFeature(data={"pixel_values": images_tensor}, tensor_type=return_tensors)
133
+ # return encoded_outputs
134
+
135
+
136
+ class ResizeKeepRatio:
137
+ """ Resize and Keep Ratio
138
+
139
+ Copy & paste from `timm`
140
+ """
141
+
142
+ def __init__(
143
+ self,
144
+ size,
145
+ longest=0.,
146
+ interpolation=InterpolationMode.BICUBIC,
147
+ random_scale_prob=0.,
148
+ random_scale_range=(0.85, 1.05),
149
+ random_aspect_prob=0.,
150
+ random_aspect_range=(0.9, 1.11)
151
+ ):
152
+ if isinstance(size, (list, tuple)):
153
+ self.size = tuple(size)
154
+ else:
155
+ self.size = (size, size)
156
+ self.interpolation = interpolation
157
+ self.longest = float(longest) # [0, 1] where 0 == shortest edge, 1 == longest
158
+ self.random_scale_prob = random_scale_prob
159
+ self.random_scale_range = random_scale_range
160
+ self.random_aspect_prob = random_aspect_prob
161
+ self.random_aspect_range = random_aspect_range
162
+
163
+ @staticmethod
164
+ def get_params(
165
+ img,
166
+ target_size,
167
+ longest,
168
+ random_scale_prob=0.,
169
+ random_scale_range=(0.85, 1.05),
170
+ random_aspect_prob=0.,
171
+ random_aspect_range=(0.9, 1.11)
172
+ ):
173
+ """Get parameters
174
+ """
175
+ source_size = img.size[::-1] # h, w
176
+ h, w = source_size
177
+ target_h, target_w = target_size
178
+ ratio_h = h / target_h
179
+ ratio_w = w / target_w
180
+ ratio = max(ratio_h, ratio_w) * longest + min(ratio_h, ratio_w) * (1. - longest)
181
+ if random_scale_prob > 0 and random.random() < random_scale_prob:
182
+ ratio_factor = random.uniform(random_scale_range[0], random_scale_range[1])
183
+ ratio_factor = (ratio_factor, ratio_factor)
184
+ else:
185
+ ratio_factor = (1., 1.)
186
+ if random_aspect_prob > 0 and random.random() < random_aspect_prob:
187
+ aspect_factor = random.uniform(random_aspect_range[0], random_aspect_range[1])
188
+ ratio_factor = (ratio_factor[0] / aspect_factor, ratio_factor[1] * aspect_factor)
189
+ size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
190
+ return size
191
+
192
+ def __call__(self, img):
193
+ """
194
+ Args:
195
+ img (PIL Image): Image to be cropped and resized.
196
+
197
+ Returns:
198
+ PIL Image: Resized, padded to at least target size, possibly cropped to exactly target size
199
+ """
200
+ size = self.get_params(
201
+ img, self.size, self.longest,
202
+ self.random_scale_prob, self.random_scale_range,
203
+ self.random_aspect_prob, self.random_aspect_range
204
+ )
205
+ img = F.resize(img, size, self.interpolation)
206
+ return img
207
+
208
+ def __repr__(self):
209
+ format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
210
+ format_string += f', interpolation={self.interpolation}'
211
+ format_string += f', longest={self.longest:.3f})'
212
+ return format_string
213
+
214
+ def _setup_size(size, error_msg):
215
+ if isinstance(size, numbers.Number):
216
+ return int(size), int(size)
217
+
218
+ if isinstance(size, Sequence) and len(size) == 1:
219
+ return size[0], size[0]
220
+
221
+ if len(size) != 2:
222
+ raise ValueError(error_msg)
223
+
224
+ return size
225
+
226
+ def center_crop_or_pad(img: torch.Tensor, output_size: List[int], fill=0) -> torch.Tensor:
227
+ """Center crops and/or pads the given image.
228
+ If the image is torch Tensor, it is expected
229
+ to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
230
+ If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
231
+
232
+ Args:
233
+ img (PIL Image or Tensor): Image to be cropped.
234
+ output_size (sequence or int): (height, width) of the crop box. If int or sequence with single int,
235
+ it is used for both directions.
236
+ fill (int, Tuple[int]): Padding color
237
+
238
+ Returns:
239
+ PIL Image or Tensor: Cropped image.
240
+ """
241
+ if isinstance(output_size, numbers.Number):
242
+ output_size = (int(output_size), int(output_size))
243
+ elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
244
+ output_size = (output_size[0], output_size[0])
245
+
246
+ _, image_height, image_width = F.get_dimensions(img)
247
+ crop_height, crop_width = output_size
248
+
249
+ if crop_width > image_width or crop_height > image_height:
250
+ padding_ltrb = [
251
+ (crop_width - image_width) // 2 if crop_width > image_width else 0,
252
+ (crop_height - image_height) // 2 if crop_height > image_height else 0,
253
+ (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
254
+ (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
255
+ ]
256
+ img = F.pad(img, padding_ltrb, fill=fill)
257
+ _, image_height, image_width = F.get_dimensions(img)
258
+ if crop_width == image_width and crop_height == image_height:
259
+ return img
260
+
261
+ crop_top = int(round((image_height - crop_height) / 2.0))
262
+ crop_left = int(round((image_width - crop_width) / 2.0))
263
+ return F.crop(img, crop_top, crop_left, crop_height, crop_width)
264
+
265
+ class CenterCropOrPad(torch.nn.Module):
266
+ """Crops the given image at the center.
267
+ If the image is torch Tensor, it is expected
268
+ to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
269
+ If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
270
+
271
+ Args:
272
+ size (sequence or int): Desired output size of the crop. If size is an
273
+ int instead of sequence like (h, w), a square crop (size, size) is
274
+ made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
275
+ """
276
+
277
+ def __init__(self, size, fill=0):
278
+ super().__init__()
279
+ self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
280
+ self.fill = fill
281
+
282
+ def forward(self, img):
283
+ """
284
+ Args:
285
+ img (PIL Image or Tensor): Image to be cropped.
286
+
287
+ Returns:
288
+ PIL Image or Tensor: Cropped image.
289
+ """
290
+ return center_crop_or_pad(img, self.size, fill=self.fill)
291
+
292
+ def __repr__(self) -> str:
293
+ return f"{self.__class__.__name__}(size={self.size})"
294
+
295
+ def process_anyres_image(image, processor, processor_size, grid_pinpoints):
296
+ """
297
+ Process an image with variable resolutions.
298
+
299
+ Args:
300
+ image (PIL.Image.Image): The input image to be processed.
301
+ processor: The image processor object.
302
+ processor_size (tuple, list): The size of the image processor.
303
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
304
+
305
+ Returns:
306
+ torch.Tensor: A tensor containing the processed image patches.
307
+ """
308
+ # FIXME: determine grid_pinpoints from image sizes.
309
+ if type(grid_pinpoints) is list:
310
+ possible_resolutions = grid_pinpoints
311
+ else:
312
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
313
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
314
+ image_padded = resize_and_pad_image(image, best_resolution)
315
+
316
+ # processor_size = processor.transforms[0].size
317
+ patches = divide_to_patches(image_padded, processor_size[0])
318
+
319
+ image_original_resize = image.resize((processor_size[0], processor_size[0]))
320
+
321
+ image_patches = [image_original_resize] + patches
322
+ image_patches = [processor(image_patch)
323
+ for image_patch in image_patches]
324
+ return torch.stack(image_patches, dim=0)
325
+
326
+
327
+ def select_best_resolution(original_size, possible_resolutions):
328
+ """
329
+ Selects the best resolution from a list of possible resolutions based on the original size.
330
+
331
+ Args:
332
+ original_size (tuple): The original size of the image in the format (width, height).
333
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
334
+
335
+ Returns:
336
+ tuple: The best fit resolution in the format (width, height).
337
+ """
338
+ original_width, original_height = original_size
339
+ best_fit = None
340
+ max_effective_resolution = 0
341
+ min_wasted_resolution = float('inf')
342
+
343
+ for width, height in possible_resolutions:
344
+ scale = min(width / original_width, height / original_height)
345
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
346
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
347
+ wasted_resolution = (width * height) - effective_resolution
348
+
349
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
350
+ max_effective_resolution = effective_resolution
351
+ min_wasted_resolution = wasted_resolution
352
+ best_fit = (width, height)
353
+
354
+ return best_fit
355
+
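+ # Worked example (illustrative): for a 1000x600 PIL image and the 378-px anyres grid used in
+ # Blip3ImageProcessor.preprocess, the 2x2 grid wins because it preserves the most effective
+ # resolution after an aspect-preserving downscale:
+ #   select_best_resolution((1000, 600), [[378, 756], [756, 378], [756, 756], [1134, 378], [378, 1134]])
+ #   -> (756, 756)
+ # process_anyres_image then pads the resized image to 756x756, cuts four 378x378 patches,
+ # prepends the global 378x378 resize, and stacks them into a (5, 3, 378, 378) tensor.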
356
+ def resize_and_pad_image(image, target_resolution):
357
+ """
358
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
359
+
360
+ Args:
361
+ image (PIL.Image.Image): The input image.
362
+ target_resolution (tuple): The target resolution (width, height) of the image.
363
+
364
+ Returns:
365
+ PIL.Image.Image: The resized and padded image.
366
+ """
367
+ original_width, original_height = image.size
368
+ target_width, target_height = target_resolution
369
+
370
+ scale_w = target_width / original_width
371
+ scale_h = target_height / original_height
372
+
373
+ if scale_w < scale_h:
374
+ new_width = target_width
375
+ new_height = min(math.ceil(original_height * scale_w), target_height)
376
+ else:
377
+ new_height = target_height
378
+ new_width = min(math.ceil(original_width * scale_h), target_width)
379
+
380
+ # Resize the image
381
+ resized_image = image.resize((new_width, new_height))
382
+
383
+ new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
384
+ paste_x = (target_width - new_width) // 2
385
+ paste_y = (target_height - new_height) // 2
386
+ new_image.paste(resized_image, (paste_x, paste_y))
387
+
388
+ return new_image
389
+
390
+ def divide_to_patches(image, patch_size):
391
+ """
392
+ Divides an image into patches of a specified size.
393
+
394
+ Args:
395
+ image (PIL.Image.Image): The input image.
396
+ patch_size (int): The size of each patch.
397
+
398
+ Returns:
399
+ list: A list of PIL.Image.Image objects representing the patches.
400
+ """
401
+ patches = []
402
+ width, height = image.size
403
+ for i in range(0, height, patch_size):
404
+ for j in range(0, width, patch_size):
405
+ box = (j, i, j + patch_size, i + patch_size)
406
+ patch = image.crop(box)
407
+ patches.append(patch)
408
+
409
+ return patches
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e6acb1fba540ae862c8ec5a3f0898a25f97df07029235aa164be537e13e664b
3
+ size 4977054880
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edf7152efe2adbf112f09395c1a443d53b439cba0342285dae7ab6225281dcaa
3
+ size 4983112128
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1afa3c02b194f2a1cd36f296ef7690c42f356b9b48e98644011b59336b0699a
3
+ size 4983112168
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b69d6172f961180def49586fe73b71c2bd2e4ba968564f276486e86030a1da36
3
+ size 3414256548
model.safetensors.index.json ADDED
@@ -0,0 +1,673 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 18357448972
4
+ },
5
+ "weight_map": {
6
+ "vlm.lang_model.lm_head.additional_fc.bias": "model-00004-of-00004.safetensors",
7
+ "vlm.lang_model.lm_head.additional_fc.weight": "model-00004-of-00004.safetensors",
8
+ "vlm.lang_model.lm_head.bias": "model-00004-of-00004.safetensors",
9
+ "vlm.lang_model.lm_head.weight": "model-00004-of-00004.safetensors",
10
+ "vlm.lang_model.model.embed_tokens.additional_embedding.weight": "model-00001-of-00004.safetensors",
11
+ "vlm.lang_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
12
+ "vlm.lang_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "vlm.lang_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
14
+ "vlm.lang_model.model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
15
+ "vlm.lang_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
16
+ "vlm.lang_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "vlm.lang_model.model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
18
+ "vlm.lang_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "vlm.lang_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
20
+ "vlm.lang_model.model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
21
+ "vlm.lang_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "vlm.lang_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
23
+ "vlm.lang_model.model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
24
+ "vlm.lang_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
25
+ "vlm.lang_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
26
+ "vlm.lang_model.model.layers.10.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
27
+ "vlm.lang_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
28
+ "vlm.lang_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
29
+ "vlm.lang_model.model.layers.10.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
30
+ "vlm.lang_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
31
+ "vlm.lang_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
32
+ "vlm.lang_model.model.layers.11.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
33
+ "vlm.lang_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "vlm.lang_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
35
+ "vlm.lang_model.model.layers.11.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
36
+ "vlm.lang_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "vlm.lang_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
38
+ "vlm.lang_model.model.layers.12.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
39
+ "vlm.lang_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
40
+ "vlm.lang_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "vlm.lang_model.model.layers.12.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
42
+ "vlm.lang_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
43
+ "vlm.lang_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
44
+ "vlm.lang_model.model.layers.13.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
45
+ "vlm.lang_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "vlm.lang_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
47
+ "vlm.lang_model.model.layers.13.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
48
+ "vlm.lang_model.model.layers.14.input_layernorm.weight": "model-00003-of-00004.safetensors",
49
+ "vlm.lang_model.model.layers.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
50
+ "vlm.lang_model.model.layers.14.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
51
+ "vlm.lang_model.model.layers.14.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
52
+ "vlm.lang_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "vlm.lang_model.model.layers.14.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
54
+ "vlm.lang_model.model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
55
+ "vlm.lang_model.model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
56
+ "vlm.lang_model.model.layers.15.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
57
+ "vlm.lang_model.model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
58
+ "vlm.lang_model.model.layers.15.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
59
+ "vlm.lang_model.model.layers.15.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
60
+ "vlm.lang_model.model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
61
+ "vlm.lang_model.model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
62
+ "vlm.lang_model.model.layers.16.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
63
+ "vlm.lang_model.model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
64
+ "vlm.lang_model.model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
65
+ "vlm.lang_model.model.layers.16.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
66
+ "vlm.lang_model.model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
67
+ "vlm.lang_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
68
+ "vlm.lang_model.model.layers.17.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
69
+ "vlm.lang_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
70
+ "vlm.lang_model.model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
71
+ "vlm.lang_model.model.layers.17.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
72
+ "vlm.lang_model.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
73
+ "vlm.lang_model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
74
+ "vlm.lang_model.model.layers.18.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
75
+ "vlm.lang_model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
76
+ "vlm.lang_model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
77
+ "vlm.lang_model.model.layers.18.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
78
+ "vlm.lang_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
79
+ "vlm.lang_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
80
+ "vlm.lang_model.model.layers.19.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
81
+ "vlm.lang_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
82
+ "vlm.lang_model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
83
+ "vlm.lang_model.model.layers.19.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
84
+ "vlm.lang_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
85
+ "vlm.lang_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
86
+ "vlm.lang_model.model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
87
+ "vlm.lang_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
88
+ "vlm.lang_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
89
+ "vlm.lang_model.model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
90
+ "vlm.lang_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
91
+ "vlm.lang_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
92
+ "vlm.lang_model.model.layers.20.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
93
+ "vlm.lang_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
94
+ "vlm.lang_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
95
+ "vlm.lang_model.model.layers.20.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
96
+ "vlm.lang_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
97
+ "vlm.lang_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
98
+ "vlm.lang_model.model.layers.21.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
99
+ "vlm.lang_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
100
+ "vlm.lang_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
101
+ "vlm.lang_model.model.layers.21.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
102
+ "vlm.lang_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
103
+ "vlm.lang_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
104
+ "vlm.lang_model.model.layers.22.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
105
+ "vlm.lang_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
106
+ "vlm.lang_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
107
+ "vlm.lang_model.model.layers.22.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
108
+ "vlm.lang_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
109
+ "vlm.lang_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
110
+ "vlm.lang_model.model.layers.23.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
111
+ "vlm.lang_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
112
+ "vlm.lang_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
113
+ "vlm.lang_model.model.layers.23.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
114
+ "vlm.lang_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
115
+ "vlm.lang_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
116
+ "vlm.lang_model.model.layers.24.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
117
+ "vlm.lang_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
118
+ "vlm.lang_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
119
+ "vlm.lang_model.model.layers.24.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
120
+ "vlm.lang_model.model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
121
+ "vlm.lang_model.model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
122
+ "vlm.lang_model.model.layers.25.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
123
+ "vlm.lang_model.model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
124
+ "vlm.lang_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
125
+ "vlm.lang_model.model.layers.25.self_attn.qkv_proj.weight": "model-00003-of-00004.safetensors",
126
+ "vlm.lang_model.model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
127
+ "vlm.lang_model.model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
128
+ "vlm.lang_model.model.layers.26.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
129
+ "vlm.lang_model.model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
130
+ "vlm.lang_model.model.layers.26.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
131
+ "vlm.lang_model.model.layers.26.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
132
+ "vlm.lang_model.model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
133
+ "vlm.lang_model.model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
134
+ "vlm.lang_model.model.layers.27.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
135
+ "vlm.lang_model.model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
136
+ "vlm.lang_model.model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
137
+ "vlm.lang_model.model.layers.27.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
138
+ "vlm.lang_model.model.layers.28.input_layernorm.weight": "model-00004-of-00004.safetensors",
139
+ "vlm.lang_model.model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
140
+ "vlm.lang_model.model.layers.28.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
141
+ "vlm.lang_model.model.layers.28.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
142
+ "vlm.lang_model.model.layers.28.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
143
+ "vlm.lang_model.model.layers.28.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
144
+ "vlm.lang_model.model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
145
+ "vlm.lang_model.model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
146
+ "vlm.lang_model.model.layers.29.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
147
+ "vlm.lang_model.model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
148
+ "vlm.lang_model.model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
149
+ "vlm.lang_model.model.layers.29.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
150
+ "vlm.lang_model.model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
151
+ "vlm.lang_model.model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
152
+ "vlm.lang_model.model.layers.3.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
153
+ "vlm.lang_model.model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
154
+ "vlm.lang_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
155
+ "vlm.lang_model.model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00004.safetensors",
156
+ "vlm.lang_model.model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
157
+ "vlm.lang_model.model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
158
+ "vlm.lang_model.model.layers.30.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
159
+ "vlm.lang_model.model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
160
+ "vlm.lang_model.model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
161
+ "vlm.lang_model.model.layers.30.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
162
+ "vlm.lang_model.model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
163
+ "vlm.lang_model.model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
164
+ "vlm.lang_model.model.layers.31.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
165
+ "vlm.lang_model.model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
166
+ "vlm.lang_model.model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
167
+ "vlm.lang_model.model.layers.31.self_attn.qkv_proj.weight": "model-00004-of-00004.safetensors",
168
+ "vlm.lang_model.model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
169
+ "vlm.lang_model.model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
170
+ "vlm.lang_model.model.layers.4.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
171
+ "vlm.lang_model.model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
172
+ "vlm.lang_model.model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
173
+ "vlm.lang_model.model.layers.4.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
174
+ "vlm.lang_model.model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
175
+ "vlm.lang_model.model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
176
+ "vlm.lang_model.model.layers.5.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
177
+ "vlm.lang_model.model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
178
+ "vlm.lang_model.model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
179
+ "vlm.lang_model.model.layers.5.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
180
+ "vlm.lang_model.model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
181
+ "vlm.lang_model.model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
182
+ "vlm.lang_model.model.layers.6.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
183
+ "vlm.lang_model.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
184
+ "vlm.lang_model.model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
185
+ "vlm.lang_model.model.layers.6.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
186
+ "vlm.lang_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
187
+ "vlm.lang_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
188
+ "vlm.lang_model.model.layers.7.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
189
+ "vlm.lang_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
190
+ "vlm.lang_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
191
+ "vlm.lang_model.model.layers.7.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
192
+ "vlm.lang_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
193
+ "vlm.lang_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
194
+ "vlm.lang_model.model.layers.8.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
195
+ "vlm.lang_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
196
+ "vlm.lang_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
197
+ "vlm.lang_model.model.layers.8.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
198
+ "vlm.lang_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
199
+ "vlm.lang_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
200
+ "vlm.lang_model.model.layers.9.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
201
+ "vlm.lang_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
202
+ "vlm.lang_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
203
+ "vlm.lang_model.model.layers.9.self_attn.qkv_proj.weight": "model-00002-of-00004.safetensors",
204
+ "vlm.lang_model.model.norm.weight": "model-00004-of-00004.safetensors",
205
+ "vlm.vision_encoder.class_embedding": "model-00001-of-00004.safetensors",
206
+ "vlm.vision_encoder.conv1.weight": "model-00001-of-00004.safetensors",
207
+ "vlm.vision_encoder.ln_post.bias": "model-00001-of-00004.safetensors",
208
+ "vlm.vision_encoder.ln_post.weight": "model-00001-of-00004.safetensors",
209
+ "vlm.vision_encoder.ln_pre.bias": "model-00001-of-00004.safetensors",
210
+ "vlm.vision_encoder.ln_pre.weight": "model-00001-of-00004.safetensors",
211
+ "vlm.vision_encoder.positional_embedding": "model-00001-of-00004.safetensors",
212
+ "vlm.vision_encoder.proj": "model-00001-of-00004.safetensors",
213
+ "vlm.vision_encoder.transformer.resblocks.0.attn.in_proj_bias": "model-00001-of-00004.safetensors",
214
+ "vlm.vision_encoder.transformer.resblocks.0.attn.in_proj_weight": "model-00001-of-00004.safetensors",
215
+ "vlm.vision_encoder.transformer.resblocks.0.attn.out_proj.bias": "model-00001-of-00004.safetensors",
216
+ "vlm.vision_encoder.transformer.resblocks.0.attn.out_proj.weight": "model-00001-of-00004.safetensors",
217
+ "vlm.vision_encoder.transformer.resblocks.0.ln_1.bias": "model-00001-of-00004.safetensors",
218
+ "vlm.vision_encoder.transformer.resblocks.0.ln_1.weight": "model-00001-of-00004.safetensors",
219
+ "vlm.vision_encoder.transformer.resblocks.0.ln_2.bias": "model-00001-of-00004.safetensors",
220
+ "vlm.vision_encoder.transformer.resblocks.0.ln_2.weight": "model-00001-of-00004.safetensors",
221
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
222
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
223
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
224
+ "vlm.vision_encoder.transformer.resblocks.0.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
225
+ "vlm.vision_encoder.transformer.resblocks.1.attn.in_proj_bias": "model-00001-of-00004.safetensors",
226
+ "vlm.vision_encoder.transformer.resblocks.1.attn.in_proj_weight": "model-00001-of-00004.safetensors",
227
+ "vlm.vision_encoder.transformer.resblocks.1.attn.out_proj.bias": "model-00001-of-00004.safetensors",
228
+ "vlm.vision_encoder.transformer.resblocks.1.attn.out_proj.weight": "model-00001-of-00004.safetensors",
229
+ "vlm.vision_encoder.transformer.resblocks.1.ln_1.bias": "model-00001-of-00004.safetensors",
230
+ "vlm.vision_encoder.transformer.resblocks.1.ln_1.weight": "model-00001-of-00004.safetensors",
231
+ "vlm.vision_encoder.transformer.resblocks.1.ln_2.bias": "model-00001-of-00004.safetensors",
232
+ "vlm.vision_encoder.transformer.resblocks.1.ln_2.weight": "model-00001-of-00004.safetensors",
233
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
234
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
235
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
236
+ "vlm.vision_encoder.transformer.resblocks.1.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
237
+ "vlm.vision_encoder.transformer.resblocks.10.attn.in_proj_bias": "model-00001-of-00004.safetensors",
238
+ "vlm.vision_encoder.transformer.resblocks.10.attn.in_proj_weight": "model-00001-of-00004.safetensors",
239
+ "vlm.vision_encoder.transformer.resblocks.10.attn.out_proj.bias": "model-00001-of-00004.safetensors",
240
+ "vlm.vision_encoder.transformer.resblocks.10.attn.out_proj.weight": "model-00001-of-00004.safetensors",
241
+ "vlm.vision_encoder.transformer.resblocks.10.ln_1.bias": "model-00001-of-00004.safetensors",
242
+ "vlm.vision_encoder.transformer.resblocks.10.ln_1.weight": "model-00001-of-00004.safetensors",
243
+ "vlm.vision_encoder.transformer.resblocks.10.ln_2.bias": "model-00001-of-00004.safetensors",
244
+ "vlm.vision_encoder.transformer.resblocks.10.ln_2.weight": "model-00001-of-00004.safetensors",
245
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
246
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
247
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
248
+ "vlm.vision_encoder.transformer.resblocks.10.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
249
+ "vlm.vision_encoder.transformer.resblocks.11.attn.in_proj_bias": "model-00001-of-00004.safetensors",
250
+ "vlm.vision_encoder.transformer.resblocks.11.attn.in_proj_weight": "model-00001-of-00004.safetensors",
251
+ "vlm.vision_encoder.transformer.resblocks.11.attn.out_proj.bias": "model-00001-of-00004.safetensors",
252
+ "vlm.vision_encoder.transformer.resblocks.11.attn.out_proj.weight": "model-00001-of-00004.safetensors",
253
+ "vlm.vision_encoder.transformer.resblocks.11.ln_1.bias": "model-00001-of-00004.safetensors",
254
+ "vlm.vision_encoder.transformer.resblocks.11.ln_1.weight": "model-00001-of-00004.safetensors",
255
+ "vlm.vision_encoder.transformer.resblocks.11.ln_2.bias": "model-00001-of-00004.safetensors",
256
+ "vlm.vision_encoder.transformer.resblocks.11.ln_2.weight": "model-00001-of-00004.safetensors",
257
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
258
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
259
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
260
+ "vlm.vision_encoder.transformer.resblocks.11.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
261
+ "vlm.vision_encoder.transformer.resblocks.12.attn.in_proj_bias": "model-00001-of-00004.safetensors",
262
+ "vlm.vision_encoder.transformer.resblocks.12.attn.in_proj_weight": "model-00001-of-00004.safetensors",
263
+ "vlm.vision_encoder.transformer.resblocks.12.attn.out_proj.bias": "model-00001-of-00004.safetensors",
264
+ "vlm.vision_encoder.transformer.resblocks.12.attn.out_proj.weight": "model-00001-of-00004.safetensors",
265
+ "vlm.vision_encoder.transformer.resblocks.12.ln_1.bias": "model-00001-of-00004.safetensors",
266
+ "vlm.vision_encoder.transformer.resblocks.12.ln_1.weight": "model-00001-of-00004.safetensors",
267
+ "vlm.vision_encoder.transformer.resblocks.12.ln_2.bias": "model-00001-of-00004.safetensors",
268
+ "vlm.vision_encoder.transformer.resblocks.12.ln_2.weight": "model-00001-of-00004.safetensors",
269
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
270
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
271
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
272
+ "vlm.vision_encoder.transformer.resblocks.12.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
273
+ "vlm.vision_encoder.transformer.resblocks.13.attn.in_proj_bias": "model-00001-of-00004.safetensors",
274
+ "vlm.vision_encoder.transformer.resblocks.13.attn.in_proj_weight": "model-00001-of-00004.safetensors",
275
+ "vlm.vision_encoder.transformer.resblocks.13.attn.out_proj.bias": "model-00001-of-00004.safetensors",
276
+ "vlm.vision_encoder.transformer.resblocks.13.attn.out_proj.weight": "model-00001-of-00004.safetensors",
277
+ "vlm.vision_encoder.transformer.resblocks.13.ln_1.bias": "model-00001-of-00004.safetensors",
278
+ "vlm.vision_encoder.transformer.resblocks.13.ln_1.weight": "model-00001-of-00004.safetensors",
279
+ "vlm.vision_encoder.transformer.resblocks.13.ln_2.bias": "model-00001-of-00004.safetensors",
280
+ "vlm.vision_encoder.transformer.resblocks.13.ln_2.weight": "model-00001-of-00004.safetensors",
281
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
282
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
283
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
284
+ "vlm.vision_encoder.transformer.resblocks.13.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
285
+ "vlm.vision_encoder.transformer.resblocks.14.attn.in_proj_bias": "model-00001-of-00004.safetensors",
286
+ "vlm.vision_encoder.transformer.resblocks.14.attn.in_proj_weight": "model-00001-of-00004.safetensors",
287
+ "vlm.vision_encoder.transformer.resblocks.14.attn.out_proj.bias": "model-00001-of-00004.safetensors",
288
+ "vlm.vision_encoder.transformer.resblocks.14.attn.out_proj.weight": "model-00001-of-00004.safetensors",
289
+ "vlm.vision_encoder.transformer.resblocks.14.ln_1.bias": "model-00001-of-00004.safetensors",
290
+ "vlm.vision_encoder.transformer.resblocks.14.ln_1.weight": "model-00001-of-00004.safetensors",
291
+ "vlm.vision_encoder.transformer.resblocks.14.ln_2.bias": "model-00001-of-00004.safetensors",
292
+ "vlm.vision_encoder.transformer.resblocks.14.ln_2.weight": "model-00001-of-00004.safetensors",
293
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
294
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
295
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
296
+ "vlm.vision_encoder.transformer.resblocks.14.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
297
+ "vlm.vision_encoder.transformer.resblocks.15.attn.in_proj_bias": "model-00001-of-00004.safetensors",
298
+ "vlm.vision_encoder.transformer.resblocks.15.attn.in_proj_weight": "model-00001-of-00004.safetensors",
299
+ "vlm.vision_encoder.transformer.resblocks.15.attn.out_proj.bias": "model-00001-of-00004.safetensors",
300
+ "vlm.vision_encoder.transformer.resblocks.15.attn.out_proj.weight": "model-00001-of-00004.safetensors",
301
+ "vlm.vision_encoder.transformer.resblocks.15.ln_1.bias": "model-00001-of-00004.safetensors",
302
+ "vlm.vision_encoder.transformer.resblocks.15.ln_1.weight": "model-00001-of-00004.safetensors",
303
+ "vlm.vision_encoder.transformer.resblocks.15.ln_2.bias": "model-00001-of-00004.safetensors",
304
+ "vlm.vision_encoder.transformer.resblocks.15.ln_2.weight": "model-00001-of-00004.safetensors",
305
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
306
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
307
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
308
+ "vlm.vision_encoder.transformer.resblocks.15.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
309
+ "vlm.vision_encoder.transformer.resblocks.16.attn.in_proj_bias": "model-00001-of-00004.safetensors",
310
+ "vlm.vision_encoder.transformer.resblocks.16.attn.in_proj_weight": "model-00001-of-00004.safetensors",
311
+ "vlm.vision_encoder.transformer.resblocks.16.attn.out_proj.bias": "model-00001-of-00004.safetensors",
312
+ "vlm.vision_encoder.transformer.resblocks.16.attn.out_proj.weight": "model-00001-of-00004.safetensors",
313
+ "vlm.vision_encoder.transformer.resblocks.16.ln_1.bias": "model-00001-of-00004.safetensors",
314
+ "vlm.vision_encoder.transformer.resblocks.16.ln_1.weight": "model-00001-of-00004.safetensors",
315
+ "vlm.vision_encoder.transformer.resblocks.16.ln_2.bias": "model-00001-of-00004.safetensors",
316
+ "vlm.vision_encoder.transformer.resblocks.16.ln_2.weight": "model-00001-of-00004.safetensors",
317
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
318
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
319
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
320
+ "vlm.vision_encoder.transformer.resblocks.16.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
321
+ "vlm.vision_encoder.transformer.resblocks.17.attn.in_proj_bias": "model-00001-of-00004.safetensors",
322
+ "vlm.vision_encoder.transformer.resblocks.17.attn.in_proj_weight": "model-00001-of-00004.safetensors",
323
+ "vlm.vision_encoder.transformer.resblocks.17.attn.out_proj.bias": "model-00001-of-00004.safetensors",
324
+ "vlm.vision_encoder.transformer.resblocks.17.attn.out_proj.weight": "model-00001-of-00004.safetensors",
325
+ "vlm.vision_encoder.transformer.resblocks.17.ln_1.bias": "model-00001-of-00004.safetensors",
326
+ "vlm.vision_encoder.transformer.resblocks.17.ln_1.weight": "model-00001-of-00004.safetensors",
327
+ "vlm.vision_encoder.transformer.resblocks.17.ln_2.bias": "model-00001-of-00004.safetensors",
328
+ "vlm.vision_encoder.transformer.resblocks.17.ln_2.weight": "model-00001-of-00004.safetensors",
329
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
330
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
331
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
332
+ "vlm.vision_encoder.transformer.resblocks.17.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
333
+ "vlm.vision_encoder.transformer.resblocks.18.attn.in_proj_bias": "model-00001-of-00004.safetensors",
334
+ "vlm.vision_encoder.transformer.resblocks.18.attn.in_proj_weight": "model-00001-of-00004.safetensors",
335
+ "vlm.vision_encoder.transformer.resblocks.18.attn.out_proj.bias": "model-00001-of-00004.safetensors",
336
+ "vlm.vision_encoder.transformer.resblocks.18.attn.out_proj.weight": "model-00001-of-00004.safetensors",
337
+ "vlm.vision_encoder.transformer.resblocks.18.ln_1.bias": "model-00001-of-00004.safetensors",
338
+ "vlm.vision_encoder.transformer.resblocks.18.ln_1.weight": "model-00001-of-00004.safetensors",
339
+ "vlm.vision_encoder.transformer.resblocks.18.ln_2.bias": "model-00001-of-00004.safetensors",
340
+ "vlm.vision_encoder.transformer.resblocks.18.ln_2.weight": "model-00001-of-00004.safetensors",
341
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
342
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
343
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
344
+ "vlm.vision_encoder.transformer.resblocks.18.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
345
+ "vlm.vision_encoder.transformer.resblocks.19.attn.in_proj_bias": "model-00001-of-00004.safetensors",
346
+ "vlm.vision_encoder.transformer.resblocks.19.attn.in_proj_weight": "model-00001-of-00004.safetensors",
347
+ "vlm.vision_encoder.transformer.resblocks.19.attn.out_proj.bias": "model-00001-of-00004.safetensors",
348
+ "vlm.vision_encoder.transformer.resblocks.19.attn.out_proj.weight": "model-00001-of-00004.safetensors",
349
+ "vlm.vision_encoder.transformer.resblocks.19.ln_1.bias": "model-00001-of-00004.safetensors",
350
+ "vlm.vision_encoder.transformer.resblocks.19.ln_1.weight": "model-00001-of-00004.safetensors",
351
+ "vlm.vision_encoder.transformer.resblocks.19.ln_2.bias": "model-00001-of-00004.safetensors",
352
+ "vlm.vision_encoder.transformer.resblocks.19.ln_2.weight": "model-00001-of-00004.safetensors",
353
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
354
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
355
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
356
+ "vlm.vision_encoder.transformer.resblocks.19.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
357
+ "vlm.vision_encoder.transformer.resblocks.2.attn.in_proj_bias": "model-00001-of-00004.safetensors",
358
+ "vlm.vision_encoder.transformer.resblocks.2.attn.in_proj_weight": "model-00001-of-00004.safetensors",
359
+ "vlm.vision_encoder.transformer.resblocks.2.attn.out_proj.bias": "model-00001-of-00004.safetensors",
360
+ "vlm.vision_encoder.transformer.resblocks.2.attn.out_proj.weight": "model-00001-of-00004.safetensors",
361
+ "vlm.vision_encoder.transformer.resblocks.2.ln_1.bias": "model-00001-of-00004.safetensors",
362
+ "vlm.vision_encoder.transformer.resblocks.2.ln_1.weight": "model-00001-of-00004.safetensors",
363
+ "vlm.vision_encoder.transformer.resblocks.2.ln_2.bias": "model-00001-of-00004.safetensors",
364
+ "vlm.vision_encoder.transformer.resblocks.2.ln_2.weight": "model-00001-of-00004.safetensors",
365
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
366
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
367
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
368
+ "vlm.vision_encoder.transformer.resblocks.2.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
369
+ "vlm.vision_encoder.transformer.resblocks.20.attn.in_proj_bias": "model-00001-of-00004.safetensors",
370
+ "vlm.vision_encoder.transformer.resblocks.20.attn.in_proj_weight": "model-00001-of-00004.safetensors",
371
+ "vlm.vision_encoder.transformer.resblocks.20.attn.out_proj.bias": "model-00001-of-00004.safetensors",
372
+ "vlm.vision_encoder.transformer.resblocks.20.attn.out_proj.weight": "model-00001-of-00004.safetensors",
373
+ "vlm.vision_encoder.transformer.resblocks.20.ln_1.bias": "model-00001-of-00004.safetensors",
374
+ "vlm.vision_encoder.transformer.resblocks.20.ln_1.weight": "model-00001-of-00004.safetensors",
375
+ "vlm.vision_encoder.transformer.resblocks.20.ln_2.bias": "model-00001-of-00004.safetensors",
376
+ "vlm.vision_encoder.transformer.resblocks.20.ln_2.weight": "model-00001-of-00004.safetensors",
377
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
378
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
379
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
380
+ "vlm.vision_encoder.transformer.resblocks.20.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
381
+ "vlm.vision_encoder.transformer.resblocks.21.attn.in_proj_bias": "model-00001-of-00004.safetensors",
382
+ "vlm.vision_encoder.transformer.resblocks.21.attn.in_proj_weight": "model-00001-of-00004.safetensors",
383
+ "vlm.vision_encoder.transformer.resblocks.21.attn.out_proj.bias": "model-00001-of-00004.safetensors",
384
+ "vlm.vision_encoder.transformer.resblocks.21.attn.out_proj.weight": "model-00001-of-00004.safetensors",
385
+ "vlm.vision_encoder.transformer.resblocks.21.ln_1.bias": "model-00001-of-00004.safetensors",
386
+ "vlm.vision_encoder.transformer.resblocks.21.ln_1.weight": "model-00001-of-00004.safetensors",
387
+ "vlm.vision_encoder.transformer.resblocks.21.ln_2.bias": "model-00001-of-00004.safetensors",
388
+ "vlm.vision_encoder.transformer.resblocks.21.ln_2.weight": "model-00001-of-00004.safetensors",
389
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
390
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
391
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
392
+ "vlm.vision_encoder.transformer.resblocks.21.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
393
+ "vlm.vision_encoder.transformer.resblocks.22.attn.in_proj_bias": "model-00001-of-00004.safetensors",
394
+ "vlm.vision_encoder.transformer.resblocks.22.attn.in_proj_weight": "model-00001-of-00004.safetensors",
395
+ "vlm.vision_encoder.transformer.resblocks.22.attn.out_proj.bias": "model-00001-of-00004.safetensors",
396
+ "vlm.vision_encoder.transformer.resblocks.22.attn.out_proj.weight": "model-00001-of-00004.safetensors",
397
+ "vlm.vision_encoder.transformer.resblocks.22.ln_1.bias": "model-00001-of-00004.safetensors",
398
+ "vlm.vision_encoder.transformer.resblocks.22.ln_1.weight": "model-00001-of-00004.safetensors",
399
+ "vlm.vision_encoder.transformer.resblocks.22.ln_2.bias": "model-00001-of-00004.safetensors",
400
+ "vlm.vision_encoder.transformer.resblocks.22.ln_2.weight": "model-00001-of-00004.safetensors",
401
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
402
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
403
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
404
+ "vlm.vision_encoder.transformer.resblocks.22.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
405
+ "vlm.vision_encoder.transformer.resblocks.23.attn.in_proj_bias": "model-00001-of-00004.safetensors",
406
+ "vlm.vision_encoder.transformer.resblocks.23.attn.in_proj_weight": "model-00001-of-00004.safetensors",
407
+ "vlm.vision_encoder.transformer.resblocks.23.attn.out_proj.bias": "model-00001-of-00004.safetensors",
408
+ "vlm.vision_encoder.transformer.resblocks.23.attn.out_proj.weight": "model-00001-of-00004.safetensors",
409
+ "vlm.vision_encoder.transformer.resblocks.23.ln_1.bias": "model-00001-of-00004.safetensors",
410
+ "vlm.vision_encoder.transformer.resblocks.23.ln_1.weight": "model-00001-of-00004.safetensors",
411
+ "vlm.vision_encoder.transformer.resblocks.23.ln_2.bias": "model-00001-of-00004.safetensors",
412
+ "vlm.vision_encoder.transformer.resblocks.23.ln_2.weight": "model-00001-of-00004.safetensors",
413
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
414
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
415
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
416
+ "vlm.vision_encoder.transformer.resblocks.23.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
417
+ "vlm.vision_encoder.transformer.resblocks.24.attn.in_proj_bias": "model-00001-of-00004.safetensors",
418
+ "vlm.vision_encoder.transformer.resblocks.24.attn.in_proj_weight": "model-00001-of-00004.safetensors",
419
+ "vlm.vision_encoder.transformer.resblocks.24.attn.out_proj.bias": "model-00001-of-00004.safetensors",
420
+ "vlm.vision_encoder.transformer.resblocks.24.attn.out_proj.weight": "model-00001-of-00004.safetensors",
421
+ "vlm.vision_encoder.transformer.resblocks.24.ln_1.bias": "model-00001-of-00004.safetensors",
422
+ "vlm.vision_encoder.transformer.resblocks.24.ln_1.weight": "model-00001-of-00004.safetensors",
423
+ "vlm.vision_encoder.transformer.resblocks.24.ln_2.bias": "model-00001-of-00004.safetensors",
424
+ "vlm.vision_encoder.transformer.resblocks.24.ln_2.weight": "model-00001-of-00004.safetensors",
425
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
426
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
427
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
428
+ "vlm.vision_encoder.transformer.resblocks.24.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
429
+ "vlm.vision_encoder.transformer.resblocks.25.attn.in_proj_bias": "model-00001-of-00004.safetensors",
430
+ "vlm.vision_encoder.transformer.resblocks.25.attn.in_proj_weight": "model-00001-of-00004.safetensors",
431
+ "vlm.vision_encoder.transformer.resblocks.25.attn.out_proj.bias": "model-00001-of-00004.safetensors",
432
+ "vlm.vision_encoder.transformer.resblocks.25.attn.out_proj.weight": "model-00001-of-00004.safetensors",
433
+ "vlm.vision_encoder.transformer.resblocks.25.ln_1.bias": "model-00001-of-00004.safetensors",
434
+ "vlm.vision_encoder.transformer.resblocks.25.ln_1.weight": "model-00001-of-00004.safetensors",
435
+ "vlm.vision_encoder.transformer.resblocks.25.ln_2.bias": "model-00001-of-00004.safetensors",
436
+ "vlm.vision_encoder.transformer.resblocks.25.ln_2.weight": "model-00001-of-00004.safetensors",
437
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
438
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
439
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
440
+ "vlm.vision_encoder.transformer.resblocks.25.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
441
+ "vlm.vision_encoder.transformer.resblocks.26.attn.in_proj_bias": "model-00001-of-00004.safetensors",
442
+ "vlm.vision_encoder.transformer.resblocks.26.attn.in_proj_weight": "model-00001-of-00004.safetensors",
443
+ "vlm.vision_encoder.transformer.resblocks.26.attn.out_proj.bias": "model-00001-of-00004.safetensors",
444
+ "vlm.vision_encoder.transformer.resblocks.26.attn.out_proj.weight": "model-00001-of-00004.safetensors",
445
+ "vlm.vision_encoder.transformer.resblocks.26.ln_1.bias": "model-00001-of-00004.safetensors",
446
+ "vlm.vision_encoder.transformer.resblocks.26.ln_1.weight": "model-00001-of-00004.safetensors",
447
+ "vlm.vision_encoder.transformer.resblocks.26.ln_2.bias": "model-00001-of-00004.safetensors",
448
+ "vlm.vision_encoder.transformer.resblocks.26.ln_2.weight": "model-00001-of-00004.safetensors",
449
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
450
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
451
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
452
+ "vlm.vision_encoder.transformer.resblocks.26.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
453
+ "vlm.vision_encoder.transformer.resblocks.27.attn.in_proj_bias": "model-00001-of-00004.safetensors",
454
+ "vlm.vision_encoder.transformer.resblocks.27.attn.in_proj_weight": "model-00001-of-00004.safetensors",
455
+ "vlm.vision_encoder.transformer.resblocks.27.attn.out_proj.bias": "model-00001-of-00004.safetensors",
456
+ "vlm.vision_encoder.transformer.resblocks.27.attn.out_proj.weight": "model-00001-of-00004.safetensors",
457
+ "vlm.vision_encoder.transformer.resblocks.27.ln_1.bias": "model-00001-of-00004.safetensors",
458
+ "vlm.vision_encoder.transformer.resblocks.27.ln_1.weight": "model-00001-of-00004.safetensors",
459
+ "vlm.vision_encoder.transformer.resblocks.27.ln_2.bias": "model-00001-of-00004.safetensors",
460
+ "vlm.vision_encoder.transformer.resblocks.27.ln_2.weight": "model-00001-of-00004.safetensors",
461
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
462
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
463
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
464
+ "vlm.vision_encoder.transformer.resblocks.27.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
465
+ "vlm.vision_encoder.transformer.resblocks.28.attn.in_proj_bias": "model-00001-of-00004.safetensors",
466
+ "vlm.vision_encoder.transformer.resblocks.28.attn.in_proj_weight": "model-00001-of-00004.safetensors",
467
+ "vlm.vision_encoder.transformer.resblocks.28.attn.out_proj.bias": "model-00001-of-00004.safetensors",
468
+ "vlm.vision_encoder.transformer.resblocks.28.attn.out_proj.weight": "model-00001-of-00004.safetensors",
469
+ "vlm.vision_encoder.transformer.resblocks.28.ln_1.bias": "model-00001-of-00004.safetensors",
470
+ "vlm.vision_encoder.transformer.resblocks.28.ln_1.weight": "model-00001-of-00004.safetensors",
471
+ "vlm.vision_encoder.transformer.resblocks.28.ln_2.bias": "model-00001-of-00004.safetensors",
472
+ "vlm.vision_encoder.transformer.resblocks.28.ln_2.weight": "model-00001-of-00004.safetensors",
473
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
474
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
475
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
476
+ "vlm.vision_encoder.transformer.resblocks.28.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
477
+ "vlm.vision_encoder.transformer.resblocks.29.attn.in_proj_bias": "model-00001-of-00004.safetensors",
478
+ "vlm.vision_encoder.transformer.resblocks.29.attn.in_proj_weight": "model-00001-of-00004.safetensors",
479
+ "vlm.vision_encoder.transformer.resblocks.29.attn.out_proj.bias": "model-00001-of-00004.safetensors",
480
+ "vlm.vision_encoder.transformer.resblocks.29.attn.out_proj.weight": "model-00001-of-00004.safetensors",
481
+ "vlm.vision_encoder.transformer.resblocks.29.ln_1.bias": "model-00001-of-00004.safetensors",
482
+ "vlm.vision_encoder.transformer.resblocks.29.ln_1.weight": "model-00001-of-00004.safetensors",
483
+ "vlm.vision_encoder.transformer.resblocks.29.ln_2.bias": "model-00001-of-00004.safetensors",
484
+ "vlm.vision_encoder.transformer.resblocks.29.ln_2.weight": "model-00001-of-00004.safetensors",
485
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
486
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
487
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
488
+ "vlm.vision_encoder.transformer.resblocks.29.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
489
+ "vlm.vision_encoder.transformer.resblocks.3.attn.in_proj_bias": "model-00001-of-00004.safetensors",
490
+ "vlm.vision_encoder.transformer.resblocks.3.attn.in_proj_weight": "model-00001-of-00004.safetensors",
491
+ "vlm.vision_encoder.transformer.resblocks.3.attn.out_proj.bias": "model-00001-of-00004.safetensors",
492
+ "vlm.vision_encoder.transformer.resblocks.3.attn.out_proj.weight": "model-00001-of-00004.safetensors",
493
+ "vlm.vision_encoder.transformer.resblocks.3.ln_1.bias": "model-00001-of-00004.safetensors",
494
+ "vlm.vision_encoder.transformer.resblocks.3.ln_1.weight": "model-00001-of-00004.safetensors",
495
+ "vlm.vision_encoder.transformer.resblocks.3.ln_2.bias": "model-00001-of-00004.safetensors",
496
+ "vlm.vision_encoder.transformer.resblocks.3.ln_2.weight": "model-00001-of-00004.safetensors",
497
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
498
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
499
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
500
+ "vlm.vision_encoder.transformer.resblocks.3.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
501
+ "vlm.vision_encoder.transformer.resblocks.30.attn.in_proj_bias": "model-00001-of-00004.safetensors",
502
+ "vlm.vision_encoder.transformer.resblocks.30.attn.in_proj_weight": "model-00001-of-00004.safetensors",
503
+ "vlm.vision_encoder.transformer.resblocks.30.attn.out_proj.bias": "model-00001-of-00004.safetensors",
504
+ "vlm.vision_encoder.transformer.resblocks.30.attn.out_proj.weight": "model-00001-of-00004.safetensors",
505
+ "vlm.vision_encoder.transformer.resblocks.30.ln_1.bias": "model-00001-of-00004.safetensors",
506
+ "vlm.vision_encoder.transformer.resblocks.30.ln_1.weight": "model-00001-of-00004.safetensors",
507
+ "vlm.vision_encoder.transformer.resblocks.30.ln_2.bias": "model-00001-of-00004.safetensors",
508
+ "vlm.vision_encoder.transformer.resblocks.30.ln_2.weight": "model-00001-of-00004.safetensors",
509
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
510
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
511
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
512
+ "vlm.vision_encoder.transformer.resblocks.30.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
513
+ "vlm.vision_encoder.transformer.resblocks.31.attn.in_proj_bias": "model-00001-of-00004.safetensors",
514
+ "vlm.vision_encoder.transformer.resblocks.31.attn.in_proj_weight": "model-00001-of-00004.safetensors",
515
+ "vlm.vision_encoder.transformer.resblocks.31.attn.out_proj.bias": "model-00001-of-00004.safetensors",
516
+ "vlm.vision_encoder.transformer.resblocks.31.attn.out_proj.weight": "model-00001-of-00004.safetensors",
517
+ "vlm.vision_encoder.transformer.resblocks.31.ln_1.bias": "model-00001-of-00004.safetensors",
518
+ "vlm.vision_encoder.transformer.resblocks.31.ln_1.weight": "model-00001-of-00004.safetensors",
519
+ "vlm.vision_encoder.transformer.resblocks.31.ln_2.bias": "model-00001-of-00004.safetensors",
520
+ "vlm.vision_encoder.transformer.resblocks.31.ln_2.weight": "model-00001-of-00004.safetensors",
521
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
522
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
523
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
524
+ "vlm.vision_encoder.transformer.resblocks.31.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
525
+ "vlm.vision_encoder.transformer.resblocks.4.attn.in_proj_bias": "model-00001-of-00004.safetensors",
526
+ "vlm.vision_encoder.transformer.resblocks.4.attn.in_proj_weight": "model-00001-of-00004.safetensors",
527
+ "vlm.vision_encoder.transformer.resblocks.4.attn.out_proj.bias": "model-00001-of-00004.safetensors",
528
+ "vlm.vision_encoder.transformer.resblocks.4.attn.out_proj.weight": "model-00001-of-00004.safetensors",
529
+ "vlm.vision_encoder.transformer.resblocks.4.ln_1.bias": "model-00001-of-00004.safetensors",
530
+ "vlm.vision_encoder.transformer.resblocks.4.ln_1.weight": "model-00001-of-00004.safetensors",
531
+ "vlm.vision_encoder.transformer.resblocks.4.ln_2.bias": "model-00001-of-00004.safetensors",
532
+ "vlm.vision_encoder.transformer.resblocks.4.ln_2.weight": "model-00001-of-00004.safetensors",
533
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
534
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
535
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
536
+ "vlm.vision_encoder.transformer.resblocks.4.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
537
+ "vlm.vision_encoder.transformer.resblocks.5.attn.in_proj_bias": "model-00001-of-00004.safetensors",
538
+ "vlm.vision_encoder.transformer.resblocks.5.attn.in_proj_weight": "model-00001-of-00004.safetensors",
539
+ "vlm.vision_encoder.transformer.resblocks.5.attn.out_proj.bias": "model-00001-of-00004.safetensors",
540
+ "vlm.vision_encoder.transformer.resblocks.5.attn.out_proj.weight": "model-00001-of-00004.safetensors",
541
+ "vlm.vision_encoder.transformer.resblocks.5.ln_1.bias": "model-00001-of-00004.safetensors",
542
+ "vlm.vision_encoder.transformer.resblocks.5.ln_1.weight": "model-00001-of-00004.safetensors",
543
+ "vlm.vision_encoder.transformer.resblocks.5.ln_2.bias": "model-00001-of-00004.safetensors",
544
+ "vlm.vision_encoder.transformer.resblocks.5.ln_2.weight": "model-00001-of-00004.safetensors",
545
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
546
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
547
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
548
+ "vlm.vision_encoder.transformer.resblocks.5.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
549
+ "vlm.vision_encoder.transformer.resblocks.6.attn.in_proj_bias": "model-00001-of-00004.safetensors",
550
+ "vlm.vision_encoder.transformer.resblocks.6.attn.in_proj_weight": "model-00001-of-00004.safetensors",
551
+ "vlm.vision_encoder.transformer.resblocks.6.attn.out_proj.bias": "model-00001-of-00004.safetensors",
552
+ "vlm.vision_encoder.transformer.resblocks.6.attn.out_proj.weight": "model-00001-of-00004.safetensors",
553
+ "vlm.vision_encoder.transformer.resblocks.6.ln_1.bias": "model-00001-of-00004.safetensors",
554
+ "vlm.vision_encoder.transformer.resblocks.6.ln_1.weight": "model-00001-of-00004.safetensors",
555
+ "vlm.vision_encoder.transformer.resblocks.6.ln_2.bias": "model-00001-of-00004.safetensors",
556
+ "vlm.vision_encoder.transformer.resblocks.6.ln_2.weight": "model-00001-of-00004.safetensors",
557
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
558
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
559
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
560
+ "vlm.vision_encoder.transformer.resblocks.6.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
561
+ "vlm.vision_encoder.transformer.resblocks.7.attn.in_proj_bias": "model-00001-of-00004.safetensors",
562
+ "vlm.vision_encoder.transformer.resblocks.7.attn.in_proj_weight": "model-00001-of-00004.safetensors",
563
+ "vlm.vision_encoder.transformer.resblocks.7.attn.out_proj.bias": "model-00001-of-00004.safetensors",
564
+ "vlm.vision_encoder.transformer.resblocks.7.attn.out_proj.weight": "model-00001-of-00004.safetensors",
565
+ "vlm.vision_encoder.transformer.resblocks.7.ln_1.bias": "model-00001-of-00004.safetensors",
566
+ "vlm.vision_encoder.transformer.resblocks.7.ln_1.weight": "model-00001-of-00004.safetensors",
567
+ "vlm.vision_encoder.transformer.resblocks.7.ln_2.bias": "model-00001-of-00004.safetensors",
568
+ "vlm.vision_encoder.transformer.resblocks.7.ln_2.weight": "model-00001-of-00004.safetensors",
569
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
570
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
571
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
572
+ "vlm.vision_encoder.transformer.resblocks.7.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
573
+ "vlm.vision_encoder.transformer.resblocks.8.attn.in_proj_bias": "model-00001-of-00004.safetensors",
574
+ "vlm.vision_encoder.transformer.resblocks.8.attn.in_proj_weight": "model-00001-of-00004.safetensors",
575
+ "vlm.vision_encoder.transformer.resblocks.8.attn.out_proj.bias": "model-00001-of-00004.safetensors",
576
+ "vlm.vision_encoder.transformer.resblocks.8.attn.out_proj.weight": "model-00001-of-00004.safetensors",
577
+ "vlm.vision_encoder.transformer.resblocks.8.ln_1.bias": "model-00001-of-00004.safetensors",
578
+ "vlm.vision_encoder.transformer.resblocks.8.ln_1.weight": "model-00001-of-00004.safetensors",
579
+ "vlm.vision_encoder.transformer.resblocks.8.ln_2.bias": "model-00001-of-00004.safetensors",
580
+ "vlm.vision_encoder.transformer.resblocks.8.ln_2.weight": "model-00001-of-00004.safetensors",
581
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
582
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
583
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
584
+ "vlm.vision_encoder.transformer.resblocks.8.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
585
+ "vlm.vision_encoder.transformer.resblocks.9.attn.in_proj_bias": "model-00001-of-00004.safetensors",
586
+ "vlm.vision_encoder.transformer.resblocks.9.attn.in_proj_weight": "model-00001-of-00004.safetensors",
587
+ "vlm.vision_encoder.transformer.resblocks.9.attn.out_proj.bias": "model-00001-of-00004.safetensors",
588
+ "vlm.vision_encoder.transformer.resblocks.9.attn.out_proj.weight": "model-00001-of-00004.safetensors",
589
+ "vlm.vision_encoder.transformer.resblocks.9.ln_1.bias": "model-00001-of-00004.safetensors",
590
+ "vlm.vision_encoder.transformer.resblocks.9.ln_1.weight": "model-00001-of-00004.safetensors",
591
+ "vlm.vision_encoder.transformer.resblocks.9.ln_2.bias": "model-00001-of-00004.safetensors",
592
+ "vlm.vision_encoder.transformer.resblocks.9.ln_2.weight": "model-00001-of-00004.safetensors",
593
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_fc.bias": "model-00001-of-00004.safetensors",
594
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_fc.weight": "model-00001-of-00004.safetensors",
595
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_proj.bias": "model-00001-of-00004.safetensors",
596
+ "vlm.vision_encoder.transformer.resblocks.9.mlp.c_proj.weight": "model-00001-of-00004.safetensors",
597
+ "vlm.vision_tokenizer.latents": "model-00001-of-00004.safetensors",
598
+ "vlm.vision_tokenizer.layers.0.0.norm_latents.bias": "model-00001-of-00004.safetensors",
599
+ "vlm.vision_tokenizer.layers.0.0.norm_latents.weight": "model-00001-of-00004.safetensors",
600
+ "vlm.vision_tokenizer.layers.0.0.norm_media.bias": "model-00001-of-00004.safetensors",
601
+ "vlm.vision_tokenizer.layers.0.0.norm_media.weight": "model-00001-of-00004.safetensors",
602
+ "vlm.vision_tokenizer.layers.0.0.to_kv.weight": "model-00001-of-00004.safetensors",
603
+ "vlm.vision_tokenizer.layers.0.0.to_out.weight": "model-00001-of-00004.safetensors",
604
+ "vlm.vision_tokenizer.layers.0.0.to_q.weight": "model-00001-of-00004.safetensors",
605
+ "vlm.vision_tokenizer.layers.0.1.0.bias": "model-00001-of-00004.safetensors",
606
+ "vlm.vision_tokenizer.layers.0.1.0.weight": "model-00001-of-00004.safetensors",
607
+ "vlm.vision_tokenizer.layers.0.1.1.weight": "model-00001-of-00004.safetensors",
608
+ "vlm.vision_tokenizer.layers.0.1.3.weight": "model-00001-of-00004.safetensors",
609
+ "vlm.vision_tokenizer.layers.1.0.norm_latents.bias": "model-00001-of-00004.safetensors",
610
+ "vlm.vision_tokenizer.layers.1.0.norm_latents.weight": "model-00001-of-00004.safetensors",
611
+ "vlm.vision_tokenizer.layers.1.0.norm_media.bias": "model-00001-of-00004.safetensors",
612
+ "vlm.vision_tokenizer.layers.1.0.norm_media.weight": "model-00001-of-00004.safetensors",
613
+ "vlm.vision_tokenizer.layers.1.0.to_kv.weight": "model-00001-of-00004.safetensors",
614
+ "vlm.vision_tokenizer.layers.1.0.to_out.weight": "model-00001-of-00004.safetensors",
615
+ "vlm.vision_tokenizer.layers.1.0.to_q.weight": "model-00001-of-00004.safetensors",
616
+ "vlm.vision_tokenizer.layers.1.1.0.bias": "model-00001-of-00004.safetensors",
617
+ "vlm.vision_tokenizer.layers.1.1.0.weight": "model-00001-of-00004.safetensors",
618
+ "vlm.vision_tokenizer.layers.1.1.1.weight": "model-00001-of-00004.safetensors",
619
+ "vlm.vision_tokenizer.layers.1.1.3.weight": "model-00001-of-00004.safetensors",
620
+ "vlm.vision_tokenizer.layers.2.0.norm_latents.bias": "model-00001-of-00004.safetensors",
621
+ "vlm.vision_tokenizer.layers.2.0.norm_latents.weight": "model-00001-of-00004.safetensors",
622
+ "vlm.vision_tokenizer.layers.2.0.norm_media.bias": "model-00001-of-00004.safetensors",
623
+ "vlm.vision_tokenizer.layers.2.0.norm_media.weight": "model-00001-of-00004.safetensors",
624
+ "vlm.vision_tokenizer.layers.2.0.to_kv.weight": "model-00001-of-00004.safetensors",
625
+ "vlm.vision_tokenizer.layers.2.0.to_out.weight": "model-00001-of-00004.safetensors",
626
+ "vlm.vision_tokenizer.layers.2.0.to_q.weight": "model-00001-of-00004.safetensors",
627
+ "vlm.vision_tokenizer.layers.2.1.0.bias": "model-00001-of-00004.safetensors",
628
+ "vlm.vision_tokenizer.layers.2.1.0.weight": "model-00001-of-00004.safetensors",
629
+ "vlm.vision_tokenizer.layers.2.1.1.weight": "model-00001-of-00004.safetensors",
630
+ "vlm.vision_tokenizer.layers.2.1.3.weight": "model-00001-of-00004.safetensors",
631
+ "vlm.vision_tokenizer.layers.3.0.norm_latents.bias": "model-00001-of-00004.safetensors",
632
+ "vlm.vision_tokenizer.layers.3.0.norm_latents.weight": "model-00001-of-00004.safetensors",
633
+ "vlm.vision_tokenizer.layers.3.0.norm_media.bias": "model-00001-of-00004.safetensors",
634
+ "vlm.vision_tokenizer.layers.3.0.norm_media.weight": "model-00001-of-00004.safetensors",
635
+ "vlm.vision_tokenizer.layers.3.0.to_kv.weight": "model-00001-of-00004.safetensors",
636
+ "vlm.vision_tokenizer.layers.3.0.to_out.weight": "model-00001-of-00004.safetensors",
637
+ "vlm.vision_tokenizer.layers.3.0.to_q.weight": "model-00001-of-00004.safetensors",
638
+ "vlm.vision_tokenizer.layers.3.1.0.bias": "model-00001-of-00004.safetensors",
639
+ "vlm.vision_tokenizer.layers.3.1.0.weight": "model-00001-of-00004.safetensors",
640
+ "vlm.vision_tokenizer.layers.3.1.1.weight": "model-00001-of-00004.safetensors",
641
+ "vlm.vision_tokenizer.layers.3.1.3.weight": "model-00001-of-00004.safetensors",
642
+ "vlm.vision_tokenizer.layers.4.0.norm_latents.bias": "model-00001-of-00004.safetensors",
643
+ "vlm.vision_tokenizer.layers.4.0.norm_latents.weight": "model-00001-of-00004.safetensors",
644
+ "vlm.vision_tokenizer.layers.4.0.norm_media.bias": "model-00001-of-00004.safetensors",
645
+ "vlm.vision_tokenizer.layers.4.0.norm_media.weight": "model-00001-of-00004.safetensors",
646
+ "vlm.vision_tokenizer.layers.4.0.to_kv.weight": "model-00001-of-00004.safetensors",
647
+ "vlm.vision_tokenizer.layers.4.0.to_out.weight": "model-00001-of-00004.safetensors",
648
+ "vlm.vision_tokenizer.layers.4.0.to_q.weight": "model-00001-of-00004.safetensors",
649
+ "vlm.vision_tokenizer.layers.4.1.0.bias": "model-00001-of-00004.safetensors",
650
+ "vlm.vision_tokenizer.layers.4.1.0.weight": "model-00001-of-00004.safetensors",
651
+ "vlm.vision_tokenizer.layers.4.1.1.weight": "model-00001-of-00004.safetensors",
652
+ "vlm.vision_tokenizer.layers.4.1.3.weight": "model-00001-of-00004.safetensors",
653
+ "vlm.vision_tokenizer.layers.5.0.norm_latents.bias": "model-00001-of-00004.safetensors",
654
+ "vlm.vision_tokenizer.layers.5.0.norm_latents.weight": "model-00001-of-00004.safetensors",
655
+ "vlm.vision_tokenizer.layers.5.0.norm_media.bias": "model-00001-of-00004.safetensors",
656
+ "vlm.vision_tokenizer.layers.5.0.norm_media.weight": "model-00001-of-00004.safetensors",
657
+ "vlm.vision_tokenizer.layers.5.0.to_kv.weight": "model-00001-of-00004.safetensors",
658
+ "vlm.vision_tokenizer.layers.5.0.to_out.weight": "model-00001-of-00004.safetensors",
659
+ "vlm.vision_tokenizer.layers.5.0.to_q.weight": "model-00001-of-00004.safetensors",
660
+ "vlm.vision_tokenizer.layers.5.1.0.bias": "model-00001-of-00004.safetensors",
661
+ "vlm.vision_tokenizer.layers.5.1.0.weight": "model-00001-of-00004.safetensors",
662
+ "vlm.vision_tokenizer.layers.5.1.1.weight": "model-00001-of-00004.safetensors",
663
+ "vlm.vision_tokenizer.layers.5.1.3.weight": "model-00001-of-00004.safetensors",
664
+ "vlm.vision_tokenizer.norm.bias": "model-00001-of-00004.safetensors",
665
+ "vlm.vision_tokenizer.norm.weight": "model-00001-of-00004.safetensors",
666
+ "vlm.vision_tokenizer.projection.bias": "model-00001-of-00004.safetensors",
667
+ "vlm.vision_tokenizer.projection.weight": "model-00001-of-00004.safetensors",
668
+ "vlm.vision_tokenizer.text_projection.0.bias": "model-00001-of-00004.safetensors",
669
+ "vlm.vision_tokenizer.text_projection.0.weight": "model-00001-of-00004.safetensors",
670
+ "vlm.vision_tokenizer.text_projection.2.bias": "model-00001-of-00004.safetensors",
671
+ "vlm.vision_tokenizer.text_projection.2.weight": "model-00001-of-00004.safetensors"
672
+ }
673
+ }
modeling_blip_3.py ADDED
@@ -0,0 +1,110 @@
1
+ from transformers import PreTrainedModel, AutoModelForCausalLM
2
+ import torch
3
+ import open_clip
4
+ from typing import List, Optional, Tuple, Union
5
+ from .utils import check_embedding_fns
6
+ from .vlm import InstructPerceiverResampler, KosmosInstruct
7
+ from .configuration_blip_3 import Blip3VisionEncoderConfig, Blip3VisionTokenizerConfig, Blip3Config
8
+
9
+ class Blip3VisionEncoder(PreTrainedModel):
10
+ main_input_name = "pixel_values"
11
+ config_class = Blip3VisionEncoderConfig
12
+
13
+ def __init__(self, config: Blip3VisionEncoderConfig):
14
+ super().__init__(config)
15
+ if config.model_name != 'ViT-H-14-378-quickgelu':
16
+ raise ValueError(f"Unsupported model {config.model_name}. New vision models will be added soon.")
17
+ self.model, _, _ = open_clip.create_model_and_transforms(
18
+ model_name = config.model_name,
19
+ force_image_size=config.force_image_size
20
+ )
21
+
22
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
23
+ # assert pixel_values.ndim == 4, f"Expected 4D tensor (bs, c, h, w), got {pixel_values.ndim}"
24
+ return self.model.encode_image(pixel_values)
25
+
26
+
27
+ # vision tokenizer
28
+ class Blip3VisionTokenizer(PreTrainedModel):
29
+ config_class = Blip3VisionTokenizerConfig
30
+ def __init__(self, config: Blip3VisionTokenizerConfig):
31
+ super().__init__(config)
32
+ self.model = InstructPerceiverResampler(
33
+ dim_llm=config.lang_embedding_dim,
34
+ dim=config.vis_feature_dim,
35
+ dim_inner=config.lang_embedding_dim,
36
+ num_latents=config.num_vis_tokens,
37
+ repeat_latents=config.repeat_latents
38
+ )
39
+
40
+ def forward(self,
41
+ vision_features: torch.Tensor,
42
+ vision_attn_masks: torch.Tensor):
43
+ return self.model(vision_features, vision_attn_masks)
44
+
45
+ # Blip3 model
46
+ class Blip3ModelForConditionalGeneration(PreTrainedModel):
47
+ config_class = Blip3Config
48
+
49
+ def __init__(self, config: Blip3Config):
50
+ super().__init__(config)
51
+
52
+ # vision encoder initialization
53
+ vision_encoder = Blip3VisionEncoder(config.vision_encoder_config).model
54
+ vision_encoder.visual.output_tokens = True
55
+ vision_encoder = vision_encoder.visual
56
+
57
+ # language model initialization
58
+ language_model = AutoModelForCausalLM.from_config(config.text_config)
59
+ check_embedding_fns(language_model)
60
+ # Update _tied_weights_keys using the base model used.
61
+ if language_model._tied_weights_keys is not None:
62
+ self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
63
+
64
+ # vision tokenizer initialization
65
+ if config.vision_tokenizer_config.lang_embedding_dim != language_model.get_input_embeddings().weight.shape[1]:
66
+ overwrite = language_model.get_input_embeddings().weight.shape[1]
67
+ config.vision_tokenizer_config.lang_embedding_dim = overwrite
68
+ print(f"Warning: The language embedding dimension in the vision tokenizer config is different from the language model's embedding dimension. Overwriting the language embedding dimension in the vision tokenizer config to {overwrite}.")
69
+
70
+ vision_tokenizer = Blip3VisionTokenizer(config.vision_tokenizer_config).model
71
+
72
+ self.vlm = KosmosInstruct(
73
+ vision_encoder=vision_encoder,
74
+ vision_tokenizer=vision_tokenizer,
75
+ lang_model=language_model,
76
+ initial_tokenizer_len = config.text_config.initial_tokenizer_len,
77
+ pad_token_id = config.text_config.pad_token_id,
78
+ image_aspect_ratio = config.vision_encoder_config.image_aspect_ratio,
79
+ anyres_patch_sampling = config.vision_encoder_config.anyres_patch_sampling
80
+ )
81
+ # Initialize weights and apply final processing
82
+ self.post_init()
83
+
84
+ @torch.no_grad()
85
+ def generate(
86
+ self,
87
+ pixel_values: torch.FloatTensor,
88
+ input_ids: Optional[torch.LongTensor] = None,
89
+ attention_mask: Optional[torch.LongTensor] = None,
90
+ **generate_kwargs,
91
+ ) -> torch.LongTensor:
92
+ self.vlm = self.vlm.eval()
93
+ return self.vlm.generate(
94
+ vision_x = pixel_values,
95
+ lang_x = input_ids,
96
+ attention_mask = attention_mask,
97
+ **generate_kwargs)
98
+
99
+ def update_special_tokens(self, tokenizer):
100
+ tokenizer.add_special_tokens(
101
+ {"additional_special_tokens": list(self.vlm.special_tokens.values())}
102
+ )
103
+ self.vlm.lang_model.config.vocab_size = len(tokenizer)
104
+ self.vlm.set_special_token_ids(
105
+ {
106
+ v: tokenizer.convert_tokens_to_ids(v) for v in self.vlm.special_tokens.values()
107
+ }
108
+ )
109
+ return tokenizer
110
+
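A minimal end-to-end usage sketch for the classes above (not part of the commit). The repository id, the AutoModelForVision2Seq mapping, the Blip3ImageProcessor call signature, and the "<image>" placeholder are assumptions, since config.json and image_processing_blip_3.py are defined elsewhere in this release; adjust them to the actual files.

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor

# "path/to/this-repo" is a hypothetical placeholder for this repository.
model_id = "path/to/this-repo"
model = AutoModelForVision2Seq.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
image_processor = AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)

# Register the VLM's special tokens on the tokenizer (see update_special_tokens above).
tokenizer = model.update_special_tokens(tokenizer)

image = Image.open("test_samples/images/112.jpg").convert("RGB")
pixel_values = image_processor([image], return_tensors="pt")["pixel_values"]  # assumed processor API

# "<image>" marks where the visual tokens go; the exact placeholder is defined by the VLM code.
prompt = "<|user|>\n<image> Is there any reflection of zebra in water?<|end|>\n<|assistant|>\n"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    out = model.generate(pixel_values=pixel_values,
                         input_ids=inputs["input_ids"],
                         attention_mask=inputs["attention_mask"],
                         max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))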
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_blip_3.Blip3ImageProcessor"
4
+ },
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Blip3ImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "interpolation_mode": "bicubic",
18
+ "resize_mode": "squash",
19
+ "size": [
20
+ 378,
21
+ 378
22
+ ]
23
+ }
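The auto_map above routes AutoImageProcessor to the custom Blip3ImageProcessor (defined in image_processing_blip_3.py), using CLIP normalization statistics, bicubic interpolation, and a 378x378 "squash" resize. A minimal loading sketch, with a placeholder repository id:

from transformers import AutoImageProcessor

# trust_remote_code is required because Blip3ImageProcessor ships inside this repo;
# "path/to/this-repo" is a hypothetical placeholder.
image_processor = AutoImageProcessor.from_pretrained("path/to/this-repo", trust_remote_code=True)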
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
test_samples/images/112.jpg ADDED
test_samples/images/51.jpg ADDED
test_samples/images/76.jpg ADDED
test_samples/test.json ADDED
@@ -0,0 +1,20 @@
1
+ [
2
+ {
3
+ "question": "Which room is bigger, the double garage or the living room?",
4
+ "answer": "double garage",
5
+ "category": "ocr,spat,math",
6
+ "image_path": "./test_samples/images/51.jpg"
7
+ },
8
+ {
9
+ "question": "what is the green logo on the car?",
10
+ "answer": "monster",
11
+ "category": "rec",
12
+ "image_path": "./test_samples/images/76.jpg"
13
+ },
14
+ {
15
+ "question": "Is there any reflection of zebra in water?",
16
+ "answer": "yes",
17
+ "category": "rec",
18
+ "image_path": "./test_samples/images/112.jpg"
19
+ }
20
+ ]
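These bundled samples are a small smoke test: each entry pairs an image path with a question, a reference answer, and a capability tag (e.g. ocr, spat, math, rec). A sketch of iterating over them:

import json

with open("test_samples/test.json") as f:
    samples = json.load(f)

for sample in samples:
    print(sample["image_path"], "|", sample["question"], "->", sample["answer"])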
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": true,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32000": {
30
+ "content": "<|endoftext|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<|assistant|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": true,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<|placeholder1|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": true,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "32003": {
54
+ "content": "<|placeholder2|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": true,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "32004": {
62
+ "content": "<|placeholder3|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": true,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "32005": {
70
+ "content": "<|placeholder4|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": true,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "32006": {
78
+ "content": "<|system|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": true,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "32007": {
86
+ "content": "<|end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": true,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "32008": {
94
+ "content": "<|placeholder5|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": true,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "32009": {
102
+ "content": "<|placeholder6|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": true,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "32010": {
110
+ "content": "<|user|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": true,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "32011": {
118
+ "content": "<pad>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ }
125
+ },
126
+ "bos_token": "<s>",
127
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
128
+ "clean_up_tokenization_spaces": false,
129
+ "eos_token": "<|endoftext|>",
130
+ "model_max_length": 4096,
131
+ "pad_token": "<pad>",
132
+ "padding_side": "left",
133
+ "sp_model_kwargs": {},
134
+ "tokenizer_class": "LlamaTokenizer",
135
+ "unk_token": "<unk>",
136
+ "use_default_system_prompt": false
137
+ }
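The chat_template above wraps each user turn in <|user|> ... <|end|> and opens an <|assistant|> turn automatically, so no add_generation_prompt flag is needed. A sketch of building a prompt with it (where the image placeholder goes is decided by the VLM's special tokens in vlm.py, not by the template; the repository id is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this-repo", use_fast=False)
messages = [{"role": "user", "content": "What is in this picture?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
# prompt == "<s><|user|>\nWhat is in this picture?<|end|>\n<|assistant|>\n"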
utils.py ADDED
@@ -0,0 +1,383 @@
1
+ import torch
2
+ import ast
3
+ import math
4
+ from PIL import Image
5
+
6
+
7
+ def has_fn(model, fn_name):
8
+ """Check if model has a function fn_name"""
9
+ return callable(getattr(model, fn_name, None))
10
+
11
+ def exists(val):
12
+ return val is not None
13
+
14
+ def num_params(module, filter_to_trainable=False):
15
+ """Returns the number of parameters in the module, or optionally only the trainable parameters"""
16
+ if filter_to_trainable:
17
+ return sum(p.numel() for p in module.parameters() if p.requires_grad)
18
+ else:
19
+ return sum(p.numel() for p in module.parameters())
20
+
21
+ def hasattr_recursive(obj, att):
22
+ """
23
+ Check if obj has nested attribute
24
+ Example: hasattr_recursive(obj, 'a.b.c') is equivalent to hasattr(obj, 'a') and hasattr(obj.a, 'b') and hasattr(obj.a.b, 'c')
25
+ """
26
+ if att == "":
27
+ return True
28
+ i = att.find(".")
29
+ if i < 0:
30
+ return hasattr(obj, att)
31
+ else:
32
+ try:
33
+ return hasattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
34
+ except:
35
+ return False
36
+
37
+ def getattr_recursive(obj, att):
38
+ """
39
+ Return nested attribute of obj
40
+ Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
41
+ """
42
+ if att == "":
43
+ return obj
44
+ i = att.find(".")
45
+ if i < 0:
46
+ return getattr(obj, att)
47
+ else:
48
+ return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
49
+
50
+
51
+ def setattr_recursive(obj, att, val):
52
+ """
53
+ Set nested attribute of obj
54
+ Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
55
+ """
56
+ if "." in att:
57
+ obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
58
+ setattr(obj, att.split(".")[-1], val)
59
+
60
+
61
+ def stack_with_padding(list_of_tensors, padding_value=0, padding_side="right"):
62
+ """
63
+ Stack a list of tensors with padding on one side
64
+ Args:
65
+ list_of_tensors (list[torch.Tensor]): List of tensors to stack
66
+ padding_value (int, optional): Value to pad with. Defaults to 0.
67
+ padding_side (str, optional): Side to pad on. Defaults to "right".
68
+ Returns:
69
+ torch.Tensor: Stacked tensors
70
+ """
71
+ max_tokens = max(tensor.size(0) for tensor in list_of_tensors)
72
+ padded_tensors = []
73
+ for tensor in list_of_tensors:
74
+ num_tokens = tensor.size(0)
75
+ if len(tensor.size()) == 1:
76
+ padding = torch.full(
77
+ (max_tokens - num_tokens,),
78
+ padding_value,
79
+ dtype=tensor.dtype,
80
+ device=tensor.device,
81
+ )
82
+ else:
83
+ padding = torch.full(
84
+ (max_tokens - num_tokens, tensor.size(1)),
85
+ padding_value,
86
+ dtype=tensor.dtype,
87
+ device=tensor.device,
88
+ )
89
+ padded_tensor = (
90
+ torch.cat((tensor, padding), dim=0)
91
+ if padding_side == "right"
92
+ else torch.cat((padding, tensor), dim=0)
93
+ )
94
+ padded_tensors.append(padded_tensor)
95
+ return torch.stack(padded_tensors)
96
+
97
+
98
+ def check_embedding_fns(lang_model):
99
+ """Checks for and attempts to set {get/set}_{input/output}_embeddings functions to the model"""
100
+ if not has_fn(lang_model, "get_input_embeddings"):
101
+ if hasattr_recursive(lang_model, "transformer.wte"): # MPT
102
+ lang_model.get_input_embeddings = lambda: lang_model.transformer.wte
103
+ elif hasattr_recursive(lang_model, "model.decoder.embed_tokens"): # OPT
104
+ lang_model.get_input_embeddings = lambda: lang_model.model.decoder.embed_tokens
105
+ else:
106
+ raise ValueError(
107
+ "We require the language encoder to have a get_input_embeddings method but we couldn't determine the name of the input embeddings attribute. Please supply this manually in factory.py."
108
+ )
109
+
110
+ if not has_fn(lang_model, "set_input_embeddings"):
111
+ if hasattr_recursive(lang_model, "transformer.wte"): # MPT
112
+ lang_model.set_input_embeddings = lambda x: setattr_recursive(
113
+ lang_model, "transformer.wte", x
114
+ )
115
+ elif hasattr_recursive(lang_model, "model.decoder.embed_tokens"): # OPT
116
+ lang_model.set_input_embeddings = lambda x: setattr_recursive(
117
+ lang_model, "model.decoder.embed_tokens", x
118
+ )
119
+ else:
120
+ raise ValueError(
121
+ "We require the language encoder to have a set_input_embeddings method but we couldn't determine the name of the input embeddings attribute. Please supply this manually in factory.py."
122
+ )
123
+
124
+ if not has_fn(lang_model, "get_output_embeddings"):
125
+ if hasattr_recursive(lang_model, "lm_head"):
126
+ lang_model.get_output_embeddings = lambda: lang_model.lm_head
127
+ else:
128
+ raise ValueError(
129
+ "We require the language encoder to have a get_output_embeddings method but we couldn't determine the name of the output embeddings attribute. Please supply this manually in factory.py."
130
+ )
131
+
132
+ if not has_fn(lang_model, "set_output_embeddings"):
133
+ if hasattr_recursive(lang_model, "lm_head"):
134
+ lang_model.set_output_embeddings = lambda x: setattr_recursive(
135
+ lang_model, "lm_head", x
136
+ )
137
+ else:
138
+ raise ValueError(
139
+ "We require the language encoder to have a set_output_embeddings method but we couldn't determine the name of the output embeddings attribute. Please supply this manually in factory.py."
140
+ )
141
+
142
+
143
+ def has_fn(model, fn_name):
144
+ """Check if model has a function fn_name"""
145
+ return callable(getattr(model, fn_name, None))
146
+
147
+
148
+ # Adapted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
149
+ #
150
+ # Licensed under the Apache License, Version 2.0 (the "License");
151
+ # you may not use this file except in compliance with the License.
152
+ # You may obtain a copy of the License at
153
+ #
154
+ # http://www.apache.org/licenses/LICENSE-2.0
155
+ #
156
+ # Unless required by applicable law or agreed to in writing, software
157
+ # distributed under the License is distributed on an "AS IS" BASIS,
158
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
159
+ # See the License for the specific language governing permissions and
160
+ # limitations under the License.
161
+
162
+ def unpad_image(tensor, original_size, keep_original_shape=False):
163
+ """
164
+ Unpads a PyTorch tensor of a padded and resized image.
165
+
166
+ Args:
167
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
168
+ original_size (tuple): The original size of the image (width, height).
169
+
170
+ Returns:
171
+ torch.Tensor: The unpadded image tensor.
172
+ """
173
+ original_width, original_height = original_size
174
+ current_height, current_width = tensor.shape[1:]
175
+
176
+ original_aspect_ratio = original_width / original_height
177
+ current_aspect_ratio = current_width / current_height
178
+
179
+ if original_aspect_ratio > current_aspect_ratio:
180
+ scale_factor = current_width / original_width
181
+ new_height = int(original_height * scale_factor)
182
+ padding = (current_height - new_height) // 2
183
+ if keep_original_shape:
184
+ attention_mask = torch.ones((current_height, current_width), device=tensor.device)
185
+ attention_mask[:padding, :] = 0
186
+ attention_mask[current_height - padding:, :] = 0
187
+ return tensor, attention_mask
188
+ else:
189
+ unpadded_tensor = tensor[:, padding:current_height - padding, :]
190
+ return unpadded_tensor, None
191
+ else:
192
+ scale_factor = current_height / original_height
193
+ new_width = int(original_width * scale_factor)
194
+ padding = (current_width - new_width) // 2
195
+ if keep_original_shape:
196
+ attention_mask = torch.ones((current_height, current_width), device=tensor.device)
197
+ attention_mask[:, :padding] = 0
198
+ attention_mask[:, current_width - padding:] = 0
199
+ return tensor, attention_mask
200
+ else:
201
+ unpadded_tensor = tensor[:, :, padding:current_width - padding]
202
+ return unpadded_tensor, None
203
+
204
+
205
+ def select_best_resolution(original_size, possible_resolutions):
206
+ """
207
+ Selects the best resolution from a list of possible resolutions based on the original size.
208
+
209
+ Args:
210
+ original_size (tuple): The original size of the image in the format (width, height).
211
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
212
+
213
+ Returns:
214
+ tuple: The best fit resolution in the format (width, height).
215
+ """
216
+ original_width, original_height = original_size
217
+ best_fit = None
218
+ max_effective_resolution = 0
219
+ min_wasted_resolution = float('inf')
220
+
221
+ for width, height in possible_resolutions:
222
+ scale = min(width / original_width, height / original_height)
223
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
224
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
225
+ wasted_resolution = (width * height) - effective_resolution
226
+
227
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
228
+ max_effective_resolution = effective_resolution
229
+ min_wasted_resolution = wasted_resolution
230
+ best_fit = (width, height)
231
+
232
+ return best_fit
233
+
234
+
235
+ def resize_and_pad_image(image, target_resolution):
236
+ """
237
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
238
+
239
+ Args:
240
+ image (PIL.Image.Image): The input image.
241
+ target_resolution (tuple): The target resolution (width, height) of the image.
242
+
243
+ Returns:
244
+ PIL.Image.Image: The resized and padded image.
245
+ """
246
+ original_width, original_height = image.size
247
+ target_width, target_height = target_resolution
248
+
249
+ scale_w = target_width / original_width
250
+ scale_h = target_height / original_height
251
+
252
+ if scale_w < scale_h:
253
+ new_width = target_width
254
+ new_height = min(math.ceil(original_height * scale_w), target_height)
255
+ else:
256
+ new_height = target_height
257
+ new_width = min(math.ceil(original_width * scale_h), target_width)
258
+
259
+ # Resize the image
260
+ resized_image = image.resize((new_width, new_height))
261
+
262
+ new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
263
+ paste_x = (target_width - new_width) // 2
264
+ paste_y = (target_height - new_height) // 2
265
+ new_image.paste(resized_image, (paste_x, paste_y))
266
+
267
+ return new_image
268
+
269
+
270
+ def divide_to_patches(image, patch_size):
271
+ """
272
+ Divides an image into patches of a specified size.
273
+
274
+ Args:
275
+ image (PIL.Image.Image): The input image.
276
+ patch_size (int): The size of each patch.
277
+
278
+ Returns:
279
+ list: A list of PIL.Image.Image objects representing the patches.
280
+ """
281
+ patches = []
282
+ width, height = image.size
283
+ for i in range(0, height, patch_size):
284
+ for j in range(0, width, patch_size):
285
+ box = (j, i, j + patch_size, i + patch_size)
286
+ patch = image.crop(box)
287
+ patches.append(patch)
288
+
289
+ return patches
290
+
291
+
292
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
293
+ """
294
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
295
+
296
+ Args:
297
+ image_size (tuple): The size of the input image in the format (width, height).
298
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
299
+ patch_size (int): The size of each image patch.
300
+
301
+ Returns:
302
+ tuple: The shape of the image patch grid in the format (width, height).
303
+ """
304
+ if type(grid_pinpoints) is list:
305
+ possible_resolutions = grid_pinpoints
306
+ else:
307
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
308
+ width, height = select_best_resolution(image_size, possible_resolutions)
309
+ return width // patch_size, height // patch_size
310
+
311
+
312
+ def process_anyres_image(image, processor, grid_pinpoints):
313
+ """
314
+ Process an image with variable resolutions.
315
+
316
+ Args:
317
+ image (PIL.Image.Image): The input image to be processed.
318
+ processor: The image processor object.
319
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
320
+
321
+ Returns:
322
+ torch.Tensor: A tensor containing the processed image patches.
323
+ """
324
+ # FIXME: determine grid_pinpoints from image sizes.
325
+ if type(grid_pinpoints) is list:
326
+ possible_resolutions = grid_pinpoints
327
+ else:
328
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
329
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
330
+ image_padded = resize_and_pad_image(image, best_resolution)
331
+
332
+ processor_size = processor.transforms[0].size
333
+ patches = divide_to_patches(image_padded, processor_size[0])
334
+
335
+ image_original_resize = image.resize((processor_size[0], processor_size[0]))
336
+
337
+ image_patches = [image_original_resize] + patches
338
+ image_patches = [processor(image_patch)
339
+ for image_patch in image_patches]
340
+ return torch.stack(image_patches, dim=0)
341
+
342
+
343
+ def expand2square(pil_img, background_color):
344
+ width, height = pil_img.size
345
+ if width == height:
346
+ return pil_img
347
+ elif width > height:
348
+ result = Image.new(pil_img.mode, (width, width), background_color)
349
+ result.paste(pil_img, (0, (width - height) // 2))
350
+ return result
351
+ else:
352
+ result = Image.new(pil_img.mode, (height, height), background_color)
353
+ result.paste(pil_img, ((height - width) // 2, 0))
354
+ return result
355
+
356
+
357
+ def process_images(images, image_processor, model_cfg):
358
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
359
+ new_images = []
360
+ if image_aspect_ratio == 'pad':
361
+ for image in images:
362
+ image = expand2square(image, tuple(int(x*255) for x in image_processor.transforms[-1].mean))
363
+ image = image_processor(image)
364
+ new_images.append(image)
365
+ elif image_aspect_ratio in ["anyres", "anyres-legacy"]:
366
+ base_img_size = image_processor.transforms[0].size[0]
367
+ for image in images:
368
+ image = process_anyres_image(image, image_processor, [[base_img_size,base_img_size*2],
369
+ [base_img_size*2,base_img_size],
370
+ [base_img_size*2,base_img_size*2],
371
+ [base_img_size*3,base_img_size],
372
+ [base_img_size,base_img_size*3]])
373
+
374
+ # Debug any res inference by only using 672x672.
375
+ # image = process_anyres_image(image, image_processor, [[base_img_size*2,base_img_size*2]])
376
+ new_images.append(image)
377
+ else:
378
+ return image_processor(images)
379
+ if all(x.shape == new_images[0].shape for x in new_images):
380
+ new_images = torch.stack(new_images, dim=0)
381
+ return new_images
382
+
383
+
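A worked example of the any-resolution helpers above, using the 378-px base size from preprocessor_config.json and the same pinpoint grid that process_images builds (a sketch; the import path depends on how the module is packaged):

from utils import select_best_resolution, get_anyres_image_grid_shape  # assumed import path

base = 378
pinpoints = [[base, base * 2], [base * 2, base], [base * 2, base * 2],
             [base * 3, base], [base, base * 3]]  # same list process_images() passes

# For a 1000x750 (width, height) image, the 2x2 grid keeps the most effective pixels.
print(select_best_resolution((1000, 750), pinpoints))             # (756, 756)
print(get_anyres_image_grid_shape((1000, 750), pinpoints, base))  # (2, 2)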
vlm.py ADDED
@@ -0,0 +1,1531 @@
1
+
2
+ import torch
3
+ from torch import einsum, nn
4
+ from einops import rearrange, repeat
5
+ from einops_exts import rearrange_many
6
+ from einops import rearrange
7
+ from typing import List, Optional, Tuple, Union
8
+ import torch.nn.functional as F
9
+ from transformers.modeling_outputs import CausalLMOutputWithPast
10
+ from dataclasses import dataclass
11
+ from transformers import CLIPVisionModel
12
+ import transformers
13
+
14
+ from .utils import exists, num_params, getattr_recursive, stack_with_padding, get_anyres_image_grid_shape, unpad_image
15
+
16
+
17
+ class VisionTokenizer(nn.Module):
18
+ def __init__(self, dim_media, num_tokens_per_media):
19
+ super().__init__()
20
+ self.dim_media = dim_media
21
+ self.num_tokens_per_media = num_tokens_per_media
22
+
23
+ class PerceiverAttention(nn.Module):
24
+ def __init__(self, *, dim, dim_head=64, heads=8):
25
+ super().__init__()
26
+ self.scale = dim_head**-0.5
27
+ self.heads = heads
28
+ inner_dim = dim_head * heads
29
+
30
+ self.norm_media = nn.LayerNorm(dim)
31
+ self.norm_latents = nn.LayerNorm(dim)
32
+
33
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
34
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
35
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
36
+
37
+ def forward(self, x, latents, vision_attn_masks=None):
38
+ """
39
+ Args:
40
+ x (torch.Tensor): image features
41
+ shape (b, T, n1, D)
42
+ latents (torch.Tensor): latent features
43
+ shape (b, T, n2, D)
44
+ """
45
+ x = self.norm_media(x)
46
+ latents = self.norm_latents(latents)
47
+
48
+ h = self.heads
49
+
50
+ q = self.to_q(latents)
51
+ kv_input = torch.cat((x, latents), dim=-2) # TODO: Change the shape of vision attention mask according to this.
52
+ if vision_attn_masks is not None:
53
+ vision_attn_masks = torch.cat((vision_attn_masks,
54
+ torch.ones((latents.shape[0], latents.shape[-2]), dtype=latents.dtype, device=latents.device)),
55
+ dim=-1)
56
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
57
+ q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
58
+ q = q * self.scale
59
+
60
+ # attention
61
+ sim = einsum("... i d, ... j d -> ... i j", q, k)
62
+ # Apply vision attention mask here.
63
+ # Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention
64
+ if vision_attn_masks is not None:
65
+ attn_bias = torch.zeros((q.size(0), 1, 1, q.size(-2), k.size(-2)), dtype=q.dtype, device=q.device)
66
+ vision_attn_masks = repeat(vision_attn_masks, 'b n -> b 1 1 l n', l=q.size(-2))
67
+ attn_bias.masked_fill_(vision_attn_masks.logical_not(), float("-inf"))
68
+ sim += attn_bias
69
+
70
+ sim = sim - sim.amax(dim=-1, keepdim=True).detach()
71
+ attn = sim.softmax(dim=-1)
72
+
73
+
74
+ out = einsum("... i j, ... j d -> ... i d", attn, v)
75
+ out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
76
+ return self.to_out(out)
77
+
78
+
79
+ def FeedForward(dim, mult=4):
80
+ inner_dim = int(dim * mult)
81
+ return nn.Sequential(
82
+ nn.LayerNorm(dim),
83
+ nn.Linear(dim, inner_dim, bias=False),
84
+ nn.GELU(),
85
+ nn.Linear(inner_dim, dim, bias=False),
86
+ )
87
+
88
+
89
+ class InstructPerceiverResampler(VisionTokenizer):
90
+ def __init__(
91
+ self,
92
+ *,
93
+ dim_llm,
94
+ dim,
95
+ dim_inner=None,
96
+ depth=6,
97
+ dim_head=96,
98
+ heads=16,
99
+ num_latents=64,
100
+ repeat_latents=False,
101
+ max_num_media=None,
102
+ max_num_frames=None,
103
+ ff_mult=4,
104
+ ):
105
+ """
106
+ Perceiver module which takes in image features and outputs image tokens.
107
+ Args:
108
+ dim (int): dimension of the incoming image features
109
+ dim_inner (int, optional): final dimension to project the incoming image features to;
110
+ also the final dimension of the outputted features. If None, no projection is used, and dim_inner = dim.
111
+ depth (int, optional): number of layers. Defaults to 6.
112
+ dim_head (int, optional): dimension of each head. Defaults to 96.
113
+ heads (int, optional): number of heads. Defaults to 16.
114
+ num_latents (int, optional): number of latent tokens to use in the Perceiver;
115
+ also corresponds to number of tokens per sequence to output. Defaults to 64.
116
+ max_num_media (int, optional): maximum number of media per sequence to input into the Perceiver
117
+ and keep positional embeddings for. If None, no positional embeddings are used.
118
+ max_num_frames (int, optional): maximum number of frames to input into the Perceiver
119
+ and keep positional embeddings for. If None, no positional embeddings are used.
120
+ ff_mult (int, optional): dimension multiplier for the feedforward network. Defaults to 4.
121
+ """
122
+ if dim_inner is not None:
123
+ projection = nn.Linear(dim, dim_inner)
124
+ else:
125
+ projection = None
126
+ dim_inner = dim
127
+ super().__init__(dim_media=dim, num_tokens_per_media=num_latents)
128
+ self.projection = projection
129
+
130
+ # Text embedding projection.
131
+ # self.text_projection = nn.Linear(dim_llm, dim)
132
+ modules = [nn.Linear(dim_llm, dim)]
133
+ for _ in range(1, 2):
134
+ modules.append(nn.GELU())
135
+ modules.append(nn.Linear(dim, dim))
136
+ self.text_projection = nn.Sequential(*modules)
137
+
138
+ self.latents = nn.Parameter(torch.randn(num_latents, dim))
139
+ self.repeat_latents = repeat_latents
140
+ # positional embeddings
141
+ self.frame_embs = (
142
+ nn.Parameter(torch.randn(max_num_frames, dim))
143
+ if exists(max_num_frames)
144
+ else None
145
+ )
146
+ self.media_time_embs = (
147
+ nn.Parameter(torch.randn(max_num_media, 1, dim))
148
+ if exists(max_num_media)
149
+ else None
150
+ )
151
+
152
+ self.layers = nn.ModuleList([])
153
+ for _ in range(depth):
154
+ self.layers.append(
155
+ nn.ModuleList(
156
+ [
157
+ PerceiverAttention(
158
+ dim=dim, dim_head=dim_head, heads=heads
159
+ ),
160
+ FeedForward(dim=dim, mult=ff_mult),
161
+ ]
162
+ )
163
+ )
164
+
165
+ self.norm = nn.LayerNorm(dim)
166
+ # TODO: write a new forward function that takes in text input and append them to the query tokens.
167
+ def forward(self, x, text_embeds=None):
168
+ """
169
+ Args:
170
+ x (torch.Tensor): image features
171
+ shape (b, T, F, v, D)
172
+ Returns:
173
+ shape (b, T, n, D) where n is self.num_latents
174
+ """
175
+ b, T, F, v = x.shape[:4]
176
+
177
+ # frame and media time embeddings
178
+ if exists(self.frame_embs):
179
+ frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
180
+ x = x + frame_embs
181
+ x = rearrange(
182
+ x, "b T F v d -> b T (F v) d"
183
+ ) # flatten the frame and spatial dimensions
184
+ if exists(self.media_time_embs):
185
+ x = x + self.media_time_embs[:T]
186
+
187
+ # blocks
188
+ # FIXME: extending query tokens proportional to the vision sequence length. Hard-coded as dfn5b token_len=729.
189
+ if self.repeat_latents:
190
+ r = v // 729 # Repeat the query tokens for r times.
191
+ latents = repeat(self.latents, "n d -> (n repeat) d", repeat=r)
192
+ else:
193
+ latents = self.latents
194
+ latents = repeat(latents, "n d -> b T n d", b=b, T=T)
195
+ # latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
196
+ # Append text embedding.
197
+ if exists(text_embeds):
198
+ text_embeds = self.text_projection(text_embeds)
199
+ text_embeds = text_embeds[:, None, :, :]
200
+ latents = torch.cat((latents, text_embeds), dim=2) # FIXME: check latents shape.
201
+
202
+ for attn, ff in self.layers:
203
+ latents = attn(x, latents) + latents
204
+ latents = ff(latents) + latents
205
+
206
+ # Truncate latents to only keep query tokens.
207
+ if exists(text_embeds):
208
+ latents = latents[:, :, :self.latents.shape[0], :]
209
+
210
+ if exists(self.projection):
211
+ return self.projection(self.norm(latents))
212
+ else:
213
+ return self.norm(latents)
214
+
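A shape sketch for the resampler above; the dimensions are illustrative, not the release configuration (the real values come from Blip3VisionTokenizerConfig):

import torch

# (b, T, F, v, D_vis) vision features -> (b, T, num_latents, dim_inner) vision tokens.
resampler = InstructPerceiverResampler(dim_llm=3072, dim=1024, dim_inner=3072,
                                       num_latents=128, repeat_latents=False)
feats = torch.randn(2, 1, 1, 729, 1024)   # batch=2, one image, one frame, 729 patch tokens
tokens = resampler(feats)                 # torch.Size([2, 1, 128, 3072])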
215
+ class DecoupledEmbedding(nn.Embedding):
216
+ # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
217
+ """
218
+ Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practice, the
219
+ regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
220
+ then it will create `num_additional_embeddings` additional parameters that are always trained. If
221
+ `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
222
+ """
223
+
224
+ def __init__(
225
+ self,
226
+ max_original_id: int,
227
+ num_additional_embeddings: int = 0,
228
+ _weight: torch.Tensor = None,
229
+ num_original_embeddings: int = None,
230
+ embedding_dim: int = None,
231
+ partially_freeze=True,
232
+ device=None,
233
+ dtype=None,
234
+ pad_token_id=None,
235
+ ) -> None:
236
+ """
237
+ Args:
238
+ max_original_id (`int`):
239
+ The largest token id that should be embedded using the regular embedding (regular `weight`).
240
+ This is usually len(tokenizer) - 1 before additional tokens are added.
241
+ Note that this may not equal self.weight.shape[0]
242
+ num_additional_embeddings (`int`):
243
+ Number of additional tokens to initialize an Embedding matrix for (`additional_weight`).
244
+ _weight (`torch.Tensor`, *optional*, defaults to `None`): The regular weight tensor.
245
+ If provided, this sets the `num_original_embeddings` and `embedding_dim` parameters.
246
+ num_original_embeddings (`int`):
247
+ self.weight.shape[0]
248
+ embedding_dim (`int`):
249
+ The size of each embedding vector
250
+ partially_freeze (`bool`, *optional*, defaults to `True`):
251
+ If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
252
+ pad_token_id (`int`, *optional*):
253
+ The padding index (needs to be less than num_embeddings)
254
+
255
+ Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
256
+ `max_norm` or `norm_type`. We are not supporting these.
257
+ """
258
+ # validate args
259
+ if pad_token_id is not None and pad_token_id > max_original_id:
260
+ raise ValueError(
261
+ f"pad_token_id must be <= max_original_id. Got {pad_token_id} and {max_original_id}."
262
+ + "If the original tokenizer does not have a pad_token_id, use pad_token_id=None."
263
+ )
264
+ if _weight is not None:
265
+ assert (num_original_embeddings is None) or (
266
+ _weight.shape[0] == num_original_embeddings
267
+ ), f"num_original_embeddings={num_original_embeddings} but _weight.shape[0]={_weight.shape[0]}"
268
+ assert (embedding_dim is None) or (
269
+ _weight.shape[1] == embedding_dim
270
+ ), f"embedding_dim={embedding_dim} but _weight.shape[1]={_weight.shape[1]}"
271
+ num_original_embeddings = _weight.shape[0]
272
+ embedding_dim = _weight.shape[1]
273
+ else:
274
+ assert (
275
+ num_original_embeddings is not None
276
+ ), "num_original_embeddings must be provided if _weight is not provided"
277
+ assert (
278
+ embedding_dim is not None
279
+ ), "embedding_dim must be provided if _weight is not provided"
280
+
281
+ super().__init__(
282
+ num_embeddings=num_original_embeddings,
283
+ embedding_dim=embedding_dim,
284
+ device=device,
285
+ dtype=dtype,
286
+ padding_idx=pad_token_id,
287
+ _weight=_weight,
288
+ )
289
+ self.max_original_id = max_original_id
290
+ self.padding_idx = pad_token_id
291
+ self.num_additional_embeddings = num_additional_embeddings
292
+ if self.num_additional_embeddings > 0:
293
+ self.additional_embedding = nn.Embedding(
294
+ num_embeddings=self.num_additional_embeddings,
295
+ embedding_dim=embedding_dim,
296
+ device=device,
297
+ dtype=dtype,
298
+ )
299
+ self.set_requires_grad(
300
+ require_regular_grad=not partially_freeze, require_additional_grad=True
301
+ )
302
+
303
+ def set_requires_grad(self, require_regular_grad, require_additional_grad):
304
+ """
305
+ Helper function to separately set the requires_grad flag for the regular weight and the additional weight.
306
+ """
307
+ self.weight.requires_grad_(require_regular_grad)
308
+ self.additional_embedding.requires_grad_(require_additional_grad)
309
+
310
+ def forward(self, input_ids):
311
+ """
312
+ we have 2 embeddings, with different indices - one pretrained self.weight and another
313
+ self.additional_embedding.weight that is being trained.
314
+
315
+ in order to make a lookup of the input ids, we:
316
+ 1. find out the indices of the entries belonging to the 2nd embedding
317
+ 2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
318
+ embedding starts from 0 and not num_embeddings
319
+ 3. perform the 2nd embedding lookup
320
+ 4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
321
+ 5. perform the 1st embedding lookup
322
+ 6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
323
+
324
+ note: for the 1st embedding lookup we could have looked up only the low indices and skipped the padding, but
325
+ then we would have to create a new tensor and populate it from 2 tensors spread out across various indices -
326
+ i.e. not a simple concat. We have not benchmarked whether that more complex approach is faster; since sequence
327
+ lengths are usually fairly short, it is probably not faster (or not by much), but it would be worth
328
+ measuring.
329
+
330
+ """
331
+ if self.num_additional_embeddings == 0:
332
+ return F.embedding(input_ids, self.weight)
333
+
334
+ # Clone so that we don't modify the original input_ids later on
335
+ input_ids = input_ids.clone()
336
+ additional_vocab_indices = torch.where(input_ids > self.max_original_id)
337
+ input_ids_additional_vocab = input_ids[additional_vocab_indices]
338
+ additional_embeddings = self.additional_embedding(
339
+ input_ids_additional_vocab - self.max_original_id - 1
340
+ )
341
+
342
+ # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
343
+ input_ids[additional_vocab_indices] = 0
344
+ full_vector = F.embedding(input_ids, self.weight)
345
+
346
+ # overwrite the records with high indices
347
+ full_vector[additional_vocab_indices] = additional_embeddings
348
+
349
+ return full_vector
350
+
351
+ def extra_repr(self) -> str:
352
+ return "num_original_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
353
+ self.max_original_id + 1,
354
+ self.num_additional_embeddings,
355
+ self.embedding_dim,
356
+ (not self.weight.requires_grad),
357
+ )
358
+
359
+
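A small sketch of the decoupled lookup above, with illustrative sizes: ids at or below max_original_id hit the (optionally frozen) pretrained table, while higher ids are re-indexed into the trainable additional_embedding:

import torch

emb = DecoupledEmbedding(max_original_id=31999, num_additional_embeddings=3,
                         num_original_embeddings=32064, embedding_dim=8)
ids = torch.tensor([[5, 31999, 32000, 32002]])
out = emb(ids)                 # shape (1, 4, 8)
# 32000 and 32002 are looked up as additional_embedding rows 0 and 2.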
360
+ class DecoupledLinear(nn.Linear):
361
+ # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
362
+ """
363
+ Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practice, the
364
+ regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `additional_out_features` > 0,
365
+ then it will create `additional_out_features * in_features` additional parameters that are always trained. If
366
+ `additional_out_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
367
+ """
368
+
369
+ def __init__(
370
+ self,
371
+ max_original_id: int,
372
+ additional_out_features: int = 0,
373
+ _weight: torch.Tensor = None,
374
+ _bias: torch.Tensor = None,
375
+ in_features: int = None,
376
+ original_out_features: int = None,
377
+ bias: bool = True,
378
+ partially_freeze: bool = True,
379
+ device=None,
380
+ dtype=None,
381
+ ) -> None:
382
+ """
383
+ Args:
384
+ max_original_id (`int`): The largest token id that should be extracted from the regular weight.
385
+ This is usually len(tokenizer) - 1 before additional tokens are added.
386
+ Note that this may not equal original_out_features - 1
387
+ _weight: torch.Tensor, *optional*, defaults to `None`. The regular weight tensor.
388
+ If provided, this sets the `in_features` and `original_out_features` parameters.
389
+ _bias: torch.Tensor, *optional*, defaults to `None`. The regular bias tensor.
390
+ in_features: int. Input hidden size.
391
+ original_out_features: int. Original out_features of the language model's get_output_embeddings() function.
392
+ additional_out_features: int. Number of additional trainable dimensions.
393
+ bias: bool. Whether to include a bias term.
394
+ partially_freeze: bool, *optional*, defaults to `True`. If `True`, the regular `weight` will be frozen.
395
+ """
396
+ # argument validation
397
+ if _weight is not None:
398
+ assert (_weight.shape[0] == original_out_features) or (
399
+ original_out_features is None
400
+ ), f"original_out_features={original_out_features} but _weight.shape[0]={_weight.shape[0]}"
401
+ assert (_weight.shape[1] == in_features) or (
402
+ in_features is None
403
+ ), f"in_features={in_features} but _weight.shape[1]={_weight.shape[1]}"
404
+ in_features = _weight.shape[1]
405
+ original_out_features = _weight.shape[0]
406
+ else:
407
+ assert (
408
+ in_features is not None
409
+ ), "in_features must be provided if _weight is not provided"
410
+ assert (
411
+ original_out_features is not None
412
+ ), "original_out_features must be provided if _weight is not provided"
413
+
414
+ if _bias is not None:
415
+ assert bias is True, "bias must be True if _bias is provided"
416
+
417
+ # initialize original linear
418
+ super().__init__(
419
+ in_features,
420
+ original_out_features,
421
+ bias,
422
+ device,
423
+ dtype)
424
+
425
+ # set weight and bias manually
426
+ if _weight is not None:
427
+ self.weight = nn.Parameter(_weight)
428
+ if _bias is not None:
429
+ self.bias = nn.Parameter(_bias)
430
+
431
+ self.in_features = in_features
432
+ self.original_out_features = original_out_features
433
+ self.max_original_id = max_original_id
434
+
435
+ # initialize additional linear
436
+ self.additional_out_features = additional_out_features
437
+ self.has_bias = bias
438
+ if additional_out_features > 0:
439
+ self.additional_fc = nn.Linear(
440
+ in_features=in_features,
441
+ out_features=additional_out_features,
442
+ bias=self.has_bias,
443
+ device=device,
444
+ dtype=dtype,
445
+ )
446
+ self.set_requires_grad(
447
+ require_regular_grad=not partially_freeze, require_additional_grad=True
448
+ )
449
+
450
+ def set_requires_grad(self, require_regular_grad, require_additional_grad):
451
+ """
452
+ Helper function to separately set the requires_grad flag for the regular weight and the additional weight.
453
+ """
454
+ self.weight.requires_grad_(require_regular_grad)
455
+ if self.has_bias:
456
+ self.bias.requires_grad_(require_regular_grad)
457
+ self.additional_fc.requires_grad_(require_additional_grad)
458
+
459
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
460
+ output = F.linear(input, self.weight, self.bias)
461
+ output = output[..., : self.max_original_id + 1]
462
+
463
+ if self.additional_out_features > 0:
464
+ additional_features = F.linear(
465
+ input, self.additional_fc.weight, self.additional_fc.bias
466
+ )
467
+ output = torch.cat((output, additional_features), -1)
468
+ return output
469
+
470
+ def extra_repr(self) -> str:
471
+ """Overwriting `nn.Linear.extra_repr` to include new parameters."""
472
+ return "in_features={}, out_features={}, additional_out_features={}, bias={}, partially_freeze={}".format(
473
+ self.in_features,
474
+ self.max_original_id + 1,
475
+ self.additional_out_features,
476
+ self.bias is not None,
477
+ (not self.weight.requires_grad or not self.bias.requires_grad),
478
+ )
479
+
480
+ class VLM(nn.Module):
481
+ """
482
+ Generic vision-language model (VLM) class.
483
+ A VLM consists of four components:
484
+ 1. A vision encoder that extracts features from pixels, e.g. CLIP
485
+ input: (B, T_img, F, C, H, W)
486
+ output: (B, T_img, F, v, d)
487
+ 2. A vision tokenizer that converts these features to visual token-like embeddings, e.g. Perceiver, or a linear projection head
488
+ input: (B, T_img, F, v, d)
489
+ output: (B, T_img, n, d)
490
+ 3. A fusion method that allows the language model to attend to these tokens, e.g. cross-attention, or placing the tokens directly in the language model's input sequence
491
+ 4. A language model
492
+ """
493
+
494
+ def __init__(
495
+ self,
496
+ vision_encoder: nn.Module,
497
+ vision_tokenizer: nn.Module,
498
+ lang_model: nn.Module,
499
+ initial_tokenizer_len: int,
500
+ pad_token_id: int,
501
+ gradient_checkpointing: bool = False,
502
+ ):
503
+ """
504
+ Args:
505
+ vision_encoder (nn.Module): e.g. CLIP
506
+ vision_tokenizer (nn.Module): e.g. PerceiverResampler
507
+ lang_model (nn.Module): e.g. MPT
508
+ initial_tokenizer_len (int): size of the original tokenizer vocab
509
+ pad_token_id (int): id of the pad token
510
+ gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
511
+ """
512
+ super().__init__()
513
+
514
+ # save dimension information
515
+ self.lang_embedding_dim = lang_model.get_input_embeddings().weight.shape[1]
516
+ if hasattr(lang_model.config, "d_model"):
517
+ self.lang_hidden_dim = lang_model.config.d_model # mpt uses d_model
518
+ else:
519
+ self.lang_hidden_dim = lang_model.config.hidden_size
520
+ self.vis_embedding_dim = vision_tokenizer.dim_media
521
+ self.num_tokens_per_vis = vision_tokenizer.num_tokens_per_media
522
+
523
+ # core components
524
+ self.vision_encoder = vision_encoder
525
+ self.vision_tokenizer = vision_tokenizer
526
+ self.lang_model = lang_model
527
+
528
+ # lm embeddings
529
+ self.pad_token_id = pad_token_id
530
+ self.initial_tokenizer_len = initial_tokenizer_len
531
+ input_embeds = DecoupledEmbedding(
532
+ max_original_id=initial_tokenizer_len - 1,
533
+ num_additional_embeddings=len(self.special_tokens),
534
+ _weight=self.lang_model.get_input_embeddings().weight,
535
+ pad_token_id=self.pad_token_id,
536
+ )
537
+ if hasattr(input_embeds, "additional_embedding"):
538
+ input_embeds.additional_embedding.weight.data.normal_(
539
+ mean=0.0,
540
+ std=self.lang_model.config.initializer_range
541
+ if hasattr(self.lang_model.config, "initializer_range")
542
+ else 0.02,
543
+ )
544
+ self.lang_model.set_input_embeddings(input_embeds)
545
+
546
+ out_embeds = DecoupledLinear(
547
+ max_original_id=initial_tokenizer_len - 1,
548
+ additional_out_features=len(self.special_tokens),
549
+ _weight=self.lang_model.get_output_embeddings().weight,
550
+ _bias=self.lang_model.get_output_embeddings().bias if hasattr(self.lang_model.get_output_embeddings(), "bias") else None,
551
+ )
552
+ if hasattr(out_embeds, "additional_fc"):
553
+ out_embeds.additional_fc.weight.data.normal_(
554
+ mean=0.0,
555
+ std=self.lang_model.config.initializer_range
556
+ if hasattr(self.lang_model.config, "initializer_range")
557
+ else 0.02,
558
+ )
559
+ self.lang_model.set_output_embeddings(out_embeds)
560
+
561
+ # gradient checkpointing
562
+ self.vision_tokenizer._use_gradient_checkpointing = gradient_checkpointing
563
+
564
+ def forward(
565
+ self,
566
+ vision_x: Optional[torch.Tensor],
567
+ lang_x: torch.Tensor,
568
+ attention_mask: Optional[torch.Tensor] = None,
569
+ labels: Optional[torch.Tensor] = None,
570
+ past_key_values: Optional[
571
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
572
+ ] = None,
573
+ past_media_locations: Optional[torch.Tensor] = None,
574
+ past_vision_tokens: Optional[torch.Tensor] = None,
575
+ use_cache: Optional[bool] = False,
576
+ **kwargs,
577
+ ):
578
+ """
579
+ Args:
580
+ vision_x: Vision input
581
+ shape (B, T_img, F, C, H, W) with F=1
582
+ only F = 1 is supported (single-frame videos)
583
+ if T_img > the number of media tokens in the corresponding input_ids (lang_x),
584
+ only the leading images, up to the number of media tokens in lang_x, are used
585
+ lang_x: Language input ids, with media tokens denoting where
586
+ visual media should be inserted.
587
+ shape (B, T_txt)
588
+ attention_mask: Attention mask. Defaults to None.
589
+ labels: Labels. Defaults to None.
590
+ shape (B, T_txt)
591
+ past_key_values (Tuple[torch.Tensor]], optional): Past key value pairs for each of the T_txt previous tokens in the language model. Defaults to None.
592
+ list of length = number of decoder layers in the LM
593
+ exact implementation depends on LM, see Hugging Face docs
594
+ past_media_locations (torch.Tensor, optional): boolean mask denoting which of the previous T_txt tokens were media tokens. Defaults to None.
595
+ shape (B, T_txt)
596
+ past_vision_tokens (torch.Tensor, optional): Previous vision tokens. Defaults to None.
597
+ use_cache (Optional[bool], optional): Whether to use cache. Defaults to False.
598
+ If True, includes key_values, media_locations, and vision_tokens in the output.
599
+ """
600
+ assert not (past_vision_tokens is None) ^ (
601
+ past_media_locations is None
602
+ ), "past_vision_tokens and past_media_locations must both be None or both be not None"
603
+
604
+ # convert pixels to vision tokens
605
+ if vision_x is not None:
606
+ vision_features = self._encode_vision_x(vision_x=vision_x)
607
+ vision_tokens = self.vision_tokenizer(vision_features)
608
+ else:
609
+ vision_tokens = None
610
+
611
+ # fuse the vision and language tokens
612
+ new_inputs = self._prepare_inputs_for_forward(
613
+ vision_tokens=vision_tokens,
614
+ lang_x=lang_x,
615
+ attention_mask=attention_mask,
616
+ labels=labels,
617
+ past_key_values=past_key_values,
618
+ past_media_locations=past_media_locations,
619
+ padding_side="right",
620
+ past_vision_tokens=past_vision_tokens,
621
+ )
622
+ output = self.lang_model(
623
+ **new_inputs,
624
+ use_cache=use_cache,
625
+ past_key_values=past_key_values,
626
+ **kwargs,
627
+ )
628
+
629
+ # postprocessing may be needed, e.g. to remove extra tokens from logits that were inserted into the language stream
630
+ # or to add the past_vision_tokens and past_media_locations to the output
631
+ output = self._postprocess_outputs_from_forward(
632
+ output=output,
633
+ lang_x=lang_x,
634
+ vision_tokens=vision_tokens,
635
+ use_cache=use_cache,
636
+ past_vision_tokens=past_vision_tokens,
637
+ past_media_locations=past_media_locations,
638
+ )
639
+
640
+ # postforward hooks
641
+ self._post_forward_hook()
642
+ return output
643
+
644
+ def _encode_vision_x_anyres(self, samples, device):
645
+ image_raw = samples["image"] # list of per-image patch tensors, each of shape [1, N_patch, C, H, W]
646
+ image_sizes = samples["image_size"]
647
+
648
+ # concatenate the per-image patch lists into one batch of patches for any-res encoding.
649
+ images = [x.squeeze(0) for x in image_raw] # [N_patch, C, H, W]
650
+ image = torch.cat(images, dim=0) # [\sum_{B}(N_patch_i), C, H, W]
651
+ image = image.to(device)
652
+
653
+ with torch.no_grad():
654
+ if self.vision_encoder.__class__.__name__ == "TimmModel":
655
+ image_embeds = self.vision_encoder.trunk.forward_features(image)
656
+ elif self.vision_encoder.__class__.__name__ == 'CLIPVisionModel':
657
+ image_embeds = self.vision_encoder(image).last_hidden_state
658
+ else:
659
+ image_embeds = self.vision_encoder(image)[1] # OpenCLIP returns tuples
660
+
661
+ if isinstance(self.vision_encoder, CLIPVisionModel):
662
+ base_img_size = self.vision_encoder.config.image_size
663
+ else:
664
+ base_img_size = self.vision_encoder.image_size[0]
665
+
666
+ if self.vision_encoder.__class__.__name__ == "TimmModel":
667
+ grid_size = self.vision_encoder.trunk.patch_embed.grid_size
668
+ elif self.vision_encoder.__class__.__name__ == 'CLIPVisionModel':
669
+ grid_size_base = self.vision_encoder.config.image_size // self.vision_encoder.config.patch_size
670
+ grid_size = (grid_size_base, grid_size_base)
671
+ else:
672
+ grid_size = self.vision_encoder.grid_size
673
+ height, width = grid_size
674
+
675
+ if not image_embeds.shape[1] == height * width:
676
+ assert image_embeds.shape[1] == height * width + 1 # For vision encoders that have a [CLS] token.
677
+ image_embeds = image_embeds[:, 1:, :] # Drop the cls token for each patch.
678
+ n_vis_token_per_patch = image_embeds.shape[1]
679
+
680
+ # Split encoded patches and merge patch features
681
+ # 1. Get the raw sizes from samples, and split the image embeds [\sum_{B}(N_patch_i), N_tok(16*16), C]
682
+ split_sizes = [image.shape[0] for image in images]
683
+ image_embeds = torch.split(image_embeds, split_sizes, dim=0)
684
+ # 2. For each image (consisting of a list of patches), merge the patches spatially (of shape [C, n_patch_height, n_patch_width])
685
+ new_image_embeds = []
686
+ patch_attn_masks = []
687
+ max_n_img_token = -1
688
+ for idx, patch_embeds in enumerate(image_embeds):
689
+ if patch_embeds.shape[0] > 1:
690
+ # 3. Flatten the patch features and get [C, n_patch_height * (n_patch_width+1)]
691
+ base_patch_embeds = patch_embeds[0] # TODO: prepend the CLS token for the base patch embeds (of the resized entire image).
692
+ patch_embeds = patch_embeds[1:]
693
+
694
+ assert height * width == base_patch_embeds.shape[0]
695
+
696
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[idx],
697
+ [[base_img_size,base_img_size*2],
698
+ [base_img_size*2,base_img_size],
699
+ [base_img_size*2,base_img_size*2],
700
+ [base_img_size*3,base_img_size],
701
+ [base_img_size,base_img_size*3]],
702
+ base_img_size) # Hardcoded grid_pinpoints.
703
+ patch_embeds = patch_embeds.view(num_patch_height, num_patch_width, height, width, -1)
704
+
705
+ patch_embeds = patch_embeds.permute(4, 0, 2, 1, 3).contiguous()
706
+ patch_embeds = patch_embeds.flatten(1, 2).flatten(2, 3)
707
+ # TODO: add an option that return masked patch_embeds instead of trimmed.
708
+ patch_embeds, patch_attn_mask = unpad_image(patch_embeds, image_sizes[idx], self.anyres_patch_sampling)
709
+ if hasattr(self, 'image_newline'):
710
+ patch_embeds = torch.cat((
711
+ patch_embeds,
712
+ self.image_newline[:, None, None].expand(*patch_embeds.shape[:-1], 1)
713
+ ), dim=-1)
714
+ if self.anyres_patch_sampling:
715
+ patch_embeds = patch_embeds.view(-1, num_patch_height, num_patch_width, height*width)
716
+ patch_embeds = patch_embeds.flatten(1, 2).permute(1, 2, 0)
717
+ assert patch_attn_mask is not None
718
+ patch_attn_mask = patch_attn_mask.view(num_patch_height, num_patch_width, height*width)
719
+ patch_attn_mask = patch_attn_mask.flatten(0, 1)
720
+ patch_embeds = torch.cat((base_patch_embeds.unsqueeze(0), patch_embeds), dim=0)
721
+ patch_attn_mask = torch.cat((torch.ones(n_vis_token_per_patch, device=patch_embeds.device).unsqueeze(0), patch_attn_mask), dim=0)
722
+ else:
723
+ patch_embeds = patch_embeds.flatten(1, 2).transpose(0, 1)
724
+ patch_embeds = torch.cat((base_patch_embeds, patch_embeds), dim=0)
725
+ else:
726
+ patch_embeds = patch_embeds[0].unsqueeze(0) if self.anyres_patch_sampling else patch_embeds[0]
727
+ patch_attn_mask = torch.ones(n_vis_token_per_patch, device=patch_embeds.device).unsqueeze(0) if self.anyres_patch_sampling else None
728
+ if hasattr(self, 'image_newline'):
729
+ patch_embeds = torch.cat((
730
+ patch_embeds,
731
+ self.image_newline[None]
732
+ ), dim=0)
733
+ if not self.anyres_patch_sampling:
734
+ max_n_img_token = max(patch_embeds.shape[0], max_n_img_token)
735
+
736
+ new_image_embeds.append(patch_embeds)
737
+ patch_attn_masks.append(patch_attn_mask)
738
+
739
+ if self.anyres_patch_sampling:
740
+ # Return individual patches for independent token downsampling.
741
+ return new_image_embeds, patch_attn_masks
742
+
743
+ # 4. Pad and concat the list of image_embeds [N_tok_i, C] together into a batch. Also modify the query attention mask.
744
+ image_embeds = []
745
+ image_atts = []
746
+ for image_embed in new_image_embeds:
747
+ n_img_token = image_embed.shape[0]
748
+ img_attn = torch.ones((max_n_img_token), dtype=torch.long, device=image_embed.device)
749
+ if n_img_token < max_n_img_token:
750
+ padded_embed = torch.zeros((max_n_img_token, image_embed.shape[-1]), dtype=image_embed.dtype, device=image_embed.device)
751
+ padded_embed[:n_img_token, :] = image_embed
752
+ img_attn[n_img_token:] = 0 # Mask out the padded entries.
753
+ else:
754
+ padded_embed = image_embed
755
+ image_embeds.append(padded_embed)
756
+ image_atts.append(img_attn)
757
+ image_embeds = torch.stack(image_embeds, dim=0) # Shape [B, N_tok_longest, C_dim]
758
+ image_atts = torch.stack(image_atts, dim=0) # Shape [B, N_tok_longest]
759
+ # TODO: reshape image_embeds and image_atts to "b T F v d"
760
+ image_embeds = image_embeds[:, None, None, :, :]
761
+ # image_atts = image_atts[:, None, None, :, :]
762
+
763
+ return image_embeds, image_atts
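Step 4 above (pad each image's merged token sequence to the longest one in the batch and mask the padding) in isolation, with tiny made-up sizes:

import torch

embeds = [torch.randn(5, 4), torch.randn(8, 4)]   # two images with 5 and 8 tokens, feature dim 4
max_len = max(e.shape[0] for e in embeds)
padded, masks = [], []
for e in embeds:
    pad = torch.zeros(max_len, e.shape[-1], dtype=e.dtype)
    pad[: e.shape[0]] = e
    mask = torch.zeros(max_len, dtype=torch.long)
    mask[: e.shape[0]] = 1                         # 1 = real token, 0 = padding
    padded.append(pad)
    masks.append(mask)
image_embeds = torch.stack(padded)   # (B, N_tok_longest, C)
image_atts = torch.stack(masks)      # (B, N_tok_longest)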
764
+
765
+ def _encode_vision_x(self, vision_x: torch.Tensor):
766
+ """
767
+ Compute vision features from the vision input by passing it through the vision encoder.
768
+ Args:
769
+ vision_x: Vision input
770
+ shape (B, T_img, F, C, H, W)
771
+ Images in the same chunk are collated along T_img, and frames are collated along F
772
+ Currently only F=1 is supported (single-frame videos)
773
+
774
+ rearrange code based on https://github.com/dhansmair/flamingo-mini
775
+ """
776
+ assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
777
+ b, T, F = vision_x.shape[:3]
778
+
779
+ vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
780
+ with torch.no_grad():
781
+ if self.vision_encoder.__class__.__name__ == "TimmModel":
782
+ vision_x = self.vision_encoder.trunk.forward_features(vision_x)
783
+ elif self.vision_encoder.__class__.__name__ == 'CLIPVisionModel':
784
+ vision_x = self.vision_encoder(vision_x).last_hidden_state
785
+ else:
786
+ vision_x = self.vision_encoder(vision_x)[1] # OpenCLIP returns tuples
787
+ vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
788
+ return vision_x
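The collate/uncollate around the encoder is an einops round-trip; a stand-alone sketch with made-up sizes (b=2, T=3, F=1, and a fake encoder output with v=16, d=8):

import torch
from einops import rearrange

x = torch.randn(2, 3, 1, 3, 224, 224)                               # (b, T, F, c, h, w)
flat = rearrange(x, "b T F c h w -> (b T F) c h w")                 # (6, 3, 224, 224)
feats = torch.randn(flat.shape[0], 16, 8)                           # stand-in for the encoder output (v, d)
out = rearrange(feats, "(b T F) v d -> b T F v d", b=2, T=3, F=1)
assert out.shape == (2, 3, 1, 16, 8)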
789
+
790
+ def _concat_vision_cache(
791
+ self, lang_x, vision_tokens, past_vision_tokens, past_media_locations, use_cache
792
+ ):
793
+ """
794
+ Helper function to include the past vision tokens and past media locations in the output.
795
+ """
796
+ if use_cache:
797
+ if past_media_locations is not None and past_vision_tokens is not None:
798
+ if vision_tokens is not None:
799
+ updated_vision_tokens = torch.cat(
800
+ [
801
+ past_vision_tokens,
802
+ vision_tokens,
803
+ ],
804
+ dim=1,
805
+ )
806
+ else:
807
+ updated_vision_tokens = past_vision_tokens
808
+ updated_media_locations = torch.cat(
809
+ [
810
+ past_media_locations,
811
+ lang_x == self.media_token_id,
812
+ ],
813
+ dim=1,
814
+ )
815
+ else:
816
+ updated_vision_tokens = vision_tokens
817
+ updated_media_locations = lang_x == self.media_token_id
818
+
819
+ else:
820
+ updated_vision_tokens = None
821
+ updated_media_locations = None
822
+
823
+ return updated_vision_tokens, updated_media_locations
824
+
825
+ def generate(
826
+ self,
827
+ vision_x: torch.Tensor,
828
+ lang_x: torch.Tensor,
829
+ attention_mask: torch.Tensor = None,
830
+ past_key_values: Optional[
831
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
832
+ ] = None,
833
+ past_media_locations: Optional[torch.Tensor] = None,
834
+ past_vision_tokens: Optional[torch.Tensor] = None,
835
+ **kwargs,
836
+ ):
837
+ """
838
+ Generate text conditioned on vision and language inputs.
839
+ Args:
840
+ vision_x (torch.Tensor): Vision input
841
+ shape (B, T_img, F, C, H, W)
842
+ see documentation for forward
843
+ lang_x (torch.Tensor): Language input
844
+ shape (B, T_txt)
845
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
846
+ **kwargs: see generate documentation in Hugging Face CausalLM models.
847
+ Returns:
848
+ torch.Tensor: lang_x with generated tokens appended to it
849
+ """
850
+ num_beams = kwargs.pop("num_beams", 1)
851
+
852
+ # convert pixels to vision tokens
853
+ if vision_x is not None:
854
+ vision_features = self._encode_vision_x(vision_x=vision_x)
855
+ vision_tokens = self.vision_tokenizer(vision_features)
856
+ else:
857
+ vision_tokens = None
858
+
859
+ # fuse the vision and language tokens
860
+ # for xattn, vision_x and media_location are repeat_interleaved s.t.
861
+ # the total batch size is B * num_beams
862
+ new_inputs = self._prepare_inputs_for_forward(
863
+ vision_tokens=vision_tokens,
864
+ lang_x=lang_x,
865
+ attention_mask=attention_mask,
866
+ past_key_values=past_key_values,
867
+ past_media_locations=past_media_locations,
868
+ past_vision_tokens=past_vision_tokens,
869
+ padding_side="left",
870
+ num_beams=num_beams,
871
+ )
872
+ output = self.lang_model.generate(
873
+ **new_inputs,
874
+ past_key_values=past_key_values,
875
+ num_beams=num_beams,
876
+ use_cache=True,
877
+ **kwargs,
878
+ )
879
+ self._post_forward_hook()
880
+ return output
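A note on the padding sides: generate pads on the left so that every sequence's final position is a real token for the model to continue from, while the training forward pads on the right (padded positions are simply masked out of the loss). A tiny illustration, assuming pad id 0:

import torch

# prompts of length 3 and 2, left-padded for generation
input_ids = torch.tensor([[5, 6, 7], [0, 8, 9]])
attention_mask = torch.tensor([[1, 1, 1], [0, 1, 1]])
# the last column is a real token for both rows, so generated tokens append cleanly
assert bool(attention_mask[:, -1].all())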
881
+
882
+ @property
883
+ def num_trainable_params(self):
884
+ """Return the number of trainable parameters"""
885
+ return num_params(self, filter_to_trainable=True)
886
+
887
+ def set_trainable(self):
888
+ """
889
+ Freeze appropriate parameters in the model.
890
+ """
891
+ raise NotImplementedError
892
+
893
+ def group_params_by_weight_decay(self):
894
+ """
895
+ Return a tuple of (params to optimize w/ weight decay, params to optimize w/o weight decay)
896
+ """
897
+ params_with_wd, params_without_wd = [], []
898
+ for n, p in self.named_parameters():
899
+ if p.requires_grad:
900
+ if self._should_apply_weight_decay(n):
901
+ params_with_wd.append(p)
902
+ else:
903
+ params_without_wd.append(p)
904
+ return params_with_wd, params_without_wd
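A sketch of the intended consumer of these two groups, assuming `model` is an instantiated subclass; the optimizer is built elsewhere in the training code, and the lr / weight_decay values below are placeholders rather than the released configuration:

import torch

params_with_wd, params_without_wd = model.group_params_by_weight_decay()
optimizer = torch.optim.AdamW(
    [
        {"params": params_with_wd, "weight_decay": 0.01},
        {"params": params_without_wd, "weight_decay": 0.0},
    ],
    lr=1e-4,
)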
905
+
906
+ def _should_apply_weight_decay(self, parameter_name):
907
+ """
908
+ Return whether weight decay should be applied to a parameter.
909
+ """
910
+ raise NotImplementedError
911
+
912
+ @property
913
+ def special_tokens(self):
914
+ """
915
+ Returns a dict mapping from the attribute name of a special token to its string format,
916
+ e.g. "media_token": "<image>"
917
+ """
918
+ assert (
919
+ "media_token" in self._special_tokens
920
+ ), "VLMs need to request that the tokenizer add a media_token and call set_special_token_ids to set self.media_token_id"
921
+ return self._special_tokens
922
+
923
+ @property
924
+ def special_token_ids(self):
925
+ """
926
+ Returns a list of the special token ids
927
+ """
928
+ return [getattr(self, f"{att_name}_id") for att_name in self.special_tokens]
929
+
930
+ def set_special_token_ids(self, string_to_ids):
931
+ """
932
+ Args:
933
+ string_to_ids (dict): mapping from token string to id
934
+ """
935
+ assert set(self.special_tokens.values()).issubset(set(string_to_ids.keys()))
936
+ for att_name, token_str in self.special_tokens.items():
937
+ token_id = string_to_ids[token_str]
938
+ setattr(self, f"{att_name}_id", token_id)
939
+ setattr(self.lang_model, f"{att_name}_id", token_id)
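A sketch of the expected wiring between a Hugging Face tokenizer and this method, assuming an already-instantiated `model`; the real setup lives in the factory code, so the variable names here are assumptions:

tokenizer.add_special_tokens(
    {"additional_special_tokens": list(model.special_tokens.values())}
)
model.set_special_token_ids(
    {tok: tokenizer.convert_tokens_to_ids(tok) for tok in model.special_tokens.values()}
)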
940
+
941
+ def init_gradient_checkpointing(self):
942
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
943
+ checkpoint_wrapper,
944
+ CheckpointWrapper,
945
+ CheckpointImpl,
946
+ apply_activation_checkpointing,
947
+ )
948
+ from functools import partial
949
+
950
+ non_reentrant_wrapper = partial(
951
+ checkpoint_wrapper,
952
+ checkpoint_impl=CheckpointImpl.NO_REENTRANT,
953
+ )
954
+ apply_activation_checkpointing(
955
+ self,
956
+ checkpoint_wrapper_fn=non_reentrant_wrapper,
957
+ check_fn=lambda m: getattr(m, "_use_gradient_checkpointing", False)
958
+ and not isinstance(m, CheckpointWrapper),
959
+ )
960
+
961
+ @dataclass
962
+ class VLMOutputWithPast(CausalLMOutputWithPast):
963
+ """
964
+ VLMOutputWithPast is a wrapper around CausalLMOutputWithPast that adds the following attributes:
965
+ past_media_locations: Optional[torch.Tensor] = None,
966
+ past_vision_tokens: Optional[torch.Tensor] = None,
967
+ """
968
+
969
+ past_media_locations: Optional[torch.Tensor] = None
970
+ past_vision_tokens: Optional[torch.Tensor] = None
971
+
972
+
973
+ def exists(val):
974
+ return val is not None
975
+
976
+
977
+ def FeedForward(dim, mult=4):
978
+ inner_dim = int(dim * mult)
979
+ return nn.Sequential(
980
+ nn.LayerNorm(dim),
981
+ nn.Linear(dim, inner_dim, bias=False),
982
+ nn.GELU(),
983
+ nn.Linear(inner_dim, dim, bias=False),
984
+ )
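The block maps (..., dim) back to (..., dim); a quick shape check using the FeedForward defined above, with arbitrary sizes:

import torch

ff = FeedForward(dim=32, mult=4)                       # inner width 128
assert ff(torch.randn(2, 7, 32)).shape == (2, 7, 32)   # shape-preserving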
985
+
986
+ class VLMWithLanguageStream(VLM):
987
+ """
988
+ VLM that fuses modalities by inserting vision tokens directly into the language stream.
989
+ """
990
+
991
+ def __init__(
992
+ self,
993
+ vision_encoder: nn.Module,
994
+ vision_tokenizer: nn.Module,
995
+ lang_model: nn.Module,
996
+ initial_tokenizer_len: int,
997
+ pad_token_id: int,
998
+ decoder_layers_attr_name: str = None,
999
+ gradient_checkpointing: bool = False,
1000
+ ):
1001
+ super().__init__(
1002
+ vision_encoder=vision_encoder,
1003
+ vision_tokenizer=vision_tokenizer,
1004
+ lang_model=lang_model,
1005
+ initial_tokenizer_len=initial_tokenizer_len,
1006
+ pad_token_id=pad_token_id,
1007
+ gradient_checkpointing=gradient_checkpointing,
1008
+ )
1009
+ self.decoder_layers_attr_name = decoder_layers_attr_name
1010
+ if decoder_layers_attr_name is not None:
1011
+ for block in getattr_recursive(self.lang_model, self.decoder_layers_attr_name):
1012
+ block._use_gradient_checkpointing = gradient_checkpointing
1013
+
1014
+ def _prepare_inputs_for_forward(
1015
+ self,
1016
+ vision_tokens: torch.Tensor,
1017
+ lang_x: torch.Tensor,
1018
+ attention_mask: torch.Tensor,
1019
+ labels: torch.Tensor = None,
1020
+ past_key_values=None,
1021
+ vision_attention_mask: Optional[torch.Tensor] = None,
1022
+ past_media_locations: torch.Tensor = None,
1023
+ past_vision_tokens: torch.Tensor = None,
1024
+ padding_side: str = "left",
1025
+ num_beams: int = 1,
1026
+ ):
1027
+ """
1028
+ Insert the vision tokens directly into the language stream.
1029
+ This requires us to modify the input_ids, attention_mask, and labels.
1030
+ """
1031
+ if past_key_values is not None:
1032
+ past_len = past_key_values[0][0].shape[2]
1033
+ assert attention_mask.shape[1] == past_len + lang_x.shape[1], (
1034
+ "Attention_mask must be as long as the entire past len (including image tokens) and current input IDs. "
1035
+ + "Check that you've expanded the attention mask to account for past image tokens."
1036
+ )
1037
+
1038
+ if vision_tokens is None:
1039
+ return {
1040
+ "input_ids": lang_x,
1041
+ "attention_mask": attention_mask,
1042
+ "labels": labels,
1043
+ }
1044
+
1045
+ # get the language embeddings
1046
+ lang_embeds = self.lang_model.get_input_embeddings()(lang_x)
1047
+
1048
+ # build up the multimodal embeddings
1049
+ B = lang_x.shape[0]
1050
+ has_labels = labels is not None
1051
+ multimodal_embeds = []
1052
+ multimodal_attention_mask = []
1053
+ multimodal_labels = [] if has_labels else None
1054
+ for i in range(B):
1055
+ # get index of <image> tokens in lang_x[i]
1056
+ image_token_idxs = torch.where(lang_x[i] == self.media_token_id)[0]
1057
+
1058
+ if len(image_token_idxs) == 0:
1059
+ multimodal_embeds.append(lang_embeds[i].clone())
1060
+ multimodal_attention_mask.append(attention_mask[i].clone())
1061
+ if has_labels:
1062
+ multimodal_labels.append(labels[i].clone())
1063
+ continue
1064
+
1065
+ # since an image is represented by self.num_tokens_per_vis tokens, we need to offset the image_token_idxs
1066
+ for j, img_idx in enumerate(image_token_idxs):
1067
+ image_token_idxs[j] += (self.num_tokens_per_vis - 1) * j # FIXME: different offset for any-resolution encoding when there are multiple images.
1068
+
1069
+ # loop through the image_token_idxs and insert the vision tokens
1070
+ new_embed = lang_embeds[i].clone()
1071
+ new_attention_mask = (
1072
+ attention_mask[i].clone() if attention_mask is not None else None
1073
+ )
1074
+ if has_labels:
1075
+ new_label = labels[i].clone()
1076
+
1077
+ for img_num, img_idx in enumerate(image_token_idxs):
1078
+ if img_num > 0:
1079
+ # FIXME: hardcoded as such to avoid assertion error, but this only works for single image samples.
1080
+ break
1081
+ # Get vision token attention mask for padded llava-style any resolution image tokens.
1082
+ if self.image_aspect_ratio == 'anyres':
1083
+ num_vis_tokens = vision_tokens[i][img_num].shape[0]
1084
+ if vision_attention_mask is not None:
1085
+ vis_attention_mask = vision_attention_mask[i]
1086
+ else:
1087
+ vis_attention_mask = torch.ones(
1088
+ num_vis_tokens, dtype=torch.long
1089
+ ).to(attention_mask.device)
1090
+ else:
1091
+ assert (
1092
+ vision_tokens[i][img_num].shape[0] == self.num_tokens_per_vis
1093
+ ), f"vision token number mismatch: image embedding ({vision_tokens[i][img_num].shape[0]}) \
1094
+ vs. model.num_tokens_per_vis ({self.num_tokens_per_vis})"
1095
+ # By default, vision tokens are not padded.
1096
+ num_vis_tokens = self.num_tokens_per_vis
1097
+ vis_attention_mask = torch.ones(
1098
+ num_vis_tokens, dtype=torch.long
1099
+ ).to(attention_mask.device)
1100
+
1101
+
1102
+ new_embed = torch.cat(
1103
+ (
1104
+ new_embed[:img_idx],
1105
+ vision_tokens[i][img_num],
1106
+ new_embed[img_idx + 1 :],
1107
+ ),
1108
+ dim=0,
1109
+ )
1110
+ new_attention_mask = torch.cat(
1111
+ (
1112
+ new_attention_mask[:img_idx],
1113
+ vis_attention_mask,
1114
+ new_attention_mask[img_idx + 1 :],
1115
+ ),
1116
+ dim=0,
1117
+ )
1118
+ if has_labels:
1119
+ new_label = torch.cat(
1120
+ (
1121
+ new_label[:img_idx],
1122
+ torch.ones(num_vis_tokens, dtype=torch.long).to(
1123
+ labels.device
1124
+ )
1125
+ * -100,
1126
+ new_label[img_idx + 1 :],
1127
+ ),
1128
+ dim=0,
1129
+ )
1130
+ multimodal_embeds.append(new_embed)
1131
+ multimodal_attention_mask.append(new_attention_mask)
1132
+ if has_labels:
1133
+ multimodal_labels.append(new_label)
1134
+
1135
+ # stack
1136
+ multimodal_embeds = stack_with_padding(
1137
+ multimodal_embeds,
1138
+ padding_value=self.pad_token_id,
1139
+ padding_side=padding_side,
1140
+ )
1141
+ multimodal_attention_mask = stack_with_padding(
1142
+ multimodal_attention_mask,
1143
+ padding_value=0,
1144
+ padding_side=padding_side,
1145
+ )
1146
+ if has_labels:
1147
+ multimodal_labels = stack_with_padding(
1148
+ multimodal_labels,
1149
+ padding_value=-100,
1150
+ padding_side=padding_side,
1151
+ )
1152
+
1153
+ return {
1154
+ "inputs_embeds": multimodal_embeds,
1155
+ "attention_mask": multimodal_attention_mask,
1156
+ "labels": multimodal_labels,
1157
+ }
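Length bookkeeping for the insertion above: each <image> token is replaced by num_tokens_per_vis vision-token embeddings, so a sequence grows by (num_tokens_per_vis - 1) per image before padding. A worked example with made-up sizes:

T_txt, num_tokens_per_vis, n_images = 32, 64, 1
fused_len = T_txt + n_images * (num_tokens_per_vis - 1)
assert fused_len == 95   # _postprocess_outputs_from_forward later contracts this back to T_txt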
1158
+
1159
+ def _postprocess_outputs_from_forward(
1160
+ self,
1161
+ output: CausalLMOutputWithPast,
1162
+ lang_x: torch.Tensor,
1163
+ vision_tokens: torch.Tensor,
1164
+ past_vision_tokens: torch.Tensor,
1165
+ past_media_locations: torch.Tensor,
1166
+ use_cache: bool = False,
1167
+ ):
1168
+ # Include the past vision tokens and past media locations in the output
1169
+ updated_vision_tokens, updated_media_locations = self._concat_vision_cache(
1170
+ lang_x=lang_x,
1171
+ vision_tokens=vision_tokens,
1172
+ past_vision_tokens=past_vision_tokens,
1173
+ past_media_locations=past_media_locations,
1174
+ use_cache=use_cache,
1175
+ )
1176
+
1177
+ # return logits that are the same shape as the original input_ids
1178
+ logits = output.logits
1179
+ batch_logits = []
1180
+ B, T_txt = lang_x.shape
1181
+ for i in range(B):
1182
+ sequence_logits = []
1183
+ logits_j = 0
1184
+ for j in range(T_txt):
1185
+ if lang_x[i, j] != self.media_token_id:
1186
+ sequence_logits.append(logits[i, logits_j])
1187
+ logits_j += 1
1188
+ else:
1189
+ # append the logit for the first image token, then skip over the rest
1190
+ # note: the model actually learns to predict <im_patch>, not <image>
1191
+ sequence_logits.append(logits[i, logits_j])
1192
+ logits_j += self.num_tokens_per_vis
1193
+ sequence_logits = torch.stack(sequence_logits, dim=0) # (T_txt, vocab_size)
1194
+ batch_logits.append(sequence_logits)
1195
+
1196
+ batch_logits = torch.stack(batch_logits, dim=0) # (B, T_txt, vocab_size)
1197
+ # The final logits shape should be the same as the original input_ids shape
1198
+ assert batch_logits.shape[:2] == (B, T_txt)
1199
+
1200
+ # assemble the output
1201
+ output = VLMOutputWithPast(
1202
+ loss=output.loss,
1203
+ logits=batch_logits,
1204
+ past_key_values=output.past_key_values,
1205
+ hidden_states=output.hidden_states,
1206
+ attentions=output.attentions,
1207
+ past_media_locations=updated_media_locations,
1208
+ past_vision_tokens=updated_vision_tokens,
1209
+ )
1210
+
1211
+ return output
1212
+
1213
+ def _post_forward_hook(self):
1214
+ pass
1215
+
1216
+
1217
+ @property
1218
+ def num_params_per_module(self):
1219
+ """Return a string summarizing the number of parameters per module in the model"""
1220
+ return "\n".join(
1221
+ [
1222
+ f"Vision encoder: {num_params(self.vision_encoder):,} parameters",
1223
+ f"Vision tokenizer: {num_params(self.vision_tokenizer):,} parameters",
1224
+ f"Language model: {num_params(self.lang_model):,} parameters",
1225
+ ]
1226
+ )
1227
+
1228
+ @property
1229
+ def num_trainable_params_per_module(self):
1230
+ """Return a string summarizing the number of trainable parameters per module in the model"""
1231
+ return "\n".join(
1232
+ [
1233
+ f"Vision encoder: {num_params(self.vision_encoder, filter_to_trainable=True):,} trainable parameters",
1234
+ f"Vision tokenizer: {num_params(self.vision_tokenizer, filter_to_trainable=True):,} trainable parameters",
1235
+ f"Language model: {num_params(self.lang_model, filter_to_trainable=True):,} trainable parameters",
1236
+ ]
1237
+ )
1238
+
1239
+
1240
+ class KosmosInstruct(VLMWithLanguageStream):
1241
+ def __init__(
1242
+ self,
1243
+ vision_encoder: nn.Module,
1244
+ vision_tokenizer: nn.Module,
1245
+ lang_model: nn.Module,
1246
+ initial_tokenizer_len: int,
1247
+ pad_token_id: int,
1248
+ decoder_layers_attr_name: str = None,
1249
+ gradient_checkpointing: bool = False,
1250
+ image_aspect_ratio: str = 'pad',
1251
+ anyres_patch_sampling: bool = False
1252
+ ):
1253
+ """
1254
+ Args:
1255
+ vision_encoder (nn.Module): HF CLIPModel
1256
+ lang_model (nn.Module): HF causal language model
1257
+ vision_tokenizer (nn.Module): module that maps vision-encoder features to visual tokens, e.g. PerceiverResampler
1258
+ initial_tokenizer_len (int): size of the tokenizer vocab
1259
+ pad_token_id (int): id of the padding token. None if no padding token; then a padding token
1260
+ will be inserted into self.special_tokens, which factory.py fills after creating new tokens
1261
+ decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
1262
+ gradient_checkpointing (bool, optional): whether to use gradient checkpointing. Defaults to False.
1263
+ """
1264
+ self._special_tokens = {
1265
+ "media_token": "<image>",
1266
+ "image_placeholder_token": "<image placeholder>",
1267
+ "end_of_trunk_token": "<|endofchunk|>",
1268
+ }
1269
+ lang_embedding_dim = lang_model.get_input_embeddings().weight.shape[1]
1270
+ super().__init__(
1271
+ vision_encoder=vision_encoder,
1272
+ vision_tokenizer=vision_tokenizer,
1273
+ lang_model=lang_model,
1274
+ initial_tokenizer_len=initial_tokenizer_len,
1275
+ gradient_checkpointing=gradient_checkpointing,
1276
+ decoder_layers_attr_name=decoder_layers_attr_name,
1277
+ pad_token_id=pad_token_id,
1278
+ )
1279
+ self.image_aspect_ratio = image_aspect_ratio
1280
+ self.anyres_patch_sampling = anyres_patch_sampling
1281
+
1282
+ def set_trainable(self):
1283
+ """
1284
+ Unfreeze everything except the vision_encoder
1285
+ """
1286
+ self.requires_grad_(True)
1287
+ self.vision_encoder.requires_grad_(False)
1288
+
1289
+ def _should_apply_weight_decay(self, parameter_name):
1290
+ """
1291
+ Kosmos applies 0.01 weight decay to everything
1292
+ """
1293
+ return True
1294
+
1295
+ def forward(
1296
+ self,
1297
+ vision_x: Optional[torch.Tensor],
1298
+ lang_x: torch.Tensor,
1299
+ attention_mask: Optional[torch.Tensor] = None,
1300
+ labels: Optional[torch.Tensor] = None,
1301
+ image_size: Optional[Tuple] = None,
1302
+ past_key_values: Optional[
1303
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
1304
+ ] = None,
1305
+ past_media_locations: Optional[torch.Tensor] = None,
1306
+ past_vision_tokens: Optional[torch.Tensor] = None,
1307
+ use_cache: Optional[bool] = False,
1308
+ **kwargs,
1309
+ ):
1310
+ """
1311
+ Args:
1312
+ vision_x: Vision input
1313
+ shape (B, T_img, F, C, H, W) with F=1
1314
+ only F = 1 is supported (single-frame videos)
1315
+ if T_img > the number of media tokens in the corresponding input_ids (lang_x),
1316
+ only the leading images, up to the number of media tokens in lang_x, are used
1317
+ lang_x: Language input ids, with media tokens denoting where
1318
+ visual media should be inserted.
1319
+ shape (B, T_txt)
1320
+ attention_mask: Attention mask. Defaults to None.
1321
+ labels: Labels. Defaults to None.
1322
+ shape (B, T_txt)
1323
+ past_key_values (Tuple[torch.Tensor]], optional): Past key value pairs for each of the T_txt previous tokens in the language model. Defaults to None.
1324
+ list of length = number of decoder layers in the LM
1325
+ exact implementation depends on LM, see Hugging Face docs
1326
+ past_media_locations (torch.Tensor, optional): boolean mask denoting which of the previous T_txt tokens were media tokens. Defaults to None.
1327
+ shape (B, T_txt)
1328
+ past_vision_tokens (torch.Tensor, optional): Previous vision tokens. Defaults to None.
1329
+ use_cache (Optional[bool], optional): Whether to use cache. Defaults to False.
1330
+ If True, includes key_values, media_locations, and vision_tokens in the output.
1331
+ """
1332
+ assert not (past_vision_tokens is None) ^ (
1333
+ past_media_locations is None
1334
+ ), "past_vision_tokens and past_media_locations must both be None or both be not None"
1335
+
1336
+ # convert pixels to vision tokens
1337
+ vision_attention_mask = None
1338
+ if vision_x is not None:
1339
+ if self.image_aspect_ratio == 'anyres':
1340
+ input_dict = dict(image=vision_x, image_size=image_size)
1341
+ vision_features, vision_attn_masks = self._encode_vision_x_anyres(input_dict, lang_x.device)
1342
+ else:
1343
+ vision_features = self._encode_vision_x(vision_x=vision_x)
1344
+ vision_attn_masks = None
1345
+ if self.anyres_patch_sampling:
1346
+ split_sizes = [feature.shape[0] for feature in vision_features]
1347
+ vision_features = torch.cat(vision_features, dim=0)
1348
+ vision_features = vision_features[:, None, None, :, :] # Expand dimensions.
1349
+ vision_attn_masks = torch.cat(vision_attn_masks, dim=0)
1350
+ # Prepare text embeds for instruction-aware image query sampling.
1351
+ # FIXME: for debugging purposes, truncate the text input to the vision tokenizer to at most 256 tokens.
1352
+ lang_x_truncated = lang_x[:, :256]
1353
+ text_embeds = self.lang_model.get_input_embeddings()(lang_x_truncated)
1354
+ # Repeat text_embeds to match the number of patches for each image patch group.
1355
+ if self.anyres_patch_sampling:
1356
+ repeated_text_embeds = []
1357
+ for i, np in enumerate(split_sizes):
1358
+ repeated_text_embeds.append(text_embeds[i].repeat(np, 1, 1))
1359
+ text_embeds = torch.cat(repeated_text_embeds, dim=0)
1360
+ vision_tokens = self.vision_tokenizer(vision_features, text_embeds)
1361
+
1362
+ # Post-processing: Split the batches into groups of patches and concatenate them together.
1363
+ if self.anyres_patch_sampling:
1364
+ vision_token_groups = torch.split(vision_tokens, split_sizes, dim=0)
1365
+ max_n_vis_token = max([vis.shape[0]*vis.shape[-2] for vis in vision_token_groups])
1366
+ # Padding.
1367
+ padded_vision_tokens = []
1368
+ padded_attn_masks = []
1369
+ for patch_vis_tokens in vision_token_groups:
1370
+ patch_vis_tokens = patch_vis_tokens.flatten(0, 2) # [Np, 1, v, d] -> [Np*v, d]
1371
+ n_vis_token = patch_vis_tokens.shape[0]
1372
+ patch_attn = torch.ones((max_n_vis_token), dtype=torch.long, device=patch_vis_tokens.device)
1373
+ if n_vis_token < max_n_vis_token:
1374
+ padded_vis_token = torch.zeros((max_n_vis_token, patch_vis_tokens.shape[-1]),
1375
+ dtype=patch_vis_tokens.dtype, device=patch_vis_tokens.device)
1376
+ padded_vis_token[:n_vis_token, :] = patch_vis_tokens
1377
+ patch_attn[n_vis_token:] = 0
1378
+ else:
1379
+ padded_vis_token = patch_vis_tokens
1380
+ padded_vision_tokens.append(padded_vis_token)
1381
+ padded_attn_masks.append(patch_attn)
1382
+ vision_tokens = torch.stack(padded_vision_tokens, dim=0)
1383
+ vision_attention_mask = torch.stack(padded_attn_masks, dim=0)
1384
+ vision_tokens = vision_tokens[:, None, :, :]
1385
+ else:
1386
+ vision_tokens = None
1387
+
1388
+ # fuse the vision and language tokens
1389
+ new_inputs = self._prepare_inputs_for_forward(
1390
+ vision_tokens=vision_tokens,
1391
+ lang_x=lang_x,
1392
+ attention_mask=attention_mask,
1393
+ vision_attention_mask=vision_attention_mask,
1394
+ labels=labels,
1395
+ past_key_values=past_key_values,
1396
+ past_media_locations=past_media_locations,
1397
+ padding_side="right",
1398
+ past_vision_tokens=past_vision_tokens,
1399
+ )
1400
+ output = self.lang_model(
1401
+ **new_inputs,
1402
+ use_cache=use_cache,
1403
+ past_key_values=past_key_values,
1404
+ **kwargs,
1405
+ )
1406
+
1407
+ # postprocessing may be needed, e.g. to remove extra tokens from logits that were inserted into the language stream
1408
+ # or to add the past_vision_tokens and past_media_locations to the output
1409
+ output = self._postprocess_outputs_from_forward(
1410
+ output=output,
1411
+ lang_x=lang_x,
1412
+ vision_tokens=vision_tokens,
1413
+ use_cache=use_cache,
1414
+ past_vision_tokens=past_vision_tokens,
1415
+ past_media_locations=past_media_locations,
1416
+ )
1417
+
1418
+ # postforward hooks
1419
+ self._post_forward_hook()
1420
+ return output
1421
+
1422
+ def generate(
1423
+ self,
1424
+ vision_x: torch.Tensor,
1425
+ lang_x: torch.Tensor,
1426
+ image_size: Optional[Tuple] = None,
1427
+ attention_mask: torch.Tensor = None,
1428
+ past_key_values: Optional[
1429
+ List[Union[torch.Tensor, Tuple[torch.Tensor]]]
1430
+ ] = None,
1431
+ past_media_locations: Optional[torch.Tensor] = None,
1432
+ past_vision_tokens: Optional[torch.Tensor] = None,
1433
+ **kwargs,
1434
+ ):
1435
+ """
1436
+ Generate text conditioned on vision and language inputs.
1437
+ Args:
1438
+ vision_x (torch.Tensor): Vision input
1439
+ shape (B, T_img, F, C, H, W)
1440
+ see documentation for forward
1441
+ lang_x (torch.Tensor): Language input
1442
+ shape (B, T_txt)
1443
+ attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
1444
+ **kwargs: see generate documentation in Hugging Face CausalLM models.
1445
+ Returns:
1446
+ torch.Tensor: lang_x with generated tokens appended to it
1447
+ """
1448
+ num_beams = kwargs.pop("num_beams", 1)
1449
+
1450
+ # convert pixels to vision tokens
1451
+ vision_attention_mask = None
1452
+ if vision_x is not None:
1453
+ if self.image_aspect_ratio == 'anyres':
1454
+ input_dict = dict(image=vision_x, image_size=image_size)
1455
+ vision_features, vision_attn_masks = self._encode_vision_x_anyres(input_dict, lang_x.device)
1456
+ else:
1457
+ vision_features = self._encode_vision_x(vision_x=vision_x)
1458
+ vision_attn_masks = None
1459
+ if self.anyres_patch_sampling:
1460
+ split_sizes = [feature.shape[0] for feature in vision_features]
1461
+ vision_features = torch.cat(vision_features, dim=0)
1462
+ vision_features = vision_features[:, None, None, :, :] # Expand dimensions.
1463
+ vision_attn_masks = torch.cat(vision_attn_masks, dim=0)
1464
+ # Prepare text embeds for instruction-aware image query sampling.
1465
+ lang_x_truncated = lang_x[:, :256]
1466
+ text_embeds = self.lang_model.get_input_embeddings()(lang_x_truncated) # FIXME: check function calling.
1467
+ # Repeat text_embeds to match the number of patches for each image patch group.
1468
+ if self.anyres_patch_sampling:
1469
+ repeated_text_embeds = []
1470
+ for i, np in enumerate(split_sizes):
1471
+ repeated_text_embeds.append(text_embeds[i].repeat(np, 1, 1))
1472
+ text_embeds = torch.cat(repeated_text_embeds, dim=0)
1473
+ vision_tokens = self.vision_tokenizer(vision_features, text_embeds)
1474
+
1475
+ # Post-processing: Split the batches into groups of patches and concatenate them together.
1476
+ if self.anyres_patch_sampling:
1477
+ vision_token_groups = torch.split(vision_tokens, split_sizes, dim=0)
1478
+ max_n_vis_token = max([vis.shape[0]*vis.shape[-2] for vis in vision_token_groups])
1479
+ # Padding.
1480
+ padded_vision_tokens = []
1481
+ padded_attn_masks = []
1482
+ for patch_vis_tokens in vision_token_groups:
1483
+ patch_vis_tokens = patch_vis_tokens.flatten(0, 2) # [Np, 1, v, d] -> [Np*v, d]
1484
+ n_vis_token = patch_vis_tokens.shape[0]
1485
+ patch_attn = torch.ones((max_n_vis_token), dtype=torch.long, device=patch_vis_tokens.device)
1486
+ if n_vis_token < max_n_vis_token:
1487
+ padded_vis_token = torch.zeros((max_n_vis_token, patch_vis_tokens.shape[-1]),
1488
+ dtype=patch_vis_tokens.dtype, device=patch_vis_tokens.device)
1489
+ padded_vis_token[:n_vis_token, :] = patch_vis_tokens
1490
+ patch_attn[n_vis_token:] = 0
1491
+ else:
1492
+ padded_vis_token = patch_vis_tokens
1493
+ padded_vision_tokens.append(padded_vis_token)
1494
+ padded_attn_masks.append(patch_attn)
1495
+ vision_tokens = torch.stack(padded_vision_tokens, dim=0)
1496
+ vision_attention_mask = torch.stack(padded_attn_masks, dim=0)
1497
+ vision_tokens = vision_tokens[:, None, :, :]
1498
+ else:
1499
+ vision_tokens = None
1500
+
1501
+ # fuse the vision and language tokens
1502
+ # for xattn, vision_x and media_location are repeat_interleaved s.t.
1503
+ # the total batch size is B * num_beams
1504
+ new_inputs = self._prepare_inputs_for_forward(
1505
+ vision_tokens=vision_tokens,
1506
+ lang_x=lang_x,
1507
+ attention_mask=attention_mask,
1508
+ vision_attention_mask=vision_attention_mask,
1509
+ past_key_values=past_key_values,
1510
+ past_media_locations=past_media_locations,
1511
+ past_vision_tokens=past_vision_tokens,
1512
+ padding_side="left",
1513
+ num_beams=num_beams,
1514
+ )
1515
+ if transformers.__version__ == '4.41.0.dev0':
1516
+ output = self.lang_model.generate(
1517
+ **new_inputs,
1518
+ num_beams=num_beams,
1519
+ use_cache=True,
1520
+ **kwargs,
1521
+ )
1522
+ else:
1523
+ output = self.lang_model.generate(
1524
+ **new_inputs,
1525
+ past_key_values=past_key_values,
1526
+ num_beams=num_beams,
1527
+ use_cache=True,
1528
+ **kwargs,
1529
+ )
1530
+ self._post_forward_hook()
1531
+ return output
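Finally, an end-to-end sketch of calling generate, assuming `model`, `tokenizer`, and an image-preprocessing function `preprocess` have been built by the accompanying factory/processing code; the prompt template, preprocessing, and decoding details below are placeholders rather than the released recipe:

import torch

prompt = "<image> Describe this image in detail."
inputs = tokenizer(prompt, return_tensors="pt")
vision_x = preprocess(image)              # assumed to return (1, T_img, 1, C, H, W), or a patch list for 'anyres'
with torch.no_grad():
    generated = model.generate(
        vision_x=vision_x,
        lang_x=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image_size=[image.size],          # only consulted when image_aspect_ratio == 'anyres'
        max_new_tokens=128,
        num_beams=1,
    )
print(tokenizer.decode(generated[0], skip_special_tokens=True))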