Ligeng-Zhu committed on Dec 5, 2024

Commit

d78503a

verified ·

1 Parent(s): 281b01d

Upload files with `vila-upload`.

Upload config.json
Upload configuration_llava.py
Upload trainer_state.json
Upload constants.py
Upload utils.py
Upload builder.py
Upload llava_llama.py
Upload llava_arch.py
Upload README.md
Upload llm/tokenizer_config.json
Upload llm/config.json
Upload llm/generation_config.json
Upload llm/special_tokens_map.json
Upload llm/model.safetensors.index.json
Upload llm/model-00002-of-00002.safetensors
Upload llm/model-00001-of-00002.safetensors
Upload llm/tokenizer.model
Upload mm_projector/config.json
Upload mm_projector/model.safetensors
Upload vision_tower/config.json
Upload vision_tower/model.safetensors
Upload vision_tower/preprocessor_config.json

Files changed (22) hide show

README.md +56 -0
builder.py +293 -0
config.json +257 -0
configuration_llava.py +53 -0
constants.py +31 -0
llava_arch.py +1552 -0
llava_llama.py +1193 -0
llm/config.json +32 -0
llm/generation_config.json +7 -0
llm/model-00001-of-00002.safetensors +3 -0
llm/model-00002-of-00002.safetensors +3 -0
llm/model.safetensors.index.json +298 -0
llm/special_tokens_map.json +24 -0
llm/tokenizer.model +3 -0
llm/tokenizer_config.json +43 -0
mm_projector/config.json +10 -0
mm_projector/model.safetensors +3 -0
trainer_state.json +0 -0
utils.py +96 -0
vision_tower/config.json +19 -0
vision_tower/model.safetensors +3 -0
vision_tower/preprocessor_config.json +24 -0

README.md ADDED Viewed

	@@ -0,0 +1,56 @@

+---
+license: cc-by-nc-4.0
+library_name: transformers
+pipeline_tag: text-generation
+tags:
+- VILA
+- VLM
+---
+# VILA Model Card
+## Model details
+**Model type:**
+VILA is a visual language model (VLM) pretrained with interleaved image-text data at scale, enabling multi-image VLM. VILA is deployable on the edge, including Jetson Orin and laptops, via AWQ 4-bit quantization through the TinyChat framework. We find: (1) image-text pairs are not enough, interleaved image-text is essential; (2) unfreezing the LLM during interleaved image-text pre-training enables in-context learning; (3) re-blending text-only instruction data is crucial to boost both VLM and text-only performance. VILA unveils appealing capabilities, including: multi-image reasoning, in-context learning, visual chain-of-thought, and better world knowledge.
+**Model date:**
+VILA1.5-3b was trained in May 2024.
+**Paper or resources for more information:**
+https://github.com/Efficient-Large-Model/VILA
+```
+@misc{lin2023vila,
+      title={VILA: On Pre-training for Visual Language Models},
+      author={Ji Lin and Hongxu Yin and Wei Ping and Yao Lu and Pavlo Molchanov and Andrew Tao and Huizi Mao and Jan Kautz and Mohammad Shoeybi and Song Han},
+      year={2023},
+      eprint={2312.07533},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
+## License
+- The code is released under the Apache 2.0 license as found in the [LICENSE](./LICENSE) file.
+- The pretrained weights are released under the [CC-BY-NC-SA-4.0 license](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en).
+- The service is a research preview intended for non-commercial use only, and is subject to the following licenses and terms:
+    - [Model License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA
+    - [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI
+    - [Dataset Licenses](https://github.com/Efficient-Large-Model/VILA/blob/main/data_prepare/LICENSE) for each one used during training.
+**Where to send questions or comments about the model:**
+https://github.com/Efficient-Large-Model/VILA/issues
+## Intended use
+**Primary intended uses:**
+The primary use of VILA is research on large multimodal models and chatbots.
+**Primary intended users:**
+The primary intended users of the model are researchers and hobbyists in computer vision, natural language processing, machine learning, and artificial intelligence.
+## Training dataset
+See [Dataset Preparation](https://github.com/Efficient-Large-Model/VILA/blob/main/data_prepare/README.md) for more details.
+## Evaluation dataset
+A collection of 12 benchmarks, including 5 academic VQA benchmarks and 7 recent benchmarks specifically proposed for instruction-following LMMs.

builder.py ADDED Viewed

	@@ -0,0 +1,293 @@

+# This file is modified from https://github.com/haotian-liu/LLaVA/
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import os
+import shutil
+import warnings
+import torch
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, PretrainedConfig)
+from .llava_llama import LlavaLlamaModel
+# from llava.model import *
+# from llava.model.utils import is_mm_model
+# The constants below are defined locally because the `llava.model` imports
+# above are commented out; constants.py in this upload declares the same names
+# with the same values.
+# Heartbeat settings; not referenced in this file. Units are presumably
+# seconds -- TODO confirm against the serving code.
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+LOGDIR = "."
+# Model Constants
+# IGNORE_INDEX / IMAGE_TOKEN_INDEX are not referenced in this file.
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+# The patch / start / end tokens are registered on the tokenizer by
+# load_pretrained_model when the model config enables them.
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"
+def is_mm_model(model_path):
+    """
+    Check if the model at the given path is a visual language model.
+    Args:
+        model_path (str): The path to the model.
+    Returns:
+        bool: True if the model is an MM model, False otherwise.
+    """
+    config = AutoConfig.from_pretrained(model_path)
+    architectures = config.architectures
+    for architecture in architectures:
+        if "llava" in architecture.lower():
+            return True
+    return False
+def load_pretrained_model(
+    model_path,
+    model_name,
+    model_base=None,
+    load_8bit=False,
+    load_4bit=False,
+    device_map="auto",
+    device="cuda",
+    **kwargs,
+):
+    kwargs = {"device_map": device_map, **kwargs}
+    if device != "cuda":
+        kwargs["device_map"] = {"": device}
+    if load_8bit:
+        kwargs["load_in_8bit"] = True
+    elif load_4bit:
+        kwargs["load_in_4bit"] = True
+        kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+        )
+    else:
+        kwargs["torch_dtype"] = torch.float16
+        # kwargs["torch_dtype"] = torch.bfloat16
+    if is_mm_model(model_path):
+        # Load LLaVA model
+        ## TODO @yunhao: mind fixing lora
+        if "lora" in model_name.lower() and model_base is None:
+            warnings.warn(
+                "There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged."
+            )
+        if (
+            "lora" in model_name.lower() or "dora" in model_name.lower()
+        ) and model_base is not None:
+            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
+            print(lora_cfg_pretrained)
+            print("Loading LLaVA from base model...")
+            config = AutoConfig.from_pretrained(model_base)
+            prepare_config_for_eval(config, kwargs)
+            model = LlavaLlamaModel.from_pretrained(
+                model_base, low_cpu_mem_usage=True, config=config, **kwargs
+            )
+            tokenizer = model.tokenizer
+            token_num, tokem_dim = (
+                model.llm.lm_head.out_features,
+                model.llm.lm_head.in_features,
+            )
+            if model.llm.lm_head.weight.shape[0] != token_num:
+                model.llm.lm_head.weight = torch.nn.Parameter(
+                    torch.empty(
+                        token_num, tokem_dim, device=model.device, dtype=model.dtype
+                    )
+                )
+                model.llm.embed_tokens.weight = torch.nn.Parameter(
+                    torch.empty(
+                        token_num, tokem_dim, device=model.device, dtype=model.dtype
+                    )
+                )
+            print("Loading additional LLaVA weights...")
+            if os.path.exists(os.path.join(model_path, "non_lora_trainables.bin")):
+                non_lora_trainables = torch.load(
+                    os.path.join(model_path, "non_lora_trainables.bin"),
+                    map_location="cpu",
+                )
+            else:
+                # this is probably from HF Hub
+                from huggingface_hub import hf_hub_download
+                def load_from_hf(repo_id, filename, subfolder=None):
+                    cache_file = hf_hub_download(
+                        repo_id=repo_id, filename=filename, subfolder=subfolder
+                    )
+                    return torch.load(cache_file, map_location="cpu")
+                non_lora_trainables = load_from_hf(
+                    model_path, "non_lora_trainables.bin"
+                )
+            non_lora_trainables = {
+                (k[11:] if k.startswith("base_model.") else k): v
+                for k, v in non_lora_trainables.items()
+            }
+            if any(k.startswith("model.model.") for k in non_lora_trainables):
+                non_lora_trainables = {
+                    (k[6:] if k.startswith("model.") else k): v
+                    for k, v in non_lora_trainables.items()
+                }
+            model.load_state_dict(non_lora_trainables, strict=False)
+            from peft import PeftModel
+            print("Loading LoRA weights...")
+            model = PeftModel.from_pretrained(model, model_path)
+            print("Merging LoRA weights...")
+            model = model.merge_and_unload()
+            print("Model is loaded...")
+        ## TODO @yunhao: mind fixing this
+        elif model_base is not None:
+            # this may be mm projector only
+            print("Loading LLaVA from base model...")
+            cfg_pretrained = AutoConfig.from_pretrained(
+                model_path, trust_remote_code=True
+            )
+            mm_config_wrapper(config, kwargs)
+            if "mpt" in model_name.lower():
+                if not os.path.isfile(os.path.join(model_path, "configuration_mpt.py")):
+                    shutil.copyfile(
+                        os.path.join(model_base, "configuration_mpt.py"),
+                        os.path.join(model_path, "configuration_mpt.py"),
+                    )
+                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
+                model = LlavaMPTForCausalLM.from_pretrained(
+                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
+                )
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_base, use_fast=False, legacy=False
+                )
+                model = LlavaLlamaForCausalLM.from_pretrained(
+                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
+                )
+        else:
+            config = AutoConfig.from_pretrained(model_path)
+            config.resume_path = model_path
+            prepare_config_for_eval(config, kwargs)
+            if "mpt" in model_name.lower():
+                model = LlavaMPTForCausalLM.from_pretrained(
+                    model_path, config=config, low_cpu_mem_usage=True, **kwargs
+                )
+            elif "mistral" in model_name.lower() or "mixtral" in model_name.lower():
+                model = LlavaMistralForCausalLM.from_pretrained(
+                    model_path, config=config, low_cpu_mem_usage=True, **kwargs
+                )
+            elif "gemma" in model_name.lower():
+                model = LlavaGemmaForCausalLM.from_pretrained(
+                    model_path, config=config, low_cpu_mem_usage=True, **kwargs
+                )
+            else:
+                # kentang-mit@: llama-2 model
+                # config._attn_implementation = "flash_attention_2"
+                model = LlavaLlamaModel(config=config, low_cpu_mem_usage=True, **kwargs)
+            tokenizer = model.tokenizer
+    else:
+        # Load language model
+        if model_base is not None:
+            # PEFT model
+            from peft import PeftModel
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_base, low_cpu_mem_usage=True, **kwargs
+            )
+            print(f"Loading LoRA weights from {model_path}")
+            model = PeftModel.from_pretrained(model, model_path)
+            print(f"Merging weights")
+            model = model.merge_and_unload()
+            print("Convert to FP16...")
+            model.to(torch.float16)
+        else:
+            if "mpt" in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs
+                )
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_path, use_fast=False, legacy=False
+                )
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_path, low_cpu_mem_usage=True, **kwargs
+                )
+    model.eval()
+    image_processor = None
+    if is_mm_model(model_path):
+        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+        if mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+        if mm_use_im_start_end:
+            tokenizer.add_tokens(
+                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
+            )
+        model.resize_token_embeddings(len(tokenizer))
+        vision_tower = model.get_vision_tower()
+        vision_tower.to(device=device, dtype=torch.float16)
+        # vision_tower.to(device=device, dtype=torch.bfloat16)
+        mm_projector = model.get_mm_projector()
+        mm_projector.to(device=device, dtype=torch.float16)
+        # mm_projector.to(device=device, dtype=torch.bfloat16)
+        image_processor = vision_tower.image_processor
+    if hasattr(model.llm.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    else:
+        context_len = 2048
+    return tokenizer, model, image_processor, context_len
+def parse_model_name_or_path(config: PretrainedConfig, model_name="llm", suffix="_cfg"):
+    target_model = f"{model_name}{suffix}"
+    target_cfg = getattr(config, target_model, None)
+    if isinstance(target_cfg, str):
+        return target_cfg
+    elif isinstance(target_cfg, dict):
+        return target_cfg["architectures"][0]
+    else:
+        raise ValueError(f"Invalid {target_model} configuration!")
+def prepare_config_for_eval(config: PretrainedConfig, kwargs: dict):
+    try:
+        # compatible with deprecated config convention
+        if getattr(config, "vision_tower_cfg", None) is None:
+            config.vision_tower_cfg = config.mm_vision_tower
+    except AttributeError:
+        raise ValueError(
+            f"Invalid configuration! Cannot find vision_tower in config:\n{config}"
+        )
+    config.model_dtype = kwargs.pop("torch_dtype").__str__()
+    # siglip does not support device_map = "auto"
+    vision_tower_name = parse_model_name_or_path(config, "vision_tower")
+    if "siglip" in vision_tower_name.lower():
+        kwargs["device_map"] = "cuda"

config.json ADDED Viewed

	@@ -0,0 +1,257 @@

+{
+  "_name_or_path": "./vlm",
+  "architectures": [
+    "LlavaLlamaModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "llava_llama.LlavaLlamaConfig",
+    "AutoModel": "llava_llama.LlavaLlamaModel"
+  },
+  "drop_path_rate": 0.0,
+  "hidden_size": 2560,
+  "image_aspect_ratio": "resize",
+  "interpolate_mode": "linear",
+  "llm_cfg": {
+    "_name_or_path": "./llm",
+    "add_cross_attention": false,
+    "architectures": [
+      "LlamaForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 1,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 6912,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 4096,
+    "min_length": 0,
+    "model_max_length": 4096,
+    "model_type": "llama",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 20,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 20,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "prefix": null,
+    "pretraining_tp": 1,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "rope_theta": 10000.0,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "tokenizer_model_max_length": 4096,
+    "tokenizer_padding_side": "right",
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 32000
+  },
+  "mm_hidden_size": 1152,
+  "mm_projector_cfg": {
+    "_name_or_path": "./mm_projector",
+    "add_cross_attention": false,
+    "architectures": [
+      "MultimodalProjector"
+    ],
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "mm_projector_type": "mlp_downsample",
+    "model_type": "v2l_projector",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "mm_projector_lr": null,
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "cls_patch",
+  "mm_vision_select_layer": -2,
+  "model_dtype": "torch.bfloat16",
+  "model_type": "llava_llama",
+  "num_video_frames": 8,
+  "resume_path": "./vlm",
+  "s2": false,
+  "s2_max_split_size": 336,
+  "s2_scales": "336,672,1008",
+  "transformers_version": "4.36.2",
+  "tune_language_model": true,
+  "tune_mm_projector": true,
+  "tune_vision_tower": true,
+  "vision_resolution": -1,
+  "vision_tower_cfg": {
+    "_name_or_path": "./vision_tower",
+    "add_cross_attention": false,
+    "architectures": [
+      "SiglipVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 384,
+    "intermediate_size": 4304,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "siglip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  }
+}

configuration_llava.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from transformers import PretrainedConfig
+class LlavaConfig(PretrainedConfig):
+    model_type = "llava"
+    def __init__(
+        self,
+        llm_cfg=None,
+        vision_tower_cfg=None,
+        mm_projector_cfg=None,
+        architectures=None,
+        resume_path=None,
+        hidden_size=None,
+        mm_hidden_size=None,
+        image_aspect_ratio=None,
+        num_video_frames=None,
+        fps=None,
+        mm_vision_select_layer=None,
+        mm_vision_select_feature=None,
+        mm_use_im_start_end=False,
+        mm_use_im_patch_token=True,
+        mm_projector_lr=None,
+        vision_resolution=None,
+        interpolate_mode=None,
+        s2=None,
+        s2_scales=None,
+        s2_max_split_size=None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.architectures = architectures
+        self.llm_cfg = llm_cfg
+        self.vision_tower_cfg = vision_tower_cfg
+        self.mm_projector_cfg = mm_projector_cfg
+        self.resume_path = resume_path
+        self.hidden_size = hidden_size
+        self.mm_hidden_size = mm_hidden_size
+        self.image_aspect_ratio = image_aspect_ratio
+        self.num_video_frames = num_video_frames
+        self.fps = fps
+        self.mm_vision_select_layer = mm_vision_select_layer
+        self.mm_vision_select_feature = mm_vision_select_feature
+        self.mm_use_im_start_end = mm_use_im_start_end
+        self.mm_use_im_start_end = mm_use_im_start_end
+        self.mm_use_im_patch_token = mm_use_im_patch_token
+        self.mm_projector_lr = mm_projector_lr
+        self.vision_resolution = vision_resolution
+        self.interpolate_mode = interpolate_mode
+        self.s2 = s2
+        self.s2_scales = s2_scales
+        self.s2_max_split_size = s2_max_split_size

constants.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# This file is modified from https://github.com/haotian-liu/LLaVA/
+# Heartbeat settings. Units are presumably seconds -- TODO confirm against the
+# code that consumes them (not part of this file).
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+LOGDIR = "."
+# Model Constants
+# NOTE(review): the two indices are negative, presumably so they cannot
+# collide with real token ids -- confirm with the code that consumes them.
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+# String markers for image content.
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"

llava_arch.py ADDED Viewed

	@@ -0,0 +1,1552 @@

+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import logging
+import math
+import os
+import os.path as osp
+import sys
+import warnings
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import Tuple
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from huggingface_hub import file_exists, repo_exists, snapshot_download
+from huggingface_hub.utils import HFValidationError, validate_repo_id
+from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM,
+                          AutoTokenizer, BitsAndBytesConfig, PretrainedConfig,
+                          PreTrainedModel, PreTrainedTokenizer)
+from transformers.modeling_utils import ContextManagers, no_init_weights
+from .configuration_llava import LlavaConfig
+# from .constants import DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN
+# from .model.language_model.builder import build_llm_and_tokenizer
+# from .model.multimodal_encoder.builder import build_vision_tower
+# from .model.multimodal_projector.builder import build_mm_projector
+# Image marker strings, defined inline because the `.constants` import above
+# is commented out.
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"
+# This file is modified from https://github.com/haotian-liu/LLaVA/
+import torch
+# from llava.model.multimodal_encoder.vision_encoder import (VisionTower, VisionTowerS2)
+from transformers import CLIPImageProcessor, CLIPVisionModel, PretrainedConfig
+class VisionTower(nn.Module):
+    def __init__(self, vision_tower, args, delay_load=False):
+        super().__init__()
+        self.is_loaded = False
+        self.vision_tower_name = vision_tower
+        self.select_layer = getattr(args, "mm_vision_select_layer", -2)
+        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
+        self.cfg_only = None
+    def feature_select(self, image_forward_outs):
+        image_features = image_forward_outs.hidden_states[self.select_layer]
+        if self.select_feature == "patch":
+            image_features = image_features[:, 1:]
+        elif self.select_feature == "cls_patch":
+            image_features = image_features
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+        return image_features
+    def _maybe_resize_pos_embeds(
+        self,
+        model: PreTrainedModel,
+        image_processor,
+        resolution: int = -1,
+        interpolate_mode: str = "linear",
+    ):
+        if resolution in [model.config.image_size, -1]:
+            return
+        print(
+            f"Resizing vision model's position embeddings to support higher vision resolution: from {model.config.image_size} to {resolution} ..."
+        )
+        embeddings = model.vision_model.embeddings
+        patch_size = embeddings.patch_size
+        num_new_tokens = int((resolution // patch_size) ** 2)
+        old_embeddings = embeddings.position_embedding
+        match interpolate_mode:
+            case "linear":
+                ## Step 1: Calculate the corresponding patch ID (pid) in the current resolution (M patches) based on the target resolution (N patches). Formula: pid = pid / N * M
+                ## Step 2:  Obtain new embeddings by interpolating between the embeddings of the two nearest calculated patch IDs. Formula: new_embeds = (pid - floor(pid)) * embeds[ceil(pid)] + (ceil(pid) - pid) * embeds[floor(pid)]
+                import torch
+                import torch.nn as nn
+                if is_deepspeed_zero3_enabled():
+                    import deepspeed
+                    with deepspeed.zero.GatheredParameters(
+                        [old_embeddings.weight], modifier_rank=None
+                    ):
+                        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+                else:
+                    old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+                new_embeddings = nn.Embedding(
+                    num_new_tokens,
+                    old_embedding_dim,
+                    dtype=old_embeddings.weight.dtype,
+                    device=old_embeddings.weight.device,
+                )
+                mapped_indices = (
+                    torch.arange(num_new_tokens).to(old_embeddings.weight.device)
+                    / (num_new_tokens - 1)
+                    * (old_num_tokens - 1)
+                )
+                floor_indices = torch.clamp(
+                    mapped_indices.floor().long(), min=0, max=old_num_tokens - 1
+                )
+                ceil_indices = torch.clamp(
+                    mapped_indices.ceil().long(), min=0, max=old_num_tokens - 1
+                )
+                if is_deepspeed_zero3_enabled():
+                    params = [old_embeddings.weight, new_embeddings.weight]
+                    with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+                        interpolated_embeds = (mapped_indices - floor_indices)[
+                            :, None
+                        ] * old_embeddings.weight.data[ceil_indices, :] + (
+                            ceil_indices - mapped_indices
+                        )[
+                            :, None
+                        ] * old_embeddings.weight.data[
+                            floor_indices, :
+                        ]
+                else:
+                    interpolated_embeds = (mapped_indices - floor_indices)[
+                        :, None
+                    ] * old_embeddings.weight.data[ceil_indices, :] + (
+                        ceil_indices - mapped_indices
+                    )[
+                        :, None
+                    ] * old_embeddings.weight.data[
+                        floor_indices, :
+                    ]
+                new_embeddings.weight.data = interpolated_embeds
+            case _:
+                raise NotImplementedError
+        if hasattr(old_embeddings, "_hf_hook"):
+            hook = old_embeddings._hf_hook
+            add_hook_to_module(new_embeddings, hook)
+        new_embeddings.requires_grad_(old_embeddings.weight.requires_grad)
+        ## update vision encoder's configurations
+        model.config.image_size = resolution
+        if hasattr(image_processor, "crop_size"):
+            # CLIP vision tower
+            image_processor.crop_size = resolution
+        else:
+            # SIGLIP vision tower
+            assert hasattr(image_processor, "size")
+            image_processor.size = {"height": resolution, "width": resolution}
+        ## TODO define a '_reinitialize' method for VisionTower
+        embeddings.position_embedding = new_embeddings
+        embeddings.image_size = resolution
+        embeddings.num_patches = embeddings.num_positions = num_new_tokens
+        embeddings.position_ids = (
+            torch.arange(embeddings.num_positions)
+            .expand((1, -1))
+            .to(old_embeddings.weight.device)
+        )
+    def forward(self, images):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                image_forward_out = self.vision_tower(
+                    image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
+                    output_hidden_states=True,
+                )
+                image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                image_features.append(image_feature)
+        else:
+            image_forward_outs = self.vision_tower(
+                images.to(device=self.device, dtype=self.dtype),
+                output_hidden_states=True,
+            )
+            image_features = self.feature_select(image_forward_outs).to(images.dtype)
+        return image_features
+    @property
+    def dummy_feature(self):
+        """Return an all-zero tensor of shape ``(1, hidden_size)`` on this tower's device and dtype."""
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+    @property
+    def dtype(self):
+        """Return the dtype reported by the wrapped ``vision_tower``."""
+        return self.vision_tower.dtype
+    @property
+    def device(self):
+        """Return the device reported by the wrapped ``vision_tower``."""
+        return self.vision_tower.device
+    @property
+    def config(self):
+        """Return the tower's config once loaded, otherwise the stored ``cfg_only`` fallback."""
+        if self.is_loaded:
+            return self.vision_tower.config
+        else:
+            return self.cfg_only
+    @property
+    def hidden_size(self):
+        """Return ``hidden_size`` as stored on the active config."""
+        return self.config.hidden_size
+    @property
+    def num_patches(self):
+        """Return ``(image_size // patch_size) ** 2`` computed from the active config."""
+        return (self.config.image_size // self.config.patch_size) ** 2
+class VisionTowerS2(VisionTower):
+    def __init__(self, vision_tower, args, delay_load=False):
+        super().__init__(vision_tower, args, delay_load)
+        self.scales = list(map(int, args.s2_scales.split(",")))
+        self.scales.sort()
+        self.max_split_size = args.s2_max_split_size
+    @torch.no_grad()
+    def forward_feature(self, images):
+        image_forward_outs = self.vision_tower(
+            images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
+        )
+        image_features = self.feature_select(image_forward_outs).to(images.dtype)
+        return image_features
+    @torch.no_grad()
+    def forward(self, images):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                image_feature = multiscale_forward(
+                    self.forward_feature,
+                    image.unsqueeze(0),
+                    img_sizes=self.scales,
+                    max_split_size=self.max_split_size,
+                )
+                image_features.append(image_feature)
+        else:
+            image_features = multiscale_forward(
+                self.forward_feature,
+                images,
+                img_sizes=self.scales,
+                max_split_size=self.max_split_size,
+            )
+        return image_features
+    @property
+    def hidden_size(self):
+        return self.config.hidden_size * len(self.scales)
+class CLIPVisionTower(VisionTower):
+    def __init__(self, model_name_or_path: str, config: PretrainedConfig):
+        super().__init__(model_name_or_path, config)
+        self.image_processor = CLIPImageProcessor.from_pretrained(model_name_or_path)
+        self.vision_tower = CLIPVisionModel.from_pretrained(
+            model_name_or_path, torch_dtype=eval(config.model_dtype)
+        )
+        self.is_loaded = True
+class CLIPVisionTowerS2(VisionTowerS2):
+    def __init__(self, model_name_or_path: str, config: PretrainedConfig):
+        super().__init__(model_name_or_path, config)
+        self.image_processor = CLIPImageProcessor.from_pretrained(model_name_or_path)
+        self.vision_tower = CLIPVisionModel.from_pretrained(
+            model_name_or_path, torch_dtype=eval(config.model_dtype)
+        )
+        # Make sure it crops/resizes the image to the largest scale in self.scales to maintain high-res information
+        self.image_processor.size["shortest_edge"] = self.scales[-1]
+        self.image_processor.crop_size["height"] = self.image_processor.crop_size[
+            "width"
+        ] = self.scales[-1]
+        self.is_loaded = True
+class IdentityMap(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x, *args, **kwargs):
+        return x
+    @property
+    def config(self):
+        return {"mm_projector_type": "identity"}
+class SimpleResBlock(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(channels)
+        self.proj = nn.Sequential(
+            nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
+        )
+    def forward(self, x):
+        x = self.pre_norm(x)
+        return x + self.proj(x)
+class DownSampleBlock(nn.Module):
+    def forward(self, x):
+        vit_embeds = x
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.flat_square(vit_embeds)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        return vit_embeds
+    def flat_square(self, x):
+        n, w, h, c = x.size()
+        if w % 2 == 1:
+            x = torch.concat(
+                [x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1
+            ).contiguous()
+            n, w, h, c = x.size()
+        if h % 2 == 1:
+            x = torch.concat(
+                [x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2
+            ).contiguous()
+            n, w, h, c = x.size()
+        x = x.view(n, w, int(h / 2), int(c * 2))
+        x = x.permute(0, 2, 1, 3).contiguous()
+        x = x.view(n, int(h / 2), int(w / 2), int(c * 4))
+        return x
+class MultimodalProjectorConfig(PretrainedConfig):
+    """Configuration for ``MultimodalProjector``; stores only the projector type string."""
+    model_type = "v2l_projector"
+    def __init__(self, mm_projector_type: str = None, **kwargs):
+        # NOTE(review): ``**kwargs`` is accepted but not forwarded to the base
+        # class, so any extra config fields are dropped here. Confirm this is intended.
+        super().__init__()
+        self.mm_projector_type = mm_projector_type
+class MultimodalProjector(PreTrainedModel):
+    config_class = MultimodalProjectorConfig
+    def __init__(
+        self, mm_projector_cfg: MultimodalProjectorConfig, config: PretrainedConfig
+    ):
+        super().__init__(mm_projector_cfg)
+        mm_projector_type = mm_projector_cfg.mm_projector_type
+        if mm_projector_type == "identity":
+            self.layers = IdentityMap()
+        elif mm_projector_type == "linear":
+            self.layers = nn.Linear(config.mm_hidden_size, config.hidden_size)
+        elif mm_projector_type == "mlp_downsample":
+            self.layers = nn.Sequential(
+                DownSampleBlock(),
+                nn.LayerNorm(config.mm_hidden_size * 4),
+                nn.Linear(config.mm_hidden_size * 4, config.hidden_size),
+                nn.GELU(),
+                nn.Linear(config.hidden_size, config.hidden_size),
+            )
+        else:
+            mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", mm_projector_type)
+            if mlp_gelu_match:
+                mlp_depth = int(mlp_gelu_match.group(1))
+                modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+                for _ in range(1, mlp_depth):
+                    modules.append(nn.GELU())
+                    modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+                self.layers = nn.Sequential(*modules)
+            else:
+                raise ValueError(f"Unknown projector type: {mm_projector_type}")
+    def forward(self, x, *args, **kwargs):
+        return self.layers(x)
+def build_mm_projector(
+    model_type_or_path: str, config: PretrainedConfig
+) -> PreTrainedModel:
+    if model_type_or_path is None:
+        return None
+    ## load from pretrained model
+    if config.resume_path:
+        assert os.path.exists(
+            model_type_or_path
+        ), f"Resume mm projector path {model_type_or_path} does not exist!"
+        return MultimodalProjector.from_pretrained(
+            model_type_or_path, config, torch_dtype=eval(config.model_dtype)
+        )
+    ## build from scratch
+    else:
+        mm_projector_cfg = MultimodalProjectorConfig(model_type_or_path)
+        mm_projector = MultimodalProjector(mm_projector_cfg, config).to(
+            eval(config.model_dtype)
+        )
+        return mm_projector
+def build_vision_tower(
+    model_name_or_path: str, config: PretrainedConfig
+) -> PreTrainedModel:
+    ## skip vision tower instantiation
+    if model_name_or_path is None:
+        return None
+    vision_tower_arch = None
+    if config.resume_path and "radio" not in model_name_or_path:
+        assert os.path.exists(
+            model_name_or_path
+        ), f"Resume vision tower path {model_name_or_path} does not exist!"
+        vision_tower_cfg = AutoConfig.from_pretrained(
+            model_name_or_path, trust_remote_code=True
+        )
+        vision_tower_arch = vision_tower_cfg.architectures[0].lower()
+    vision_tower_name = (
+        vision_tower_arch if vision_tower_arch is not None else model_name_or_path
+    )
+    use_s2 = getattr(config, "s2", False)
+    if "intern" in vision_tower_name.lower():
+        if hasattr(config, "drop_path_rate"):
+            vision_tower = InternVisionTower(
+                model_name_or_path, config=config, drop_path_rate=config.drop_path_rate
+            )
+        else:
+            vision_tower = InternVisionTower(
+                model_name_or_path, config=config, drop_path_rate=0.0
+            )
+    elif "clip" in vision_tower_name:
+        if use_s2:
+            vision_tower = CLIPVisionTowerS2(model_name_or_path, config)
+        else:
+            vision_tower = CLIPVisionTower(model_name_or_path, config)
+    elif "siglip" in vision_tower_name:
+        if use_s2:
+            vision_tower = SiglipVisionTowerS2(model_name_or_path, config)
+        else:
+            vision_tower = SiglipVisionTower(model_name_or_path, config)
+    else:
+        raise ValueError(f"Unknown vision tower: {model_name_or_path}")
+    config.mm_hidden_size = (
+        vision_tower.config.hidden_size if not use_s2 else vision_tower.hidden_size
+    )
+    return vision_tower
+def has_tokenizer(repo_id_or_path: str) -> bool:
+    # Check if the tokenizer is in a local directory
+    if osp.exists(osp.join(repo_id_or_path, "tokenizer_config.json")):
+        return True
+    # Check if the tokenizer is in a Hugging Face Hub repo
+    try:
+        return repo_exists(repo_id_or_path) and file_exists(
+            repo_id_or_path, "tokenizer_config.json"
+        )
+    except HFValidationError:
+        return False
+def context_length_extension(config):
+    orig_ctx_len = getattr(config, "max_position_embeddings", None)
+    model_max_length = getattr(config, "model_max_length", None)
+    if orig_ctx_len and model_max_length > orig_ctx_len:
+        print(f"Scaling RoPE from {orig_ctx_len} to {model_max_length}")
+        scaling_factor = float(math.ceil(model_max_length / orig_ctx_len))
+        config.rope_scaling = {"type": "linear", "factor": scaling_factor}
+    return config
+def build_llm_and_tokenizer(
+    model_name_or_path: str,
+    config: PretrainedConfig,
+    attn_implementation=None,
+    model_max_length=None,
+    *args,
+    **kwargs,
+) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+    llm_cfg = AutoConfig.from_pretrained(model_name_or_path)
+    llm_cfg._attn_implementation = attn_implementation
+    llm_cfg.model_max_length = model_max_length
+    if model_max_length is not None:
+        context_length_extension(llm_cfg)
+    llm = AutoModelForCausalLM.from_pretrained(
+        model_name_or_path,
+        config=llm_cfg,
+        torch_dtype=eval(config.model_dtype),
+        *args,
+        **kwargs,
+    )
+    # Locate the tokenizer.
+    llm_path = model_name_or_path
+    if not has_tokenizer(llm_path):
+        llm_path = osp.join(llm_path, "llm")
+    if not has_tokenizer(llm_path):
+        raise ValueError(f"Cannot find tokenizer in {llm_path}.")
+    # TODO(ligeng): use LLM class to judge to better compability.
+    try:
+        llm_arch = getattr(llm_cfg, "architectures")[0].lower()
+    except BaseException:
+        warnings.warn(
+            f'Cannot find LLM architecture, please check the "config.json" under "{llm_path}".'
+        )
+    if "mpt" in llm_arch:
+        tokenizer = AutoTokenizer.from_pretrained(
+            llm_path,
+            model_max_length=llm_cfg.model_max_length,
+            padding_side="right",
+        )
+    elif "yi" in llm_path or (
+        getattr(llm_cfg, "num_hidden_layers", -1) == 60
+        and getattr(llm_cfg, "num_attention_heads", -1) == 56
+    ):
+        tokenizer = AutoTokenizer.from_pretrained(
+            llm_path,
+            model_max_length=llm_cfg.model_max_length,
+            padding_side="right",
+            use_fast=False,
+        )
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            llm_path,
+            model_max_length=llm_cfg.model_max_length,
+            padding_side="right",
+            use_fast=False,
+            legacy=False,
+        )
+    # TODO(ligeng): is this necessary for llava?
+    config.hidden_size = llm.config.hidden_size
+    return llm, tokenizer
+def get_model_config(config):
+    default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
+    if hasattr(config, "_name_or_path") and len(config._name_or_path) >= 2:
+        root_path = config._name_or_path
+    else:
+        root_path = config.resume_path
+    # download from huggingface
+    if root_path is not None and not osp.exists(root_path):
+        try:
+            valid_hf_repo = repo_exists(root_path)
+        except HFValidationError as e:
+            valid_hf_repo = False
+        if valid_hf_repo:
+            root_path = snapshot_download(root_path)
+    return_list = []
+    for key in default_keys:
+        cfg = getattr(config, key, None)
+        if isinstance(cfg, dict):
+            try:
+                return_list.append(os.path.join(root_path, key[:-4]))
+            except:
+                raise ValueError(f"Cannot find resume path in config for {key}!")
+        elif isinstance(cfg, PretrainedConfig):
+            return_list.append(os.path.join(root_path, key[:-4]))
+        elif isinstance(cfg, str):
+            return_list.append(cfg)
+    return return_list
+def is_mm_model(model_path):
+    """
+    Check if the model at the given path is a visual language model.
+    Args:
+        model_path (str): The path to the model.
+    Returns:
+        bool: True if the model is an MM model, False otherwise.
+    """
+    config = AutoConfig.from_pretrained(model_path)
+    architectures = config.architectures
+    for architecture in architectures:
+        if "llava" in architecture.lower():
+            return True
+    return False
+def auto_upgrade(config):
+    cfg = AutoConfig.from_pretrained(config)
+    if "llava" in config and "llava" not in cfg.model_type:
+        assert cfg.model_type == "llama"
+        print(
+            "You are using newer LLaVA code base, while the checkpoint of v0 is from older code base."
+        )
+        print(
+            "You must upgrade the checkpoint to the new code base (this can be done automatically)."
+        )
+        confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
+        if confirm.lower() in ["y", "yes"]:
+            print("Upgrading checkpoint...")
+            assert len(cfg.architectures) == 1
+            setattr(cfg.__class__, "model_type", "llava")
+            cfg.architectures[0] = "LlavaLlamaForCausalLM"
+            cfg.save_pretrained(config)
+            print("Checkpoint upgraded.")
+        else:
+            print("Checkpoint upgrade aborted.")
+            exit(1)
+def get_pg_manager():
+    """Return the process-group manager; this implementation always returns ``None``."""
+    return None
+# TODO decide whether we should use a metaclass
+class LlavaMetaModel(ABC):
+    def init_vlm(self, config: PreTrainedModel = None, *args, **kwargs):
+        # TODO(ligeng): figure out how from_config and from_pretrained works in HF implementation.
+        if (
+            hasattr(self, "llm")
+            or hasattr(self, "vision_tower")
+            or hasattr(self, "mm_projector")
+        ):
+            # already initialized, skipped
+            return
+        model_dtype = getattr(config, "model_dtype", "torch.float16")
+        if not hasattr(config, "model_dtype"):
+            warnings.warn(
+                "model_dtype not found in config, defaulting to torch.float16."
+            )
+            config.model_dtype = model_dtype
+        cfgs = get_model_config(config)
+        if len(cfgs) == 3:
+            llm_cfg, vision_tower_cfg, mm_projector_cfg = cfgs
+        else:
+            raise ValueError(
+                "`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config."
+            )
+        # print("Before init in Config")
+        # if hasattr(config, "deepspeed") and "mics" in config.deepspeed:
+        #     print("Using MiCS_Init")
+        #     import deepspeed
+        #     with deepspeed.zero.MiCS_Init():
+        #         self.llm, self.tokenizer = build_llm_and_tokenizer(llm_cfg, config, *args, **kwargs)
+        #         self.vision_tower = build_vision_tower(vision_tower_cfg, config)
+        #         self.mm_projector = build_mm_projector(mm_projector_cfg, config)
+        # else:
+        self.llm, self.tokenizer = build_llm_and_tokenizer(
+            llm_cfg, config, *args, **kwargs
+        )
+        self.vision_tower = build_vision_tower(vision_tower_cfg, config)
+        self.mm_projector = build_mm_projector(mm_projector_cfg, config)
+        self.post_config()
+        self.is_loaded = True
+        assert (
+            self.llm is not None
+            or self.vision_tower is not None
+            or self.mm_projector is not None
+        ), "At least one of the components must be instantiated."
+    @classmethod
+    def load_from_config(cls, model_path_or_config, *args, **kwargs):
+        pass
+    ## FIXME we will use this function to load model in the future
+    @classmethod
+    def load_pretrained(cls, model_path_or_config, *args, **kwargs):
+        kwargs.pop("config", None)
+        if isinstance(model_path_or_config, str):
+            config = AutoConfig.from_pretrained(model_path_or_config)
+        elif isinstance(model_path_or_config, LlavaConfig):
+            config = model_path_or_config
+        else:
+            raise NotImplementedError(
+                f"wrong type, {type(model_path_or_config)} \
+                                      {isinstance(model_path_or_config, LlavaConfig)}"
+            )
+        model_dtype = getattr(config, "model_dtype", "torch.float16")
+        if not hasattr(config, "model_dtype"):
+            warnings.warn(
+                "model_dtype not found in config, defaulting to torch.float16."
+            )
+            config.model_dtype = model_dtype
+        cfgs = get_model_config(config)
+        if len(cfgs) == 3:
+            llm_cfg, vision_tower_cfg, mm_projector_cfg = cfgs
+        else:
+            raise ValueError(
+                "`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config."
+            )
+        # print(llm_cfg, vision_tower_cfg, mm_projector_cfg); input("DEBUG load_pretrained")
+        init_context = [
+            no_init_weights(_enable=True),
+        ]
+        # print("Before Init Context")
+        # if hasattr(config, "deepspeed") and "mics" in config.deepspeed:
+        #     print("Using MiCS_Init")
+        #     import deepspeed
+        #     init_context.append(deepspeed.zero.MiCS_Init(config_dict_or_path=config.deepspeed))
+        with ContextManagers(init_context):
+            vlm = cls(config, *args, **kwargs)
+        # print(llm_cfg, vision_tower_cfg, mm_projector_cfg); input("DEBUG load_pretrained finish")
+        if (
+            hasattr(vlm, "llm")
+            or hasattr(vlm, "vision_tower")
+            or hasattr(vlm, "mm_projector")
+        ):
+            if vlm.is_loaded:
+                return vlm
+        vlm.llm, vlm.tokenizer = build_llm_and_tokenizer(
+            llm_cfg, config, *args, **kwargs
+        )
+        vlm.vision_tower = build_vision_tower(vision_tower_cfg, config)
+        vlm.mm_projector = build_mm_projector(mm_projector_cfg, config)
+        self.post_config()
+        self.is_loaded = True
+        # FIXME(ligeng, yunhao): llm should never be none here.
+        assert (
+            vlm.llm is not None
+            or vlm.vision_tower is not None
+            or vlm.mm_projector is not None
+        ), "At least one of the components must be instantiated."
+        return vlm
+    ## FIXME we will use this function to save the model in the future
+    def save_pretrained(self, output_dir, state_dict=None):
+        if state_dict is None:
+            # other wise fetch from deepspeed
+            # state_dict = accelerator.get_state_dict(is_deepspeed_enabled)
+            state_dict = self.state_dict()
+        if getattr(self, "tokenizer", None):
+            self.tokenizer.save_pretrained(osp.join(output_dir, "llm"))
+        if self.get_llm():
+            print(f"saving llm to {osp.join(output_dir, 'llm')}")
+            self.llm.config._name_or_path = osp.join(output_dir, "llm")
+            llm_state_dict = OrderedDict(
+                {k.split("llm.")[-1]: v for k, v in state_dict.items() if "llm" in k}
+            )
+            self.llm.save_pretrained(
+                os.path.join(output_dir, "llm"), state_dict=llm_state_dict
+            )
+            self.config.llm_cfg = self.llm.config
+        if self.get_vision_tower():
+            print(f"saving vision_tower to {osp.join(output_dir, 'vision_tower')}")
+            self.vision_tower.config._name_or_path = osp.join(
+                output_dir, "vision_tower"
+            )
+            vision_tower_state_dict = OrderedDict(
+                {
+                    k.split("vision_tower.vision_tower.")[-1]: v
+                    for k, v in state_dict.items()
+                    if "vision_tower" in k
+                }
+            )
+            self.vision_tower.vision_tower.save_pretrained(
+                os.path.join(output_dir, "vision_tower"),
+                state_dict=vision_tower_state_dict,
+            )
+            self.vision_tower.image_processor.save_pretrained(
+                os.path.join(output_dir, "vision_tower")
+            )
+            self.config.vision_tower_cfg = self.vision_tower.config
+            if hasattr(self.config.vision_tower_cfg, "auto_map"):
+                if "radio" not in self.get_vision_tower().__class__.__name__.lower():
+                    delattr(self.config.vision_tower_cfg, "auto_map")
+        if self.get_mm_projector():
+            print(f"saving mm_projector to {osp.join(output_dir, 'mm_projector')}")
+            self.mm_projector.config._name_or_path = osp.join(
+                output_dir, "mm_projector"
+            )
+            mm_projector_state_dict = OrderedDict(
+                {
+                    k.split("mm_projector.")[-1]: v
+                    for k, v in state_dict.items()
+                    if "mm_projector" in k
+                }
+            )
+            self.mm_projector.save_pretrained(
+                os.path.join(output_dir, "mm_projector"),
+                state_dict=mm_projector_state_dict,
+            )
+            self.config.mm_projector_cfg = self.mm_projector.config
+        ## update and save top-level config
+        self.config._name_or_path = output_dir
+        self.config.architectures = [self.__class__.__name__]
+        self.config.save_pretrained(output_dir)
+    def get_llm(self):
+        llm = getattr(self, "llm", None)
+        if type(llm) is list:
+            llm = llm[0]
+        return llm
+    def get_lm_head(self):
+        lm_head = getattr(self.get_llm(), "lm_head", None)
+        return lm_head
+    def get_vision_tower(self):
+        vision_tower = getattr(self, "vision_tower", None)
+        if type(vision_tower) is list:
+            vision_tower = vision_tower[0]
+        return vision_tower
+    def get_mm_projector(self):
+        mm_projector = getattr(self, "mm_projector", None)
+        if type(mm_projector) is list:
+            mm_projector = mm_projector[0]
+        return mm_projector
+    def post_config(self):
+        self.training = self.get_llm().training
+        ## configuration
+        if getattr(self.config, "llm_cfg", None) is None:
+            self.config.llm_cfg = self.llm.config
+        if getattr(self.config, "vision_tower_cfg", None) is None:
+            self.config.vision_tower_cfg = self.vision_tower.config
+        if getattr(self.config, "mm_projector_cfg", None) is None:
+            self.config.mm_projector_cfg = self.mm_projector.config
+    def freezed_module_patch(self):
+        """
+        Huggingface will call model.train() at each training_step. To ensure the expected behaviors for modules like dropout, batchnorm, etc., we need to call model.eval() for the freezed modules.
+        """
+        if self.training:
+            if self.get_llm() and not getattr(
+                self.config, "tune_language_model", False
+            ):
+                pass
+                # logging.warning("Caution: Your LLM is currently in training mode, ensuring accurate gradient computation. Please be vigilant, particularly regarding BatchNorm and Dropout operations.")
+            if self.get_vision_tower() and not getattr(
+                self.config, "tune_vision_tower", False
+            ):
+                self.get_vision_tower().eval()
+            if self.get_mm_projector() and not getattr(
+                self.config, "tune_mm_projector", False
+            ):
+                self.get_mm_projector().eval()
+    def encode_images(self, images):
+        image_features = self.get_vision_tower()(images)
+        image_features = self.get_mm_projector()(image_features)
+        return image_features
+    ## @yunhao: is there a better way to handle function call and attributes for llm?
+    ## support beam search
+    def _temporary_reorder_cache(self, past_key_values, sorted_idx):
+        return self.get_llm()._temporary_reorder_cache(past_key_values, sorted_idx)
+    def get_input_embeddings(self):
+        return self.get_llm().get_input_embeddings()
+    def get_output_embeddings(self):
+        return self.get_llm().get_output_embeddings()
+    def resize_token_embeddings(self, embed_size):
+        self.get_llm().resize_token_embeddings(embed_size)
+class LlavaMetaForCausalLM(ABC):
+    """This class is originally implemented by the LLaVA team and
+    modified by Haotian Tang and Jason Lu based on Ji Lin's implementation
+    to support multiple images and input packing."""
+    ## TODO move the forward function here if there is no need to override it
+    def prepare_inputs_labels_for_multimodal(
+        self, input_ids, position_ids, attention_mask, past_key_values, labels, images
+    ):
+        # Handle sequence parallelism
+        PROCESS_GROUP_MANAGER = get_pg_manager()
+        if PROCESS_GROUP_MANAGER is None:
+            sp_degree = -1
+            sp_rank = -1
+        else:
+            sp_degree = PROCESS_GROUP_MANAGER.sp_degree
+            sp_rank = PROCESS_GROUP_MANAGER.sp_rank
+        vision_tower = self.get_vision_tower()
+        if (
+            vision_tower is None
+            or images is None
+            or (input_ids.shape[1] == 1 and PROCESS_GROUP_MANAGER is None)
+        ):
+            if (
+                past_key_values is not None
+                and vision_tower is not None
+                and images is not None
+                and input_ids.shape[1] == 1
+            ):
+                target_shape = past_key_values[-1][-1].shape[-2] + 1
+                attention_mask = torch.cat(
+                    (
+                        attention_mask,
+                        torch.ones(
+                            (
+                                attention_mask.shape[0],
+                                target_shape - attention_mask.shape[1],
+                            ),
+                            dtype=attention_mask.dtype,
+                            device=attention_mask.device,
+                        ),
+                    ),
+                    dim=1,
+                )
+                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+            return (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                None,
+                labels,
+            )
+        # handle different image dtypes for packing
+        if type(images) is list:
+            images = torch.cat(images, dim=0)
+        elif images.ndim == 5:  # batch_size x seq_len x image_channels
+            images = images.flatten(0, 1)
+        image_features = self.encode_images(images).to(self.device)
+        # Note (kentang-mit@): image start / end is not implemented here to support pretraining.
+        if getattr(self.config, "turn_mm_projector", False) and getattr(
+            self.config, "mm_use_im_start_end", False
+        ):
+            raise NotImplementedError
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(
+                0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
+            )
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+        # remove the padding using attention_mask
+        input_ids_copy = input_ids.clone()
+        # kentang-mit@: Otherwise tokenizer out of bounds. Embeddings of image tokens will not be used.
+        input_ids_copy[input_ids_copy == IMAGE_TOKEN_INDEX] = 0
+        input_embeds = self.llm.model.embed_tokens(input_ids_copy)
+        input_ids = [
+            cur_input_ids[cur_attention_mask]
+            for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
+        ]
+        input_embeds_1 = [
+            cur_input_embeds[cur_attention_mask]
+            for cur_input_embeds, cur_attention_mask in zip(
+                input_embeds, attention_mask
+            )
+        ]
+        labels = [
+            cur_labels[cur_attention_mask]
+            for cur_labels, cur_attention_mask in zip(labels, attention_mask)
+        ]
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        # kentang-mit@: If some part of the model is executed in the loop, the loop length needs to be a constant.
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            cur_input_ids = input_ids[batch_idx]
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                cur_image_features = image_features[0]
+                cur_input_embeds_1 = input_embeds_1[batch_idx]
+                cur_input_embeds = torch.cat(
+                    [cur_input_embeds_1, cur_image_features[0:0]], dim=0
+                )
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                # kentang-mit@: we do not have a placeholder image for text-only data now.
+                continue
+            cur_input_embeds = input_embeds_1[batch_idx]
+            image_token_indices = (
+                [-1]
+                + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
+                + [cur_input_ids.shape[0]]
+            )
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            cur_input_embeds_no_im = []
+            for i in range(len(image_token_indices) - 1):
+                if (
+                    sp_degree > 1 and i == 0 and sp_rank != 0
+                ):  # Handle sequence parallelism
+                    cur_input_ids_noim.append(cur_input_ids[0:0])
+                    cur_labels_noim.append(cur_labels[0:0])
+                    cur_input_embeds_no_im.append(cur_input_embeds[0:0])
+                    continue
+                cur_input_ids_noim.append(
+                    cur_input_ids[
+                        image_token_indices[i] + 1 : image_token_indices[i + 1]
+                    ]
+                )
+                cur_labels_noim.append(
+                    cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]
+                )
+                cur_input_embeds_no_im.append(
+                    cur_input_embeds[
+                        image_token_indices[i] + 1 : image_token_indices[i + 1]
+                    ]
+                )
+            cur_new_input_embeds = []
+            cur_new_labels = []
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    cur_image_features = image_features[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(
+                        torch.full(
+                            (cur_image_features.shape[0],),
+                            IGNORE_INDEX,
+                            device=cur_labels.device,
+                            dtype=cur_labels.dtype,
+                        )
+                    )
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(
+            self.llm.config, "tokenizer_model_max_length", None
+        )
+        if tokenizer_model_max_length is not None:
+            if any(len(x) > tokenizer_model_max_length for x in new_input_embeds):
+                warnings.warn("Inputs truncated!")
+            new_input_embeds = [
+                x[:tokenizer_model_max_length] for x in new_input_embeds
+            ]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        # max_len = tokenizer_model_max_length
+        # print("Warning: using max_len as tokenizer_model_max_length")
+        batch_size = len(new_input_embeds)
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full(
+            (batch_size, max_len),
+            IGNORE_INDEX,
+            dtype=new_labels[0].dtype,
+            device=new_labels[0].device,
+        )
+        attention_mask = torch.zeros(
+            (batch_size, max_len),
+            dtype=attention_mask.dtype,
+            device=attention_mask.device,
+        )
+        position_ids = torch.zeros(
+            (batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device
+        )
+        for i, (cur_new_embed, cur_new_labels) in enumerate(
+            zip(new_input_embeds, new_labels)
+        ):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.llm.config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                            cur_new_embed,
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+            else:
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            cur_new_embed,
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+        # if sp_degree > 1:  # Handle sequence parallelism
+        #     if sp_rank not in self.global_seq_len:
+        #         self.global_seq_len[sp_rank] = position_ids.shape[-1]
+        #     else:
+        #         assert self.global_seq_len[sp_rank] == position_ids.shape[-1]
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+        if _position_ids is None:
+            position_ids = None
+        # We will not use packing here when sequence parallelism is enabled.
+        if PROCESS_GROUP_MANAGER is not None:
+            return (
+                None,
+                _position_ids,
+                attention_mask,
+                past_key_values,
+                new_input_embeds,
+                new_labels,
+            )
+        return (
+            None,
+            position_ids,
+            attention_mask,
+            past_key_values,
+            new_input_embeds,
+            new_labels,
+        )
+    def repack_multimodal_data(
+        self,
+        input_ids,
+        position_ids,
+        attention_mask,
+        past_key_values,
+        inputs_embeds,
+        labels,
+    ):
+        """Reshard (sequence parallel) or repack (otherwise) a padded, embedded batch.
+
+        The path is chosen by whether ``get_pg_manager()`` returns a manager:
+
+        * Manager present (sequence parallelism): the ``attention_mask``,
+          ``position_ids`` and ``labels`` shards of every rank in ``ulysses_pg``
+          are all-gathered, each shard is cut down to its unmasked length,
+          the pieces are concatenated per sample and re-padded, and this rank
+          keeps a slice of ``global_seq_len // sp_degree`` columns (the last
+          rank also keeps the remainder). ``inputs_embeds`` is exchanged with
+          an all-reduce over a zero-initialised buffer.
+        * No manager: samples are sorted by unmasked length (descending) and
+          greedily concatenated into rows of at most ``inputs_embeds.shape[1]``
+          tokens; position ids restart at 0 for every packed sample.
+
+        Args:
+            input_ids: Not read by this method.
+            position_ids: Only read on the sequence-parallel path.
+            attention_mask: Per-token validity mask. It is used directly as a
+                tensor index on the packing path, so it is presumably expected
+                to be a boolean tensor -- TODO confirm against the caller.
+            past_key_values: Returned unchanged.
+            inputs_embeds: Embedded inputs; the last dimension is the channel
+                dimension.
+            labels: Per-token labels, indexed with the same mask as
+                ``inputs_embeds``.
+
+        Returns:
+            A 7-tuple ``(None, position_ids, attention_mask, past_key_values,
+            inputs_embeds, labels, sorted_seqlens_in_batch)``. The last element
+            is ``None`` on the sequence-parallel path.
+
+        Raises:
+            AssertionError: On the packing path, if the number of unmasked
+                positions changes during repacking.
+        """
+        # Handle sequence parallelism
+        PROCESS_GROUP_MANAGER = get_pg_manager()
+        # if PROCESS_GROUP_MANAGER is None:
+        #     sp_degree = -1
+        #     sp_rank = -1
+        # else:
+        #     sp_degree = PROCESS_GROUP_MANAGER.sp_degree
+        #     sp_rank = PROCESS_GROUP_MANAGER.sp_rank
+        # We will not use packing here when sequence parallelism is enabled.
+        # However, we do resharding here to ensure the sequence length is the same across all ranks.
+        if PROCESS_GROUP_MANAGER is not None:
+            sp_degree = PROCESS_GROUP_MANAGER.sp_degree
+            sp_rank = PROCESS_GROUP_MANAGER.sp_rank
+            sp_group = PROCESS_GROUP_MANAGER.ulysses_pg
+            # Local shard shape. The shard lengths are exchanged first because the
+            # receive buffers below are sized per rank.
+            bs, shard_seqlen = position_ids.shape
+            ulysess_seq_len = [
+                torch.zeros(1, dtype=torch.int64, device=position_ids.device)
+                for _ in range(sp_degree)
+            ]
+            dist.all_gather(
+                ulysess_seq_len,
+                torch.tensor(shard_seqlen, device=position_ids.device),
+                group=sp_group,
+            )
+            # global_seq_len = torch.sum(torch.cat(ulysess_seq_len, dim=0)).item()
+            # Gather attention_mask and reshard it evenly
+            attention_mask_list = [
+                torch.zeros(
+                    (bs, ulysess_seq_len[i]),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+                for i in range(sp_degree)
+            ]
+            dist.all_gather(attention_mask_list, attention_mask, group=sp_group)
+            # Number of unmasked positions per sample in each rank's shard.
+            effective_seqlen_list = [
+                attention_mask_list[i].sum(dim=-1) for i in range(sp_degree)
+            ]
+            # After stacking/unbinding, effective_seqlen_batch_list[i][j] is the
+            # unmasked length of sample i on rank j.
+            effective_seqlen = torch.stack(effective_seqlen_list, dim=-1)
+            effective_seqlen_batch_list = torch.unbind(effective_seqlen, dim=0)
+            global_attention_mask_list = []
+            for i in range(bs):
+                global_attention_mask_batch_list = []
+                for j in range(sp_degree):
+                    # Keeps the leading columns of each shard -- presumably the
+                    # shards are right-padded; TODO confirm.
+                    global_attention_mask_batch_list.append(
+                        attention_mask_list[j][i, : effective_seqlen_batch_list[i][j]]
+                    )
+                global_attention_mask_list.append(
+                    torch.cat(global_attention_mask_batch_list, dim=0)
+                )
+            global_attention_mask = torch.nn.utils.rnn.pad_sequence(
+                global_attention_mask_list, batch_first=True, padding_value=False
+            )
+            # Hyperparameters for sequence parallelism resharding
+            global_seq_len = global_attention_mask.shape[-1]
+            seq_len_sharded = global_seq_len // sp_degree
+            start_idx_reshard = seq_len_sharded * sp_rank
+            # Every rank takes seq_len_sharded columns; the last rank also takes
+            # whatever is left over from the integer division.
+            end_idx_reshard = (
+                start_idx_reshard + seq_len_sharded
+                if sp_rank < sp_degree - 1
+                else global_seq_len
+            )
+            # if sp_rank == 0:
+            #     start_idx = 0
+            # else:
+            #     start_idx = torch.sum(torch.cat(ulysess_seq_len[:sp_rank], dim=0)).item()
+            new_attention_mask = torch.narrow(
+                global_attention_mask,
+                1,
+                start_idx_reshard,
+                end_idx_reshard - start_idx_reshard,
+            )
+            # Gather position_ids and reshard it evenly
+            position_ids_list = [
+                torch.zeros(
+                    (bs, ulysess_seq_len[i]),
+                    dtype=position_ids.dtype,
+                    device=position_ids.device,
+                )
+                for i in range(sp_degree)
+            ]
+            dist.all_gather(position_ids_list, position_ids, group=sp_group)
+            global_position_ids_list = []
+            for i in range(bs):
+                global_position_ids_batch_list = []
+                for j in range(sp_degree):
+                    global_position_ids_batch_list.append(
+                        position_ids_list[j][i, : effective_seqlen_batch_list[i][j]]
+                    )
+                global_position_ids_list.append(
+                    torch.cat(global_position_ids_batch_list, dim=0)
+                )
+            global_position_ids = torch.nn.utils.rnn.pad_sequence(
+                global_position_ids_list, batch_first=True, padding_value=-1
+            )
+            new_position_ids = torch.narrow(
+                global_position_ids,
+                1,
+                start_idx_reshard,
+                end_idx_reshard - start_idx_reshard,
+            )
+            # Gather labels and reshard it evenly
+            labels_list = [
+                torch.zeros(
+                    (bs, ulysess_seq_len[i]), dtype=labels.dtype, device=labels.device
+                )
+                for i in range(sp_degree)
+            ]
+            dist.all_gather(labels_list, labels, group=sp_group)
+            global_labels_list = []
+            for i in range(bs):
+                global_labels_batch_list = []
+                for j in range(sp_degree):
+                    global_labels_batch_list.append(
+                        labels_list[j][i, : effective_seqlen_batch_list[i][j]]
+                    )
+                global_labels_list.append(torch.cat(global_labels_batch_list, dim=0))
+            global_labels = torch.nn.utils.rnn.pad_sequence(
+                global_labels_list, batch_first=True, padding_value=IGNORE_INDEX
+            )
+            new_labels = torch.narrow(
+                global_labels, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard
+            )
+            # Gather inputs_embeds and reshard it evenly
+            # TODO: Fix the non-enough images.
+            # inputs_embeds_list = [torch.zeros((bs, ulysess_seq_len[i], inputs_embeds.shape[-1]), dtype=inputs_embeds.dtype, device=inputs_embeds.device, requires_grad=True) for i in range(sp_degree)]
+            # dist.all_gather(inputs_embeds_list, inputs_embeds, group=sp_group)
+            # global_inputs_embeds_list = []
+            # for i in range(bs):
+            #     global_inputs_embeds_batch_list = []
+            #     for j in range(sp_degree):
+            #         global_inputs_embeds_batch_list.append(inputs_embeds_list[j][i, :effective_seqlen_batch_list[i][j]])
+            #     global_inputs_embeds_list.append(torch.cat(global_inputs_embeds_batch_list, dim=0))
+            # global_inputs_embeds = torch.nn.utils.rnn.pad_sequence(global_inputs_embeds_list, batch_first=True, padding_value=0)
+            # new_inputs_embeds = torch.narrow(global_inputs_embeds, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard)
+            # Gather all hidden states and flaten them
+            ulysess_seq_len_cat = torch.cat(ulysess_seq_len, dim=0)
+            global_inputs_embeds_list = []
+            # Column range of this rank's shard inside the concatenation of all shards.
+            # NOTE(review): all three branches below compute the same two values
+            # (the sum over an empty slice is 0), so they could be merged.
+            if sp_rank == 0:
+                original_start_id = 0
+                original_end_id = torch.sum(ulysess_seq_len_cat[: sp_rank + 1]).item()
+            elif sp_rank == sp_degree - 1:
+                original_start_id = torch.sum(ulysess_seq_len_cat[:sp_rank]).item()
+                original_end_id = torch.sum(ulysess_seq_len_cat[: sp_rank + 1]).item()
+            else:
+                original_start_id = torch.sum(ulysess_seq_len_cat[:sp_rank]).item()
+                original_end_id = torch.sum(ulysess_seq_len_cat[: sp_rank + 1]).item()
+            all_inputs_embeds = torch.zeros(
+                bs,
+                torch.sum(ulysess_seq_len_cat),
+                inputs_embeds.shape[-1],
+                dtype=inputs_embeds.dtype,
+                device=inputs_embeds.device,
+            ).contiguous()
+            # Each rank fills only its own column range of the zero buffer; the
+            # all-reduce below (no op given, so the library default -- presumably
+            # SUM) then combines the buffers of all ranks.
+            all_inputs_embeds[:, original_start_id:original_end_id, :] += inputs_embeds
+            dist.barrier(group=sp_group)
+            dist.all_reduce(all_inputs_embeds, group=sp_group)
+            dist.barrier(group=sp_group)
+            for i in range(bs):
+                global_inputs_embeds_batch_list = []
+                for j in range(sp_degree):
+                    # Offset of rank j's shard in the concatenated buffer.
+                    prev_len = torch.sum(ulysess_seq_len_cat[:j]).item() if j > 0 else 0
+                    start_id = prev_len
+                    end_id = prev_len + effective_seqlen_batch_list[i][j]
+                    global_inputs_embeds_batch_list.append(
+                        all_inputs_embeds[i, start_id:end_id]
+                    )
+                global_inputs_embeds_list.append(
+                    torch.cat(global_inputs_embeds_batch_list, dim=0)
+                )
+            global_inputs_embeds = torch.nn.utils.rnn.pad_sequence(
+                global_inputs_embeds_list, batch_first=True, padding_value=0
+            )
+            new_inputs_embeds = torch.narrow(
+                global_inputs_embeds,
+                1,
+                start_idx_reshard,
+                end_idx_reshard - start_idx_reshard,
+            )
+            return (
+                None,
+                new_position_ids,
+                new_attention_mask,
+                past_key_values,
+                new_inputs_embeds,
+                new_labels,
+                None,  # sorted_seqlens_in_batch set as None for sequence parallelism
+            )
+        # kentang-mit@: reorder and repack (reduce computation overhead)
+        # requires transformers replacement.
+        new_inputs_embeds = []
+        new_position_ids = []
+        new_labels = []
+        seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+        sorted_seqlens_in_batch, sorted_idx = torch.sort(
+            seqlens_in_batch, descending=True
+        )
+        # A packed row may never be longer than the padded input width.
+        max_seqlen = inputs_embeds.shape[1]
+        cur_inputs_embeds = []
+        cur_position_ids = []
+        cur_labels = []
+        cur_batch_len = 0
+        # Greedy packing, longest sample first: keep appending to the current row
+        # while the sample still fits, otherwise flush the row and start a new one.
+        for i in range(len(sorted_seqlens_in_batch)):
+            cur_seqlen = sorted_seqlens_in_batch[i].item()
+            if cur_seqlen + cur_batch_len <= max_seqlen:
+                cur_batch_len += cur_seqlen
+                # each item: num_tokens x num_channels
+                # remove padding on-the-fly
+                cur_inputs_embeds.append(
+                    inputs_embeds[sorted_idx[i]][attention_mask[sorted_idx[i]]]
+                )
+                # Position ids restart at 0 for every packed sample.
+                cur_position_ids.append(
+                    torch.arange(
+                        cur_inputs_embeds[-1].shape[0],
+                        device=cur_inputs_embeds[-1].device,
+                    )
+                )
+                # each item: num_tokens
+                # remove padding on-the-fly
+                cur_labels.append(labels[sorted_idx[i]][attention_mask[sorted_idx[i]]])
+            else:
+                new_inputs_embeds.append(torch.cat(cur_inputs_embeds, 0))
+                new_position_ids.append(torch.cat(cur_position_ids, 0))
+                new_labels.append(torch.cat(cur_labels, 0))
+                # The current batch is too long. We will start a new batch.
+                cur_batch_len = cur_seqlen
+                cur_inputs_embeds = [
+                    inputs_embeds[sorted_idx[i]][attention_mask[sorted_idx[i]]]
+                ]
+                cur_position_ids = [
+                    torch.arange(
+                        cur_inputs_embeds[-1].shape[0],
+                        device=cur_inputs_embeds[-1].device,
+                    )
+                ]
+                cur_labels = [labels[sorted_idx[i]][attention_mask[sorted_idx[i]]]]
+            # Mask the first token in the labels for every sample
+            # cur_labels[-1][0] = IGNORE_INDEX
+        # Flush the last, still-open row.
+        if len(cur_inputs_embeds):
+            new_inputs_embeds.append(torch.cat(cur_inputs_embeds, 0))
+            new_position_ids.append(torch.cat(cur_position_ids, 0))
+            new_labels.append(torch.cat(cur_labels, 0))
+        # NOTE(review): the embeddings are padded with the pad token *id* as a
+        # fill value rather than 0 -- confirm this is intended.
+        new_inputs_embeds = torch.nn.utils.rnn.pad_sequence(
+            new_inputs_embeds, batch_first=True, padding_value=self.llm.pad_token_id
+        )
+        new_position_ids = torch.nn.utils.rnn.pad_sequence(
+            new_position_ids, batch_first=True, padding_value=-1
+        )
+        new_labels = torch.nn.utils.rnn.pad_sequence(
+            new_labels, batch_first=True, padding_value=IGNORE_INDEX
+        )
+        ## yunhao: it's currently a workaround to avoid errors for seq_len < 100
+        # Padding positions carry position id -1, which is how the mask is rebuilt.
+        new_attention_mask = new_position_ids.ne(-1)
+        # sanity check
+        assert new_attention_mask.sum() == attention_mask.sum()
+        # Handle sequence parallelism: Calculate the position ids for sequence parallelism
+        # NOTE: This implementation only works for [<bos>, <img>, ..., <img>, <caption>] pattern
+        # if sp_degree > 1 and sp_rank > 0:
+        #     cur_len = new_position_ids.shape[-1]
+        #     if sp_rank < sp_degree - 1:  # Intermediate ranks
+        #         offset = cur_len * sp_rank + 1
+        #         new_position_ids = new_position_ids + offset
+        #     elif sp_rank == sp_degree - 1:  # The last rank
+        #         assert new_labels[0, -1] != IGNORE_INDEX, "The first sequence should be longest one."
+        #         last_img_token_index = torch.where(new_labels[0] == IGNORE_INDEX)[0][-1]
+        #         # print(f"last_img_token_index, {last_img_token_index}")
+        #         # if sp_degree == 2: # Handle SP=2, because of bos_token
+        #         #     offset = last_img_token_index + 3
+        #         # else:
+        #         #     offset = (last_img_token_index + 2) * sp_rank + 1
+        #         offset = (last_img_token_index + 1) * sp_rank + 1
+        #         offset_mask = new_position_ids != -1
+        #         new_position_ids[offset_mask] += offset
+        #     else:
+        #         raise ValueError(f"sp_rank {sp_rank} is out of range {sp_degree}")
+        return (
+            None,
+            new_position_ids,
+            new_attention_mask,
+            past_key_values,
+            new_inputs_embeds,
+            new_labels,
+            sorted_seqlens_in_batch,
+        )
+    def initialize_vision_tokenizer(self, model_args, tokenizer):
+        if model_args.mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+        if model_args.mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens(
+                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
+            )
+            self.resize_token_embeddings(len(tokenizer))
+            if num_new_tokens > 0:
+                input_embeddings = self.get_input_embeddings().weight.data
+                output_embeddings = self.get_output_embeddings().weight.data
+                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+                    dim=0, keepdim=True
+                )
+                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+                    dim=0, keepdim=True
+                )
+                input_embeddings[-num_new_tokens:] = input_embeddings_avg
+                output_embeddings[-num_new_tokens:] = output_embeddings_avg
+            ## TODO yunhao: handle cases for <im_st> <im_end>
+            if model_args.pretrain_mm_mlp_adapter:
+                mm_projector_weights = torch.load(
+                    model_args.pretrain_mm_mlp_adapter, map_location="cpu"
+                )
+                embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
+                assert num_new_tokens == 2
+                if input_embeddings.shape == embed_tokens_weight.shape:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[
+                        -num_new_tokens:
+                    ]
+                elif embed_tokens_weight.shape[0] == num_new_tokens:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
+                else:
+                    raise ValueError(
+                        f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}."
+                    )
+        elif model_args.mm_use_im_patch_token:
+            if model_args.mm_projector:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = False
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False

llava_llama.py ADDED Viewed

	@@ -0,0 +1,1193 @@

+import inspect
+# from .builder import build_llm_and_tokenizer, build_mm_projector, build_vision_tower
+import os
+import os.path as osp
+import shutil
+import warnings
+from typing import List, Optional, Tuple, Union
+# from .llava_llama import LlavaLlamaModel
+# from llava.model import *
+# from llava.model.utils import is_mm_model
+import torch
+import torch.nn as nn
+from huggingface_hub import repo_exists, snapshot_download
+from huggingface_hub.utils import HFValidationError, validate_repo_id
+# from llava.model.multimodal_encoder.vision_encoder import (VisionTower,
+#                                                            VisionTowerS2)
+from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM,
+                          AutoTokenizer, BitsAndBytesConfig, GenerationConfig,
+                          LlamaConfig, LlamaForCausalLM, PretrainedConfig,
+                          PreTrainedModel, SiglipImageProcessor,
+                          SiglipVisionModel)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .configuration_llava import LlavaConfig  # , LlavaLlamaConfig
+# from .llava_arch import LlavaMetaForCausalLM, LlavaMetaModel
+from .utils import get_model_config
+# Heart-beat settings; presumably in seconds -- TODO confirm against the serving code.
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+# Directory used for logs (current working directory).
+LOGDIR = "."
+# Model Constants
+# Label value for positions excluded from the loss; presumably matches the
+# default ignore_index of the cross-entropy loss -- TODO confirm.
+IGNORE_INDEX = -100
+# Negative sentinel id marking an image placeholder in input_ids; it is not a
+# valid vocabulary index.
+IMAGE_TOKEN_INDEX = -200
+# Textual forms of the image-related special tokens.
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"
+def is_deepspeed_zero3_enabled():
+    return None
+import torch
+import torch.nn as nn
+from transformers import (AutoConfig, AutoModel, PretrainedConfig,
+                          PreTrainedModel)
+class IdentityMap(nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, x, *args, **kwargs):
+        return x
+    @property
+    def config(self):
+        return {"mm_projector_type": "identity"}
+class SimpleResBlock(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(channels)
+        self.proj = nn.Sequential(
+            nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
+        )
+    def forward(self, x):
+        x = self.pre_norm(x)
+        return x + self.proj(x)
+class DownSampleBlock(nn.Module):
+    def forward(self, x):
+        vit_embeds = x
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.flat_square(vit_embeds)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        return vit_embeds
+    def flat_square(self, x):
+        n, w, h, c = x.size()
+        if w % 2 == 1:
+            x = torch.concat(
+                [x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1
+            ).contiguous()
+            n, w, h, c = x.size()
+        if h % 2 == 1:
+            x = torch.concat(
+                [x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2
+            ).contiguous()
+            n, w, h, c = x.size()
+        x = x.view(n, w, int(h / 2), int(c * 2))
+        x = x.permute(0, 2, 1, 3).contiguous()
+        x = x.view(n, int(h / 2), int(w / 2), int(c * 4))
+        return x
+class MultimodalProjectorConfig(PretrainedConfig):
+    model_type = "v2l_projector"
+    def __init__(self, mm_projector_type: str = None, **kwargs):
+        super().__init__()
+        self.mm_projector_type = mm_projector_type
+class MultimodalProjector(PreTrainedModel):
+    config_class = MultimodalProjectorConfig
+    def __init__(
+        self, mm_projector_cfg: MultimodalProjectorConfig, config: PretrainedConfig
+    ):
+        super().__init__(mm_projector_cfg)
+        mm_projector_type = mm_projector_cfg.mm_projector_type
+        if mm_projector_type == "identity":
+            self.layers = IdentityMap()
+        elif mm_projector_type == "linear":
+            self.layers = nn.Linear(config.mm_hidden_size, config.hidden_size)
+        elif mm_projector_type == "mlp_downsample":
+            self.layers = nn.Sequential(
+                DownSampleBlock(),
+                nn.LayerNorm(config.mm_hidden_size * 4),
+                nn.Linear(config.mm_hidden_size * 4, config.hidden_size),
+                nn.GELU(),
+                nn.Linear(config.hidden_size, config.hidden_size),
+            )
+        else:
+            mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", mm_projector_type)
+            if mlp_gelu_match:
+                mlp_depth = int(mlp_gelu_match.group(1))
+                modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+                for _ in range(1, mlp_depth):
+                    modules.append(nn.GELU())
+                    modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+                self.layers = nn.Sequential(*modules)
+            else:
+                raise ValueError(f"Unknown projector type: {mm_projector_type}")
+    def forward(self, x, *args, **kwargs):
+        return self.layers(x)
def build_mm_projector(
    model_type_or_path: str, config: PretrainedConfig
) -> PreTrainedModel:
    """Create the multimodal projector, or return ``None`` if none is requested.

    Args:
        model_type_or_path: Projector type string, or a checkpoint path when
            ``config.resume_path`` is truthy. ``None`` skips construction.
        config: Top-level config; ``resume_path`` and ``model_dtype`` are read
            here, and the object is forwarded to ``MultimodalProjector``.

    Returns:
        The projector in the dtype named by ``config.model_dtype``, or ``None``.

    Raises:
        AssertionError: When resuming and ``model_type_or_path`` does not exist.
    """
    if model_type_or_path is None:
        return None

    # NOTE(review): ``config.model_dtype`` is evaluated with eval(); this is
    # only safe as long as the config comes from a trusted source.
    if not config.resume_path:
        ## build from scratch
        projector_cfg = MultimodalProjectorConfig(model_type_or_path)
        projector = MultimodalProjector(projector_cfg, config)
        return projector.to(eval(config.model_dtype))

    ## load from pretrained model
    assert os.path.exists(
        model_type_or_path
    ), f"Resume mm projector path {model_type_or_path} does not exist!"
    resume_dtype = eval(config.model_dtype)
    return MultimodalProjector.from_pretrained(
        model_type_or_path, config, torch_dtype=resume_dtype
    )
class VisionTower(nn.Module):
    """Base wrapper that extracts image features from a vision encoder.

    This base class never assigns ``self.vision_tower`` itself; subclasses
    (for example ``SiglipVisionTower``) set it and flip ``is_loaded``.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        """Record the tower name and the feature-selection settings.

        Args:
            vision_tower: Stored verbatim as ``vision_tower_name``.
            args: Object optionally carrying ``mm_vision_select_layer``
                (default ``-2``) and ``mm_vision_select_feature``
                (default ``"patch"``).
            delay_load: Accepted for interface compatibility; not used here.
        """
        super().__init__()
        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_layer = getattr(args, "mm_vision_select_layer", -2)
        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
        self.cfg_only = None

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden state and optionally drop the first token.

        Raises:
            ValueError: If ``select_feature`` is neither ``"patch"`` nor
                ``"cls_patch"``.
        """
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == "patch":
            # Drop the token at sequence position 0.
            return image_features[:, 1:]
        if self.select_feature == "cls_patch":
            return image_features
        raise ValueError(f"Unexpected select feature: {self.select_feature}")

    @staticmethod
    def _interpolate_position_embeddings(old_weight, num_new_tokens):
        """Linearly resample a position-embedding table to ``num_new_tokens`` rows.

        New row ``i`` maps to fractional old index ``p = i / (N - 1) * (M - 1)``
        and is ``(1 - frac) * old[floor(p)] + frac * old[ceil(p)]`` with
        ``frac = p - floor(p)``. When ``p`` is an exact integer this returns the
        old row unchanged instead of weighting both neighbours by zero.

        Args:
            old_weight: 2-D tensor holding the existing embedding table.
            num_new_tokens: Number of rows in the resampled table.

        Returns:
            Tensor with ``num_new_tokens`` rows in ``old_weight``'s dtype.
        """
        old_num_tokens = old_weight.size(0)
        positions = torch.arange(num_new_tokens, device=old_weight.device)
        # max(..., 1) avoids 0/0 when a single new token is requested.
        mapped_indices = (
            positions / max(num_new_tokens - 1, 1) * (old_num_tokens - 1)
        )
        floor_indices = torch.clamp(
            mapped_indices.floor().long(), min=0, max=old_num_tokens - 1
        )
        ceil_indices = torch.clamp(
            mapped_indices.ceil().long(), min=0, max=old_num_tokens - 1
        )
        frac = (mapped_indices - floor_indices)[:, None]
        interpolated = (1.0 - frac) * old_weight[floor_indices, :] + (
            frac * old_weight[ceil_indices, :]
        )
        # The float32 index tensor promotes half-precision weights; cast back
        # so the new table keeps the dtype of the one it replaces.
        return interpolated.to(old_weight.dtype)

    def _maybe_resize_pos_embeds(
        self,
        model: "PreTrainedModel",
        image_processor,
        resolution: int = -1,
        interpolate_mode: str = "linear",
    ):
        """Resize ``model``'s position embeddings for a new input resolution.

        Does nothing when ``resolution`` is ``-1`` or already equals
        ``model.config.image_size``. Otherwise the position-embedding table is
        resampled, and ``model.config``, ``image_processor`` and the embedding
        module's bookkeeping attributes are updated in place.

        Args:
            model: Vision model exposing ``config`` and
                ``vision_model.embeddings``.
            image_processor: Processor whose ``crop_size`` or ``size`` is
                updated to the new resolution.
            resolution: Target square image size; ``-1`` disables resizing.
            interpolate_mode: Only ``"linear"`` is implemented.

        Raises:
            NotImplementedError: For any other ``interpolate_mode``.
        """
        if resolution in [model.config.image_size, -1]:
            return
        print(
            f"Resizing vision model's position embeddings to support higher vision resolution: from {model.config.image_size} to {resolution} ..."
        )
        embeddings = model.vision_model.embeddings
        patch_size = embeddings.patch_size
        num_new_tokens = int((resolution // patch_size) ** 2)
        old_embeddings = embeddings.position_embedding
        if interpolate_mode != "linear":
            raise NotImplementedError

        _, old_embedding_dim = old_embeddings.weight.size()
        new_embeddings = nn.Embedding(
            num_new_tokens,
            old_embedding_dim,
            dtype=old_embeddings.weight.dtype,
            device=old_embeddings.weight.device,
        )
        if is_deepspeed_zero3_enabled():
            # NOTE(review): the weight is assigned after the gather context
            # exits, as in the original code; confirm this propagates under
            # ZeRO-3 with modifier_rank=0.
            params = [old_embeddings.weight, new_embeddings.weight]
            with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
                interpolated_embeds = self._interpolate_position_embeddings(
                    old_embeddings.weight.data, num_new_tokens
                )
        else:
            interpolated_embeds = self._interpolate_position_embeddings(
                old_embeddings.weight.data, num_new_tokens
            )
        new_embeddings.weight.data = interpolated_embeds
        # Any accelerate ``_hf_hook`` on the old module is intentionally not
        # re-attached to the new one (disabled for inference).
        new_embeddings.requires_grad_(old_embeddings.weight.requires_grad)

        ## update vision encoder's configurations
        model.config.image_size = resolution
        if hasattr(image_processor, "crop_size"):
            # CLIP vision tower
            image_processor.crop_size = resolution
        else:
            # SIGLIP vision tower
            assert hasattr(image_processor, "size")
            image_processor.size = {"height": resolution, "width": resolution}
        ## TODO define a '_reinitialize' method for VisionTower
        embeddings.position_embedding = new_embeddings
        embeddings.image_size = resolution
        embeddings.num_patches = embeddings.num_positions = num_new_tokens
        embeddings.position_ids = (
            torch.arange(embeddings.num_positions)
            .expand((1, -1))
            .to(old_embeddings.weight.device)
        )

    def forward(self, images):
        """Encode ``images`` and return the selected features.

        A list is processed one image at a time (each gets a leading batch
        dimension via ``unsqueeze(0)``) and yields a list of features; any
        other input is passed to the tower in a single call. Features are cast
        back to the dtype of the corresponding input.
        """
        if isinstance(images, list):
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(
                    image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
                    output_hidden_states=True,
                )
                image_features.append(
                    self.feature_select(image_forward_out).to(image.dtype)
                )
            return image_features

        image_forward_outs = self.vision_tower(
            images.to(device=self.device, dtype=self.dtype),
            output_hidden_states=True,
        )
        return self.feature_select(image_forward_outs).to(images.dtype)

    @property
    def dummy_feature(self):
        """A zero tensor of shape ``(1, hidden_size)`` on the tower's device/dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """Dtype reported by the wrapped ``vision_tower``."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device reported by the wrapped ``vision_tower``."""
        return self.vision_tower.device

    @property
    def config(self):
        """The tower's config once loaded, otherwise ``cfg_only``."""
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        """``hidden_size`` from the active config."""
        return self.config.hidden_size

    @property
    def num_patches(self):
        """``(image_size // patch_size) ** 2`` from the active config."""
        return (self.config.image_size // self.config.patch_size) ** 2
class SiglipVisionTower(VisionTower):
    """Vision tower that loads a SigLIP vision model and its image processor."""

    def __init__(
        self, model_name_or_path: str, config: PretrainedConfig, state_dict=None
    ):
        """Load the processor and encoder from ``model_name_or_path``.

        Args:
            model_name_or_path: Location handed to both ``from_pretrained`` calls.
            config: Supplies ``model_dtype`` and the base-class selection options.
            state_dict: Optional weights forwarded to the encoder loader.
        """
        super().__init__(model_name_or_path, config)
        self.image_processor = SiglipImageProcessor.from_pretrained(model_name_or_path)
        # TODO(ligeng): why pass config here leading to errors?
        encoder_kwargs = {
            "torch_dtype": eval(config.model_dtype),
            "state_dict": state_dict,
        }
        self.vision_tower = SiglipVisionModel.from_pretrained(
            model_name_or_path, **encoder_kwargs
        )
        self.is_loaded = True
def build_vision_tower(
    model_name_or_path: str, config: PretrainedConfig
) -> PreTrainedModel:
    """Instantiate the vision tower and record its width on ``config``.

    Args:
        model_name_or_path: Model name or checkpoint path; ``None`` skips
            instantiation.
        config: Top-level config. ``resume_path`` and the optional ``s2`` flag
            are read; ``mm_hidden_size`` is written.

    Returns:
        The constructed tower, or ``None`` when no name/path is given.

    Raises:
        AssertionError: When resuming from a path that does not exist.
        ValueError: When the resolved name does not contain ``"siglip"``.
    """
    ## skip vision tower instantiation
    if model_name_or_path is None:
        return None

    # When resuming (and the name is not a "radio" one) the architecture name
    # from the checkpoint config decides the tower type, not the path.
    tower_key = model_name_or_path
    if config.resume_path and "radio" not in model_name_or_path:
        assert os.path.exists(
            model_name_or_path
        ), f"Resume vision tower path {model_name_or_path} does not exist!"
        resumed_cfg = AutoConfig.from_pretrained(
            model_name_or_path, trust_remote_code=True
        )
        tower_key = resumed_cfg.architectures[0].lower()

    use_s2 = getattr(config, "s2", False)
    if "siglip" not in tower_key:
        raise ValueError(f"Unknown vision tower: {model_name_or_path}")

    tower_cls = SiglipVisionTowerS2 if use_s2 else SiglipVisionTower
    vision_tower = tower_cls(model_name_or_path, config)

    if use_s2:
        config.mm_hidden_size = vision_tower.hidden_size
    else:
        config.mm_hidden_size = vision_tower.config.hidden_size
    return vision_tower
def has_tokenizer(repo_id_or_path: str) -> bool:
    """Tell whether ``tokenizer_config.json`` exists for the given location.

    A local directory is checked first; otherwise the argument is treated as a
    Hugging Face Hub repo id. An ``HFValidationError`` from the Hub helpers is
    reported as ``False``.
    """
    tokenizer_file = "tokenizer_config.json"

    # Check if the tokenizer is in a local directory
    local_candidate = osp.join(repo_id_or_path, tokenizer_file)
    if osp.exists(local_candidate):
        return True

    # Check if the tokenizer is in a Hugging Face Hub repo
    try:
        found_on_hub = repo_exists(repo_id_or_path) and file_exists(
            repo_id_or_path, tokenizer_file
        )
    except HFValidationError:
        return False
    return found_on_hub
def context_length_extension(config):
    """Enable linear RoPE scaling when the requested length exceeds the native one.

    If ``config.model_max_length`` is larger than
    ``config.max_position_embeddings``, ``config.rope_scaling`` is set to a
    linear scaling entry whose factor is the ceiling of their ratio. The config
    is left untouched when either value is missing/``None`` or no extension is
    needed.

    Args:
        config: Object that may expose ``max_position_embeddings`` and
            ``model_max_length``; mutated in place.

    Returns:
        The same ``config`` object.
    """
    orig_ctx_len = getattr(config, "max_position_embeddings", None)
    model_max_length = getattr(config, "model_max_length", None)
    # Comparing None with an int raises TypeError, so bail out early.
    if not orig_ctx_len or model_max_length is None:
        return config
    if model_max_length > orig_ctx_len:
        print(f"Scaling RoPE from {orig_ctx_len} to {model_max_length}")
        scaling_factor = float(math.ceil(model_max_length / orig_ctx_len))
        config.rope_scaling = {"type": "linear", "factor": scaling_factor}
    return config
def build_llm_and_tokenizer(
    model_name_or_path: str,
    config: PretrainedConfig,
    attn_implementation=None,
    model_max_length=None,
    *args,
    **kwargs,
):
    """Load the language model and its tokenizer.

    Args:
        model_name_or_path: Location of the LLM checkpoint.
        config: Top-level config; ``model_dtype`` is read and ``hidden_size``
            is written from the loaded LLM.
        attn_implementation: Stored on the LLM config as
            ``_attn_implementation``.
        model_max_length: Stored on the LLM config; when not ``None`` it may
            trigger RoPE scaling via ``context_length_extension``.
        *args, **kwargs: Forwarded to ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        ``(llm, tokenizer)``.

    Raises:
        ValueError: If no tokenizer is found at ``model_name_or_path`` or in
            its ``llm`` subdirectory.
    """
    llm_cfg = AutoConfig.from_pretrained(model_name_or_path)
    llm_cfg._attn_implementation = attn_implementation
    llm_cfg.model_max_length = model_max_length
    if model_max_length is not None:
        context_length_extension(llm_cfg)

    llm = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        *args,
        config=llm_cfg,
        torch_dtype=eval(config.model_dtype),
        **kwargs,
    )

    # Locate the tokenizer.
    llm_path = model_name_or_path
    if not has_tokenizer(llm_path):
        llm_path = osp.join(llm_path, "llm")
    if not has_tokenizer(llm_path):
        raise ValueError(f"Cannot find tokenizer in {llm_path}.")

    # TODO(ligeng): use LLM class to judge to better compability.
    try:
        llm_arch = getattr(llm_cfg, "architectures")[0].lower()
    except (AttributeError, TypeError, IndexError):
        warnings.warn(
            f'Cannot find LLM architecture, please check the "config.json" under "{llm_path}".'
        )
        # Fall through to the default tokenizer settings instead of hitting
        # an unbound ``llm_arch`` below.
        llm_arch = ""

    tokenizer_kwargs = {
        "model_max_length": llm_cfg.model_max_length,
        "padding_side": "right",
    }
    # NOTE(review): ``"yi" in llm_path`` is a plain substring test on the
    # path; the layer/head-count check looks like a fallback fingerprint for
    # the same model family. Confirm both are still wanted.
    looks_like_yi = "yi" in llm_path or (
        getattr(llm_cfg, "num_hidden_layers", -1) == 60
        and getattr(llm_cfg, "num_attention_heads", -1) == 56
    )
    if "mpt" in llm_arch:
        pass
    elif looks_like_yi:
        tokenizer_kwargs["use_fast"] = False
    else:
        tokenizer_kwargs["use_fast"] = False
        tokenizer_kwargs["legacy"] = False
    tokenizer = AutoTokenizer.from_pretrained(llm_path, **tokenizer_kwargs)

    # TODO(ligeng): is this necessary for llava?
    config.hidden_size = llm.config.hidden_size
    return llm, tokenizer
def is_mm_model(model_path):
    """
    Check if the model at the given path is a visual language model.

    The config at ``model_path`` is loaded and its ``architectures`` entries
    are searched (case-insensitively) for the substring ``"llava"``. A config
    without architectures is treated as not multimodal.

    Args:
        model_path (str): The path to the model.
    Returns:
        bool: True if the model is an MM model, False otherwise.
    """
    config = AutoConfig.from_pretrained(model_path)
    # ``architectures`` may be missing or None; iterate over an empty list then.
    architectures = getattr(config, "architectures", None) or []
    return any("llava" in architecture.lower() for architecture in architectures)
def load_pretrained_model(
    model_path,
    model_name,
    model_base=None,
    load_8bit=False,
    load_4bit=False,
    device_map="auto",
    device="cuda",
    **kwargs,
):
    """Load a model for evaluation together with its tokenizer and processor.

    Args:
        model_path: Checkpoint path or Hub id of the model (or LoRA adapter).
        model_name: Name used only for substring dispatch ("lora", "dora",
            "mpt", "mistral", "mixtral", "gemma").
        model_base: Base model to merge adapter/projector weights into.
        load_8bit: Request 8-bit loading.
        load_4bit: Request 4-bit (nf4) loading; ignored if ``load_8bit``.
        device_map: Forwarded as ``device_map`` unless ``device`` is not "cuda".
        device: Target device for the vision tower and projector.
        **kwargs: Extra loader keyword arguments.

    Returns:
        ``(tokenizer, model, image_processor, context_len)``;
        ``image_processor`` is ``None`` for non-multimodal models.
    """
    kwargs = {"device_map": device_map, **kwargs}
    if device != "cuda":
        kwargs["device_map"] = {"": device}

    if load_8bit:
        kwargs["load_in_8bit"] = True
    elif load_4bit:
        kwargs["load_in_4bit"] = True
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
    else:
        kwargs["torch_dtype"] = torch.float16

    # Resolve once; each call re-reads the config from ``model_path``.
    is_mm = is_mm_model(model_path)
    name_lower = model_name.lower()

    if is_mm:
        # Load LLaVA model
        ## TODO @yunhao: mind fixing lora
        if "lora" in name_lower and model_base is None:
            warnings.warn(
                "There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged."
            )
        if ("lora" in name_lower or "dora" in name_lower) and model_base is not None:
            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
            print(lora_cfg_pretrained)
            print("Loading LLaVA from base model...")
            config = AutoConfig.from_pretrained(model_base)
            prepare_config_for_eval(config, kwargs)
            model = LlavaLlamaModel.from_pretrained(
                model_base, low_cpu_mem_usage=True, config=config, **kwargs
            )
            tokenizer = model.tokenizer
            token_num, token_dim = (
                model.llm.lm_head.out_features,
                model.llm.lm_head.in_features,
            )
            # NOTE(review): ``token_num`` is read from the same lm_head it is
            # compared against, so this branch looks unreachable; kept as-is.
            if model.llm.lm_head.weight.shape[0] != token_num:
                model.llm.lm_head.weight = torch.nn.Parameter(
                    torch.empty(
                        token_num, token_dim, device=model.device, dtype=model.dtype
                    )
                )
                model.llm.embed_tokens.weight = torch.nn.Parameter(
                    torch.empty(
                        token_num, token_dim, device=model.device, dtype=model.dtype
                    )
                )

            print("Loading additional LLaVA weights...")
            # NOTE(review): torch.load unpickles the file; only use with
            # checkpoints from a trusted source.
            local_trainables = os.path.join(model_path, "non_lora_trainables.bin")
            if os.path.exists(local_trainables):
                non_lora_trainables = torch.load(local_trainables, map_location="cpu")
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download

                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id, filename=filename, subfolder=subfolder
                    )
                    return torch.load(cache_file, map_location="cpu")

                non_lora_trainables = load_from_hf(
                    model_path, "non_lora_trainables.bin"
                )
            # Strip the wrapper prefixes so the keys match this model.
            non_lora_trainables = {
                k.removeprefix("base_model."): v
                for k, v in non_lora_trainables.items()
            }
            if any(k.startswith("model.model.") for k in non_lora_trainables):
                non_lora_trainables = {
                    k.removeprefix("model."): v
                    for k, v in non_lora_trainables.items()
                }
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel

            print("Loading LoRA weights...")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging LoRA weights...")
            model = model.merge_and_unload()
            print("Model is loaded...")
        ## TODO @yunhao: mind fixing this
        elif model_base is not None:
            # this may be mm projector only
            print("Loading LLaVA from base model...")
            cfg_pretrained = AutoConfig.from_pretrained(
                model_path, trust_remote_code=True
            )
            # ``cfg_pretrained`` is the only config bound in this branch; the
            # previous ``config`` reference was unbound here.
            # NOTE(review): confirm this is the config the wrapper expects.
            mm_config_wrapper(cfg_pretrained, kwargs)
            if "mpt" in name_lower:
                if not os.path.isfile(os.path.join(model_path, "configuration_mpt.py")):
                    shutil.copyfile(
                        os.path.join(model_base, "configuration_mpt.py"),
                        os.path.join(model_path, "configuration_mpt.py"),
                    )
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
                model = LlavaMPTForCausalLM.from_pretrained(
                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
                )
            else:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_base, use_fast=False, legacy=False
                )
                model = LlavaLlamaForCausalLM.from_pretrained(
                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
                )
        else:
            config = AutoConfig.from_pretrained(model_path)
            config.resume_path = model_path
            prepare_config_for_eval(config, kwargs)
            if "mpt" in name_lower:
                model = LlavaMPTForCausalLM.from_pretrained(
                    model_path, config=config, low_cpu_mem_usage=True, **kwargs
                )
            elif "mistral" in name_lower or "mixtral" in name_lower:
                model = LlavaMistralForCausalLM.from_pretrained(
                    model_path, config=config, low_cpu_mem_usage=True, **kwargs
                )
            elif "gemma" in name_lower:
                model = LlavaGemmaForCausalLM.from_pretrained(
                    model_path, config=config, low_cpu_mem_usage=True, **kwargs
                )
            else:
                # kentang-mit@: llama-2 model
                model = LlavaLlamaModel(config=config, low_cpu_mem_usage=True, **kwargs)
            tokenizer = model.tokenizer
    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel

            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(
                model_base, low_cpu_mem_usage=True, **kwargs
            )
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging weights")
            model = model.merge_and_unload()
            print("Convert to FP16...")
            model.to(torch.float16)
        elif "mpt" in name_lower:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_path, use_fast=False, legacy=False
            )
            model = AutoModelForCausalLM.from_pretrained(
                model_path, low_cpu_mem_usage=True, **kwargs
            )

    model.eval()

    image_processor = None
    if is_mm:
        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens(
                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
            )
        model.resize_token_embeddings(len(tokenizer))
        vision_tower = model.get_vision_tower()
        vision_tower.to(device=device, dtype=torch.float16)
        mm_projector = model.get_mm_projector()
        mm_projector.to(device=device, dtype=torch.float16)
        image_processor = vision_tower.image_processor

    # Plain language models have no ``.llm`` wrapper, so fall back to the
    # model itself. The top-level config value wins when it is present.
    llm_config = getattr(model, "llm", model).config
    if hasattr(llm_config, "max_sequence_length"):
        context_len = getattr(
            model.config, "max_sequence_length", llm_config.max_sequence_length
        )
    else:
        context_len = 2048
    return tokenizer, model, image_processor, context_len
def parse_model_name_or_path(config: PretrainedConfig, model_name="llm", suffix="_cfg"):
    """Resolve the name or path stored under ``config.<model_name><suffix>``.

    A string value is returned as-is; for a dict value the first entry of its
    ``"architectures"`` list is returned.

    Raises:
        ValueError: If the attribute is missing or is neither a str nor a dict.
    """
    attr_name = f"{model_name}{suffix}"
    component_cfg = getattr(config, attr_name, None)
    if isinstance(component_cfg, dict):
        return component_cfg["architectures"][0]
    if isinstance(component_cfg, str):
        return component_cfg
    raise ValueError(f"Invalid {attr_name} configuration!")
def prepare_config_for_eval(config: PretrainedConfig, kwargs: dict):
    """Normalise ``config`` and loader ``kwargs`` before building an eval model.

    - Fills ``config.vision_tower_cfg`` from the deprecated
      ``config.mm_vision_tower`` when the former is unset.
    - Moves ``torch_dtype`` out of ``kwargs`` into ``config.model_dtype`` (as a
      string), defaulting to ``torch.float16`` when the caller did not supply
      one (for example on the quantized loading paths).
    - Forces ``kwargs["device_map"]`` to ``"cuda"`` for SigLIP vision towers.

    Args:
        config: Model config; mutated in place.
        kwargs: Loader keyword arguments; mutated in place.

    Raises:
        ValueError: If neither vision-tower setting is present on ``config``.
    """
    # compatible with deprecated config convention
    if getattr(config, "vision_tower_cfg", None) is None:
        try:
            config.vision_tower_cfg = config.mm_vision_tower
        except AttributeError as err:
            raise ValueError(
                f"Invalid configuration! Cannot find vision_tower in config:\n{config}"
            ) from err

    config.model_dtype = str(kwargs.pop("torch_dtype", torch.float16))

    # siglip does not support device_map = "auto"
    vision_tower_name = parse_model_name_or_path(config, "vision_tower")
    if "siglip" in vision_tower_name.lower():
        kwargs["device_map"] = "cuda"
class LlavaLlamaConfig(LlavaConfig):
    """``LlavaConfig`` subclass whose ``model_type`` is ``"llava_llama"``."""
    model_type = "llava_llama"
+# class LlavaLlamaModel(PreTrainedModel):
+#     config_class = LlavaLlamaConfig
+#     main_input_name = "input_embeds"
+#     supports_gradient_checkpointing = True
+#     @classmethod
+#     def from_pretrained(
+#         cls,
+#         pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+#         *model_args,
+#         config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+#         cache_dir: Optional[Union[str, os.PathLike]] = None,
+#         ignore_mismatched_sizes: bool = False,
+#         force_download: bool = False,
+#         local_files_only: bool = False,
+#         token: Optional[Union[str, bool]] = None,
+#         revision: str = "main",
+#         use_safetensors: bool = None,
+#         **kwargs,
+#     ):
+#         if hasattr(cls, "load_pretrained"):
+#             return cls.load_pretrained(
+#                 pretrained_model_name_or_path,
+#                 *model_args,
+#                 config=config,
+#                 cache_dir=cache_dir,
+#                 ignore_mismatched_sizes=ignore_mismatched_sizes,
+#                 force_download=force_download,
+#                 local_files_only=local_files_only,
+#                 token=token,
+#                 revision=revision,
+#                 use_safetensors=use_safetensors,
+#                 **kwargs,
+#             )
+#         return None
+from abc import ABC, abstractmethod
+from collections import OrderedDict
class LlavaMetaModel(ABC):
    """Mixin that assembles a VLM from an LLM, a vision tower and a projector.

    The three components live on ``self.llm``, ``self.vision_tower`` and
    ``self.mm_projector``; the tokenizer is kept on ``self.tokenizer``.
    """

    @staticmethod
    def _resolve_component_cfgs(config):
        """Default ``config.model_dtype`` if absent and return the component configs.

        Returns:
            ``(llm_cfg, vision_tower_cfg, mm_projector_cfg)``.

        Raises:
            ValueError: If ``get_model_config`` does not yield three entries.
        """
        if not hasattr(config, "model_dtype"):
            warnings.warn(
                "model_dtype not found in config, defaulting to torch.float16."
            )
            config.model_dtype = "torch.float16"
        cfgs = get_model_config(config)
        if len(cfgs) != 3:
            raise ValueError(
                "`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config."
            )
        return cfgs

    def init_vlm(self, config: "PretrainedConfig" = None, *args, **kwargs):
        """Build the LLM, tokenizer, vision tower and projector from ``config``.

        Returns immediately when any of the three components already exists on
        ``self``. Extra arguments are forwarded to ``build_llm_and_tokenizer``.
        """
        # TODO(ligeng): figure out how from_config and from_pretrained works in HF implementation.
        if (
            hasattr(self, "llm")
            or hasattr(self, "vision_tower")
            or hasattr(self, "mm_projector")
        ):
            # already initialized, skipped
            return

        llm_cfg, vision_tower_cfg, mm_projector_cfg = self._resolve_component_cfgs(
            config
        )
        self.llm, self.tokenizer = build_llm_and_tokenizer(
            llm_cfg, config, *args, **kwargs
        )
        self.vision_tower = build_vision_tower(vision_tower_cfg, config)
        self.mm_projector = build_mm_projector(mm_projector_cfg, config)
        self.post_config()
        self.is_loaded = True
        assert (
            self.llm is not None
            or self.vision_tower is not None
            or self.mm_projector is not None
        ), "At least one of the components must be instantiated."

    @classmethod
    def load_from_config(cls, model_path_or_config, *args, **kwargs):
        """Placeholder; currently does nothing."""
        pass

    ## FIXME we will use this function to load model in the future
    @classmethod
    def load_pretrained(cls, model_path_or_config, *args, **kwargs):
        """Instantiate the model from a path or a ``LlavaConfig``.

        Any ``config`` keyword argument is discarded. If the constructor has
        already populated and loaded the components, that instance is returned
        unchanged; otherwise the components are built here.

        Raises:
            NotImplementedError: If ``model_path_or_config`` is neither a str
                nor a ``LlavaConfig``.
            ValueError: If the component configs cannot be resolved.
        """
        kwargs.pop("config", None)
        if isinstance(model_path_or_config, str):
            config = AutoConfig.from_pretrained(model_path_or_config)
        elif isinstance(model_path_or_config, LlavaConfig):
            config = model_path_or_config
        else:
            raise NotImplementedError(
                f"wrong type, {type(model_path_or_config)} \
                                      {isinstance(model_path_or_config, LlavaConfig)}"
            )

        llm_cfg, vision_tower_cfg, mm_projector_cfg = cls._resolve_component_cfgs(
            config
        )
        vlm = cls(config, *args, **kwargs)
        already_built = (
            hasattr(vlm, "llm")
            or hasattr(vlm, "vision_tower")
            or hasattr(vlm, "mm_projector")
        )
        if already_built and getattr(vlm, "is_loaded", False):
            return vlm

        vlm.llm, vlm.tokenizer = build_llm_and_tokenizer(
            llm_cfg, config, *args, **kwargs
        )
        vlm.vision_tower = build_vision_tower(vision_tower_cfg, config)
        vlm.mm_projector = build_mm_projector(mm_projector_cfg, config)
        # These must act on the instance: ``post_config`` is an instance
        # method, and the loaded flag belongs to this model only.
        vlm.post_config()
        vlm.is_loaded = True
        # FIXME(ligeng, yunhao): llm should never be none here.
        assert (
            vlm.llm is not None
            or vlm.vision_tower is not None
            or vlm.mm_projector is not None
        ), "At least one of the components must be instantiated."
        return vlm

    ## FIXME we will use this function to save the model in the future
    def save_pretrained(self, output_dir, state_dict=None):
        """Save each component into its own subdirectory of ``output_dir``.

        The tokenizer and LLM go to ``llm/``, the vision encoder and image
        processor to ``vision_tower/``, the projector to ``mm_projector/``;
        the top-level config is written to ``output_dir`` itself.

        Args:
            output_dir: Destination directory.
            state_dict: Full model state dict; defaults to ``self.state_dict()``.
        """
        if state_dict is None:
            # other wise fetch from deepspeed
            state_dict = self.state_dict()

        if getattr(self, "tokenizer", None):
            self.tokenizer.save_pretrained(osp.join(output_dir, "llm"))

        if self.get_llm():
            llm_dir = osp.join(output_dir, "llm")
            print(f"saving llm to {llm_dir}")
            self.llm.config._name_or_path = llm_dir
            # Keep only what follows the last "llm." in each matching key.
            llm_state_dict = OrderedDict(
                {k.split("llm.")[-1]: v for k, v in state_dict.items() if "llm" in k}
            )
            self.llm.save_pretrained(llm_dir, state_dict=llm_state_dict)
            self.config.llm_cfg = self.llm.config

        if self.get_vision_tower():
            vision_dir = osp.join(output_dir, "vision_tower")
            print(f"saving vision_tower to {vision_dir}")
            self.vision_tower.config._name_or_path = vision_dir
            vision_tower_state_dict = OrderedDict(
                {
                    k.split("vision_tower.vision_tower.")[-1]: v
                    for k, v in state_dict.items()
                    if "vision_tower" in k
                }
            )
            self.vision_tower.vision_tower.save_pretrained(
                vision_dir,
                state_dict=vision_tower_state_dict,
            )
            self.vision_tower.image_processor.save_pretrained(vision_dir)
            self.config.vision_tower_cfg = self.vision_tower.config
            if hasattr(self.config.vision_tower_cfg, "auto_map"):
                if "radio" not in self.get_vision_tower().__class__.__name__.lower():
                    delattr(self.config.vision_tower_cfg, "auto_map")

        if self.get_mm_projector():
            projector_dir = osp.join(output_dir, "mm_projector")
            print(f"saving mm_projector to {projector_dir}")
            self.mm_projector.config._name_or_path = projector_dir
            mm_projector_state_dict = OrderedDict(
                {
                    k.split("mm_projector.")[-1]: v
                    for k, v in state_dict.items()
                    if "mm_projector" in k
                }
            )
            self.mm_projector.save_pretrained(
                projector_dir,
                state_dict=mm_projector_state_dict,
            )
            self.config.mm_projector_cfg = self.mm_projector.config

        ## update and save top-level config
        self.config._name_or_path = output_dir
        self.config.architectures = [self.__class__.__name__]
        self.config.save_pretrained(output_dir)

    def get_llm(self):
        """Return the LLM (first element if stored as a list), or ``None``."""
        llm = getattr(self, "llm", None)
        if isinstance(llm, list):
            llm = llm[0]
        return llm

    def get_lm_head(self):
        """Return the LLM's ``lm_head`` attribute, or ``None`` if unavailable."""
        return getattr(self.get_llm(), "lm_head", None)

    def get_vision_tower(self):
        """Return the vision tower (first element if a list), or ``None``."""
        vision_tower = getattr(self, "vision_tower", None)
        if isinstance(vision_tower, list):
            vision_tower = vision_tower[0]
        return vision_tower

    def get_mm_projector(self):
        """Return the projector (first element if a list), or ``None``."""
        mm_projector = getattr(self, "mm_projector", None)
        if isinstance(mm_projector, list):
            mm_projector = mm_projector[0]
        return mm_projector

    def post_config(self):
        """Mirror the LLM's training flag and fill missing component configs."""
        self.training = self.get_llm().training
        ## configuration
        if getattr(self.config, "llm_cfg", None) is None:
            self.config.llm_cfg = self.llm.config
        if getattr(self.config, "vision_tower_cfg", None) is None:
            self.config.vision_tower_cfg = self.vision_tower.config
        if getattr(self.config, "mm_projector_cfg", None) is None:
            self.config.mm_projector_cfg = self.mm_projector.config

    def freezed_module_patch(self):
        """
        Huggingface will call model.train() at each training_step. To ensure the expected behaviors for modules like dropout, batchnorm, etc., we need to call model.eval() for the freezed modules.
        """
        if not self.training:
            return
        # The LLM is deliberately left in training mode even when
        # ``tune_language_model`` is off; only the two modules below are
        # switched to eval when they are not being tuned.
        vision_tower = self.get_vision_tower()
        if vision_tower and not getattr(self.config, "tune_vision_tower", False):
            vision_tower.eval()
        mm_projector = self.get_mm_projector()
        if mm_projector and not getattr(self.config, "tune_mm_projector", False):
            mm_projector.eval()

    def encode_images(self, images):
        """Run ``images`` through the vision tower, then the projector."""
        image_features = self.get_vision_tower()(images)
        return self.get_mm_projector()(image_features)

    ## @yunhao: is there a better way to handle function call and attributes for llm?
    ## support beam search
    def _temporary_reorder_cache(self, past_key_values, sorted_idx):
        """Delegate to the LLM's ``_temporary_reorder_cache``."""
        return self.get_llm()._temporary_reorder_cache(past_key_values, sorted_idx)

    def get_input_embeddings(self):
        """Delegate to the LLM's ``get_input_embeddings``."""
        return self.get_llm().get_input_embeddings()

    def get_output_embeddings(self):
        """Delegate to the LLM's ``get_output_embeddings``."""
        return self.get_llm().get_output_embeddings()

    def resize_token_embeddings(self, embed_size):
        """Resize the LLM's token embeddings to ``embed_size``; returns ``None``."""
        self.get_llm().resize_token_embeddings(embed_size)
+# ## FIXME we will follow the convention to add a new class for CausalLM in the future
+class LlavaLlamaModel(LlavaMetaModel, PreTrainedModel):
+    config_class = LlavaLlamaConfig
+    main_input_name = "input_embeds"
+    supports_gradient_checkpointing = True
+    def __init__(self, config: LlavaLlamaConfig = None, *args, **kwargs) -> None:
+        super().__init__(config)
+        return self.init_vlm(config=config, *args, **kwargs)
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        ignore_mismatched_sizes: bool = False,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        use_safetensors: bool = None,
+        **kwargs,
+    ):
+        if hasattr(cls, "load_pretrained"):
+            return cls.load_pretrained(
+                pretrained_model_name_or_path,
+                *model_args,
+                config=config,
+                cache_dir=cache_dir,
+                ignore_mismatched_sizes=ignore_mismatched_sizes,
+                force_download=force_download,
+                local_files_only=local_files_only,
+                token=token,
+                revision=revision,
+                use_safetensors=use_safetensors,
+                **kwargs,
+            )
+        return super(LlavaLlamaModel).from_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            use_safetensors=use_safetensors,
+            **kwargs,
+        )
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        images: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        seqlens_in_batch: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        dpo_forward: bool = False,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        self.freezed_module_patch()
+        if inputs_embeds is None:
+            (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels,
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids, position_ids, attention_mask, past_key_values, labels, images
+            )
+        support_packing = (
+            "seqlens_in_batch" in inspect.signature(self.llm.forward).parameters
+        )
+        if self.training and support_packing and not dpo_forward:
+            (
+                _,
+                new_position_ids,
+                new_attention_mask,
+                _,
+                new_inputs_embeds,
+                new_labels,
+                sorted_seqlens_in_batch,
+            ) = self.repack_multimodal_data(
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels,
+            )
+            if sorted_seqlens_in_batch is None:
+                sorted_seqlens_in_batch = seqlens_in_batch
+            new_input_ids = None
+            past_key_values = None
+        else:
+            new_attention_mask = attention_mask
+            new_position_ids = position_ids
+            new_inputs_embeds = inputs_embeds
+            new_labels = labels
+            sorted_seqlens_in_batch = attention_mask.sum(-1).int()
+            new_input_ids = input_ids
+        if support_packing:
+            outputs = self.llm.forward(
+                input_ids=new_input_ids,
+                attention_mask=new_attention_mask,
+                position_ids=new_position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=new_inputs_embeds,
+                labels=new_labels,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                seqlens_in_batch=sorted_seqlens_in_batch,
+            )
+        else:
+            outputs = self.llm.forward(
+                input_ids=new_input_ids,
+                attention_mask=new_attention_mask,
+                position_ids=new_position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=new_inputs_embeds,
+                labels=new_labels,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        if dpo_forward:
+            return outputs.logits, new_labels
+        return outputs
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.FloatTensor] = None,
+        images: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        **generation_kwargs,
+    ):
+        if images is not None:
+            (
+                _,
+                _,
+                attention_mask,
+                _,
+                inputs_embeds,
+                _,
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids, None, attention_mask, None, None, images
+            )
+        else:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        inputs_embeds = inputs_embeds.to(self.dtype)
+        outputs = self.llm.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            **generation_kwargs,
+        )
+        return outputs
+# AutoConfig.register("llava_llama", LlavaLlamaConfig)
+# AutoModel.register(LlavaLlamaConfig, LlavaLlamaModel)

llm/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_name_or_path": "./llm",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 6912,
+  "max_position_embeddings": 4096,
+  "model_max_length": 4096,
+  "model_type": "llama",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 20,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 4096,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.36.2",
+  "use_cache": true,
+  "vocab_size": 32000
+}

llm/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.36.2"
+}

llm/model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4eed552fa9ca41f3d6fb14b59a481bf12137a37e964df0ec60f412b3ac2a8637
+size 4974521464

llm/model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b63acc16bd9be4e7f900ba7e66ddc82400c3c12d77cd5c2cfa4bc77761c0732d
+size 428632856

llm/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,298 @@

+{
+  "metadata": {
+    "total_size": 5403120640
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}

llm/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

llm/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7aedb3582ecda9fa99ee9242c17a9658f6744db083ee6ebdc8fb14857f84d220
+size 499723

llm/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

mm_projector/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_name_or_path": "./checkpoints/vila-siglip-shearedllama2.7b-r129/mm_projector",
+  "architectures": [
+    "MultimodalProjector"
+  ],
+  "mm_projector_type": "mlp_downsample",
+  "model_type": "v2l_projector",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.36.2"
+}

mm_projector/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f912f7e3e83a967ef9f927ac2dd5275d3fe1bb62b2208075e4eff877fcf47ba7
+size 36729360

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

utils.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# This file is modified from https://github.com/haotian-liu/LLaVA/
+import os
+import os.path as osp
+from huggingface_hub import repo_exists, snapshot_download
+from huggingface_hub.utils import HFValidationError, validate_repo_id
+from transformers import AutoConfig, PretrainedConfig
def get_model_config(config):
    """Resolve where each model component (LLM, vision tower, projector) lives.

    The root location is ``config._name_or_path`` when it holds at least two
    characters, otherwise ``config.resume_path`` (``None`` if absent). When the
    root is not an existing local path and ``repo_exists`` reports it as a
    Hugging Face Hub repository, it is replaced by the local directory returned
    from ``snapshot_download``.

    For each of ``llm_cfg``, ``vision_tower_cfg`` and ``mm_projector_cfg``:

    * a ``str`` value is returned unchanged;
    * a ``dict`` or ``PretrainedConfig`` value maps to
      ``<root>/<key without the "_cfg" suffix>``;
    * a missing entry (or any other type) is skipped.

    Args:
        config: Object exposing the attributes described above.

    Returns:
        list: The resolved entries, in the order llm, vision tower, projector.
        The list is shorter than three when some entries are skipped.

    Raises:
        ValueError: If an entry needs the root path but no usable root path
            could be determined.
    """
    default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]

    name_or_path = getattr(config, "_name_or_path", None)
    if name_or_path is not None and len(name_or_path) >= 2:
        root_path = name_or_path
    else:
        # The root is only needed for dict / PretrainedConfig entries, so a
        # missing ``resume_path`` must not fail configs made of plain strings.
        root_path = getattr(config, "resume_path", None)

    # download from huggingface
    if root_path is not None and not osp.exists(root_path):
        try:
            valid_hf_repo = repo_exists(root_path)
        except HFValidationError:
            # Not a syntactically valid repo id: keep root_path as it is.
            valid_hf_repo = False
        if valid_hf_repo:
            root_path = snapshot_download(root_path)

    return_list = []
    for key in default_keys:
        cfg = getattr(config, key, None)
        if cfg is None:
            continue
        if isinstance(cfg, str):
            return_list.append(cfg)
        elif isinstance(cfg, dict) or isinstance(cfg, PretrainedConfig):
            try:
                # key[:-4] drops the trailing "_cfg" to get the sub-directory.
                return_list.append(os.path.join(root_path, key[:-4]))
            except TypeError as err:
                # os.path.join rejects a None / non-path root.
                raise ValueError(
                    f"Cannot find resume path in config for {key}!"
                ) from err
    return return_list
def is_mm_model(model_path):
    """
    Check if the model at the given path is a visual language model.

    The check loads the configuration with ``AutoConfig.from_pretrained`` and
    looks for the substring "llava" (case-insensitive) in any of the
    architecture names listed in it.

    Args:
        model_path (str): The path to the model.
    Returns:
        bool: True if the model is an MM model, False otherwise. A config
        that lists no architectures yields False.
    """
    config = AutoConfig.from_pretrained(model_path)
    # ``architectures`` is optional on a config and may be None; treat that
    # the same as an empty list instead of failing while iterating.
    architectures = getattr(config, "architectures", None) or []
    return any("llava" in architecture.lower() for architecture in architectures)
def auto_upgrade(config):
    """Interactively upgrade an old checkpoint config to the ``llava`` model type.

    Nothing happens unless ``config`` contains "llava" while the loaded
    configuration's ``model_type`` does not. In that case the user is asked to
    confirm on stdin; on "y"/"yes" the configuration is rewritten in place
    (``model_type`` set to "llava", the single architecture set to
    "LlavaLlamaForCausalLM") and saved back to ``config``.

    Args:
        config (str): Path or identifier passed to ``AutoConfig.from_pretrained``
            and, after an upgrade, to ``save_pretrained``.

    Raises:
        SystemExit: With status 1 when the user declines the upgrade.
        AssertionError: If the loaded ``model_type`` is not "llama", or the
            configuration does not list exactly one architecture.
    """
    cfg = AutoConfig.from_pretrained(config)
    if "llava" not in config or "llava" in cfg.model_type:
        return

    assert cfg.model_type == "llama"
    print(
        "You are using newer LLaVA code base, while the checkpoint of v0 is from older code base."
    )
    print(
        "You must upgrade the checkpoint to the new code base (this can be done automatically)."
    )
    confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
    # Strip so that an answer such as "y " is not read as a refusal.
    if confirm.strip().lower() not in ("y", "yes"):
        print("Checkpoint upgrade aborted.")
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)

    print("Upgrading checkpoint...")
    assert len(cfg.architectures) == 1
    # model_type is set on the class, not the instance, so this affects every
    # instance of this config class in the process.
    setattr(cfg.__class__, "model_type", "llava")
    cfg.architectures[0] = "LlavaLlamaForCausalLM"
    cfg.save_pretrained(config)
    print("Checkpoint upgraded.")

vision_tower/config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_name_or_path": "./vision_tower",
+  "architectures": [
+    "SiglipVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 1152,
+  "image_size": 384,
+  "intermediate_size": 4304,
+  "layer_norm_eps": 1e-06,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 27,
+  "patch_size": 14,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.36.2"
+}

vision_tower/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e3764125ad000414d381fe7eb6b222be9f0f2b4c14a55b22bf68cb29647d526
+size 856506120

vision_tower/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SiglipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 384,
+    "width": 384
+  }
+}