Spaces:

farrosalferro24
/

gecko

Build error

App Files Files Community

farrosalferro24 committed on Aug 1, 2024

Commit

09773e9

verified ·

1 Parent(s): 1de2fb0

Initial commit

Browse files

Files changed (23) hide show

.gitattributes +4 -0
chat_gecko.py +315 -0
examples/booth_yellowvan.jpg +3 -0
examples/booth_yellowvan.txt +5 -0
examples/bucket_cyclist.jpg +3 -0
examples/bucket_cyclist.txt +5 -0
examples/bus_luggage.jpg +3 -0
examples/bus_luggage.txt +5 -0
examples/little_girl.jpg +3 -0
examples/little_girl.txt +7 -0
model/__init__.py +4 -0
model/__pycache__/__init__.cpython-310.pyc +0 -0
model/__pycache__/configuration_gecko.cpython-310.pyc +0 -0
model/__pycache__/conversation.cpython-310.pyc +0 -0
model/__pycache__/modelling_gecko.cpython-310.pyc +0 -0
model/__pycache__/processing_gecko.cpython-310.pyc +0 -0
model/__pycache__/utils.cpython-310.pyc +0 -0
model/configuration_gecko.py +105 -0
model/conversation.py +527 -0
model/modelling_gecko.py +760 -0
model/multimodal_encoder.py +172 -0
model/processing_gecko.py +487 -0
model/utils.py +199 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/booth_yellowvan.jpg filter=lfs diff=lfs merge=lfs -text
+examples/bucket_cyclist.jpg filter=lfs diff=lfs merge=lfs -text
+examples/bus_luggage.jpg filter=lfs diff=lfs merge=lfs -text
+examples/little_girl.jpg filter=lfs diff=lfs merge=lfs -text

chat_gecko.py ADDED Viewed

	@@ -0,0 +1,315 @@

+import gradio as gr
+import spaces
+import os
+import time
+from PIL import Image
+import functools
+import torch
+import matplotlib.pyplot as plt
+import re
+import ast
+from model import GeckoForConditionalGeneration, GeckoConfig, GeckoProcessor, chat_gecko, chat_gecko_stream
+from model.conversation import conv_templates
+from typing import List
+from io import StringIO
+import sys
+class Capturing(list):
+    def __enter__(self):
+        self._stdout = sys.stdout
+        sys.stdout = self._stringio = StringIO()
+        return self
+    def __exit__(self, *args):
+        self.extend(self._stringio.getvalue().splitlines())
+        del self._stringio    # free up some memory
+        sys.stdout = self._stdout
+# initialization
+# Inference options for the demo; most of them are forwarded to GeckoConfig / GeckoProcessor below.
+topk = 1
+keyword_criteria = 'word'
+positional_information = 'explicit'
+vision_feature_select_strategy = 'cls'
+patch_picking_strategy = 'last_layer'
+cropping_method = 'naive'
+crop_size = 384
+visualize_topk_patches = False
+print_keyword=True
+# NOTE(review): print_topk_patches is defined here but is not passed to GeckoConfig.from_pretrained below — confirm whether that is intended.
+print_topk_patches = True
+torch_dtype = torch.float16
+attn_implementation = 'flash_attention_2'
+device_map = 'cuda'
+conv_template = conv_templates['llama_3']
+# `model` first holds the checkpoint id string; it is rebound to the loaded model object further down.
+model = 'TIGER-Lab/Mantis-8B-siglip-llama3'
+# Load the checkpoint's config, overriding the Gecko-specific options with the values chosen above.
+config = GeckoConfig.from_pretrained(model,
+                                    topk=topk,
+                                    visualize_topk_patches=visualize_topk_patches,
+                                    keyword_criteria=keyword_criteria,
+                                    positional_information=positional_information,
+                                    vision_feature_select_strategy=vision_feature_select_strategy,
+                                    patch_picking_strategy=patch_picking_strategy,
+                                    print_keyword=print_keyword)
+processor = GeckoProcessor.from_pretrained(model, config=config, use_keyword=True, cropping_method=cropping_method, crop_size=crop_size)
+# From here on `model` is the GeckoForConditionalGeneration instance, not the checkpoint id.
+model = GeckoForConditionalGeneration.from_pretrained(
+    model, config=config, torch_dtype=torch_dtype,
+    attn_implementation=attn_implementation, device_map=device_map)
+model.load_text_encoder(processor)
+@spaces.GPU
+def generate_stream(text:str, images:List[Image.Image], history: List[dict], **kwargs):
+    global processor, model
+    model = model.to("cuda")
+    if not images:
+        images = None
+    # print(history)
+    print(f'length of images: {len(images)}')
+    generator, print_kw, inputs = chat_gecko_stream(text, images, model, processor, history=history, **kwargs)
+    texts = []
+    # for text, history in chat_gecko_stream(text, images, model, processor, history=history, **kwargs):
+    #     yield text
+    for text, history in generator:
+        texts.append(text)
+    # return text
+    return texts, print_kw, inputs
+@spaces.GPU
+def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
+    global processor, model
+    model = model.to("cuda")
+    if not images:
+        images = None
+    generated_text, history = chat_gecko(text, images, model, processor, history=history, **kwargs)
+    return generated_text
+def enable_next_image(uploaded_images, image):
+    uploaded_images.append(image)
+    return uploaded_images, gr.MultimodalTextbox(value=None, interactive=False)
+def add_message(history, message):
+    if message["files"]:
+        for file in message["files"]:
+            history.append([(file,), None])
+    if message["text"]:
+        history.append([message["text"], None])
+    return history, gr.MultimodalTextbox(value=None)
+def print_like_dislike(x: gr.LikeData):
+    print(x.index, x.value, x.liked)
+def get_chat_history(history):
+    chat_history = []
+    user_role = conv_template.roles[0]
+    assistant_role = conv_template.roles[1]
+    for i, message in enumerate(history):
+        if isinstance(message[0], str):
+            chat_history.append({"role": user_role, "text": message[0]})
+            if i != len(history) - 1:
+                assert message[1], "The bot message is not provided, internal error"
+                chat_history.append({"role": assistant_role, "text": message[1]})
+            else:
+                assert not message[1], "the bot message internal error, get: {}".format(message[1])
+                chat_history.append({"role": assistant_role, "text": ""})
+    return chat_history
+def get_chat_images(history):
+    images = []
+    for message in history:
+        if isinstance(message[0], tuple):
+            images.extend(message[0])
+    return images
+def bot(history, topk=None):
+    """Generate the reply for the pending user turn and stream it into ``history``.
+    This is a generator: for every text collected from ``generate_stream`` it stores the
+    text as the reply of the last history entry and yields
+    ``(history, print_kw, patches_fig, topk_patches_fig)``.
+    Args:
+        history: Chatbot history as a list of ``[user, bot]`` entries; mutated in place.
+        topk: Value passed to generation under the ``"topk"`` key.
+    Raises:
+        gr.Error: If the pending turn contains no text.
+    """
+    print(history)
+    cur_messages = {"text": "", "images": []}
+    # Walk backwards over the entries that have no bot reply yet, gathering their text and images.
+    for message in history[::-1]:
+        if message[1]:
+            break
+        if isinstance(message[0], str):
+            cur_messages["text"] = message[0] + " " + cur_messages["text"]
+        elif isinstance(message[0], tuple):
+            cur_messages["images"].extend(message[0])
+    cur_messages["text"] = cur_messages["text"].strip()
+    # Images were gathered newest-first; restore their original order.
+    cur_messages["images"] = cur_messages["images"][::-1]
+    if not cur_messages["text"]:
+        raise gr.Error("Please enter a message")
+    # Too few placeholders: prepend the missing "<image> " markers and store the result in the last entry.
+    if cur_messages['text'].count("<image>") < len(cur_messages['images']):
+        gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
+        cur_messages['text'] = "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>")) + cur_messages['text']
+        history[-1][0] = cur_messages["text"]
+    # Too many placeholders: the text is reversed so that str.replace drops the surplus ones from the end.
+    if cur_messages['text'].count("<image>") > len(cur_messages['images']):
+        gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
+        cur_messages['text'] = cur_messages['text'][::-1].replace("<image>"[::-1], "", cur_messages['text'].count("<image>") - len(cur_messages['images']))[::-1]
+        history[-1][0] = cur_messages["text"]
+    chat_history = get_chat_history(history)
+    chat_images = get_chat_images(history)
+    generation_kwargs = {
+        "max_new_tokens": 4096,
+        "num_beams": 1,
+        "do_sample": False,
+        "topk": topk,
+    }
+    # response is (texts, print_kw, inputs) as returned by generate_stream.
+    response = generate_stream(None, chat_images, chat_history, **generation_kwargs)
+    num_images = len(response[2].pixel_values)
+    # NOTE(review): assumes response[1] is a sequence of printed lines whose last `num_images`
+    # items are the per-image coordinate lines, preceded by one separator line — confirm
+    # against chat_gecko_stream.
+    coords = response[1][-num_images:]
+    print_kw = '\n'.join(response[1][:-num_images-1])
+    patches_fig = plot_patches(response[2])
+    topk_patches_fig = plot_topk_patches(response[2], coords)
+    # Replay the already collected texts with a short delay between UI updates.
+    for _output in response[0]:
+        history[-1][1] = _output
+        time.sleep(0.05)
+        yield history, print_kw, patches_fig, topk_patches_fig
+def plot_patches(inputs):
+    pixel_value = inputs.pixel_values[0].cpu().numpy()
+    x, y = inputs.coords[0][-1][0] + 1, inputs.coords[0][-1][1] + 1
+    fig, axes = plt.subplots(y, x, figsize=(x * 4, y * 4))
+    for i in range(y):
+        for j in range(x):
+            axes[i, j].imshow(pixel_value[1 + i * x + j].transpose(1, 2, 0))
+            axes[i, j].axis('off')
+    return fig
+def plot_topk_patches(inputs, selected_coords):
+    selected_coords_list = []
+    for selected_coord in selected_coords:
+        match = re.search(r"\[\[.*\]\]", selected_coord)
+        if match:
+            coordinates_str = match.group(0)
+            # Convert the string representation of the list to an actual list
+            coordinates = ast.literal_eval(coordinates_str)
+            selected_coords_list.append(coordinates)
+    num_images = len(selected_coords_list)
+    fig, axes = plt.subplots(num_images, len(selected_coords_list[0])+1, figsize=((len(selected_coords_list[0])+1) * 10, num_images * 10))
+    if num_images == 1:
+        xmax = inputs.coords[0][-1][0] + 1
+        for j in range(len(selected_coords_list[0])+1):
+            if j == 0:
+                axes[j].imshow(inputs.pixel_values[0][0].cpu().numpy().transpose(1, 2, 0))
+                axes[j].axis('off')
+                continue
+            x, y = selected_coords_list[0][j-1][0], selected_coords_list[0][j-1][1]
+            axes[j].imshow(inputs.pixel_values[0][1 + y * xmax + x].cpu().numpy().transpose(1, 2, 0))
+            axes[j].axis('off')
+    else:
+        for i in range(num_images):
+            xmax = inputs.coords[i][-1][0] + 1
+            for j in range(len(selected_coords_list[0])+1):
+                if j == 0:
+                    axes[i, j].imshow(inputs.pixel_values[i][0].cpu().numpy().transpose(1, 2, 0))
+                    continue
+                x, y = selected_coords_list[i][j-1][0], selected_coords_list[i][j-1][1]
+                axes[i, j].imshow(inputs.pixel_values[i][1 + y * xmax + x].cpu().numpy().transpose(1, 2, 0))
+                axes[i, j].axis('off')
+    return fig
+def build_demo():
+    with gr.Blocks() as demo:
+#         gr.Markdown(""" # Mantis
+# Mantis is a multimodal conversational AI model that can chat with users about images and text. It's optimized for multi-image reasoning, where inverleaved text and images can be used to generate responses.
+# ### [Paper](https://arxiv.org/abs/2405.01483) | [Github](https://github.com/TIGER-AI-Lab/Mantis) | [Models](https://huggingface.co/collections/TIGER-Lab/mantis-6619b0834594c878cdb1d6e4) | [Dataset](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct) | [Website](https://tiger-ai-lab.github.io/Mantis/)
+#         """)
+#         gr.Markdown("""## Chat with Mantis
+#         Mantis supports interleaved text-image input format, where you can simply use the placeholder `<image>` to indicate the position of uploaded images.
+#         The model is optimized for multi-image reasoning, while preserving the ability to chat about text and images in a single conversation.
+#         (The model currently serving is [🤗 TIGER-Lab/Mantis-8B-siglip-llama3](https://huggingface.co/TIGER-Lab/Mantis-8B-siglip-llama3))
+#         """)
+        chatbot = gr.Chatbot(line_breaks=True)
+        chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload images. Please use <image> to indicate the position of uploaded images", show_label=True)
+        chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
+        print_kw = gr.Textbox(label="keywords")
+        depict_patches = gr.Plot(label="image patches", format="png")
+        depict_topk_patches = gr.Plot(label="top-k image patches", format="png")
+        # with gr.Accordion(label='Advanced options', open=False):
+            # temperature = gr.Slider(
+            #     label='Temperature',
+            #     minimum=0.1,
+            #     maximum=2.0,
+            #     step=0.1,
+            #     value=0.2,
+            #     interactive=True
+            # )
+            # top_p = gr.Slider(
+            #     label='Top-p',
+            #     minimum=0.05,
+            #     maximum=1.0,
+            #     step=0.05,
+            #     value=1.0,
+            #     interactive=True
+            # )
+        topk = gr.Slider(
+            label='Top-k',
+            minimum=1,
+            maximum=10,
+            step=1,
+            value=1,
+            interactive=True)
+        bot_msg = chat_msg.success(bot, chatbot,
+                                         chatbot, api_name="bot_response")
+        chatbot.like(print_like_dislike, None, None)
+        with gr.Row():
+            send_button = gr.Button("Send")
+            clear_button = gr.ClearButton([chatbot, chat_input])
+        send_button.click(
+            add_message, [chatbot, chat_input], [chatbot, chat_input]
+        ).then(
+            bot,
+            [chatbot, topk],
+            [chatbot, print_kw, depict_patches, depict_topk_patches], api_name="bot_response"
+        )
+        gr.Examples(
+            examples=[
+                {
+                    "text": open("gradio/examples/little_girl.txt").read(),
+                    "files": ["gradio/examples/little_girl.jpg"]
+                },
+                {
+                    "text": open("gradio/examples/bus_luggage.txt").read(),
+                    "files": ["gradio/examples/bus_luggage.jpg"]
+                },
+            ],
+            inputs=[chat_input],
+        )
+#         gr.Markdown("""
+# ## Citation
+# ```
+# @article{jiang2024mantis,
+#   title={MANTIS: Interleaved Multi-Image Instruction Tuning},
+#   author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
+#   journal={arXiv preprint arXiv:2405.01483},
+#   year={2024}
+# }
+# ```""")
+    return demo
+# Script entry point: build the UI and launch it with share=False.
+if __name__ == "__main__":
+    demo = build_demo()
+    demo.launch(share=False)

examples/booth_yellowvan.jpg ADDED Viewed

Git LFS Details

SHA256: 26560e898003ef04f807bd997744a1b15b2cfd1235b15069206808d6ce38932f
Pointer size: 132 Bytes
Size of remote file: 1.65 MB

examples/booth_yellowvan.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+<image>
+Is the telephone booth on the left or right side of the yellow van?
+(A) right
+(B) left
+Answer with the option's letter from the given choices directly.

examples/bucket_cyclist.jpg ADDED Viewed

Git LFS Details

SHA256: 4f170930283c75eafb65efa7cf3b6531417d59a2dbfe69d82d2b8cbf96f0a508
Pointer size: 132 Bytes
Size of remote file: 1.7 MB

examples/bucket_cyclist.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+<image>
+Is the bucket on the left or right side of the cyclist?
+(A) right
+(B) left
+Answer with the option's letter from the given choices directly.

examples/bus_luggage.jpg ADDED Viewed

Git LFS Details

SHA256: 6f56c068c6b9b03743d7650406c6560dcdc81c26349b47b3bed65d2902cdd842
Pointer size: 132 Bytes
Size of remote file: 1.12 MB

examples/bus_luggage.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+<image>
+Is the blue luggage on the left or right side of the bus?
+(A) right
+(B) left
+Answer with the option's letter from the given choices directly.

examples/little_girl.jpg ADDED Viewed

Git LFS Details

SHA256: 847ba9aa5edd9f28c2d40d4a39c7a74bc9a4cb5918ec7627f1b70f46d5fe954a
Pointer size: 132 Bytes
Size of remote file: 1.57 MB

examples/little_girl.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+<image>
+What is the color of the little girl's shirt?
+(A) yellow
+(B) pink
+(C) white
+(D) black
+Answer with the option's letter from the given choices directly.

model/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .modelling_gecko import GeckoForConditionalGeneration
+from .processing_gecko import GeckoProcessor
+from .configuration_gecko import GeckoConfig
+from .utils import chat_gecko, chat_gecko_stream

model/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (409 Bytes). View file

model/__pycache__/configuration_gecko.cpython-310.pyc ADDED Viewed

Binary file (3.8 kB). View file

model/__pycache__/conversation.cpython-310.pyc ADDED Viewed

Binary file (14.6 kB). View file

model/__pycache__/modelling_gecko.cpython-310.pyc ADDED Viewed

Binary file (24.3 kB). View file

model/__pycache__/processing_gecko.cpython-310.pyc ADDED Viewed

Binary file (16.9 kB). View file

model/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (6.13 kB). View file

model/configuration_gecko.py ADDED Viewed

	@@ -0,0 +1,105 @@

+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers.models.auto import CONFIG_MAPPING
+logger = logging.get_logger(__name__)
+class GeckoConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to instantiate an
+    Llava model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Llava-9B.
+    e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vision_config (`LlavaVisionConfig`,  *optional*):
+            Custom vision config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the CLIP backbone.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Llava model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~LlavaForConditionalGeneration`]
+    """
+    model_type = "gecko"
+    is_composition = False
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="cls",
+        patch_picking_strategy="across_layers",
+        vision_feature_layer=-2,
+        vocab_size=32000,
+        topk=4,
+        keyword_criteria="template",
+        positional_information="explicit",
+        visualize_patches=False,
+        visualize_topk_patches=False,
+        print_keyword=False,
+        print_topk_patches=False,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.vision_feature_layer = vision_feature_layer
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.patch_picking_strategy = patch_picking_strategy
+        self.vocab_size = vocab_size
+        self.topk = topk
+        self.vision_config = vision_config
+        self.text_config = text_config
+        self.keyword_criteria = keyword_criteria
+        self.positional_information = positional_information
+        self.visualize_patches = visualize_patches
+        self.visualize_topk_patches = visualize_topk_patches
+        self.print_keyword = print_keyword
+        self.print_topk_patches = print_topk_patches
+        if isinstance(self.vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
+            )
+            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            self.vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+        self.vocab_size = self.vocab_size
+        self.text_config = text_config
+        if isinstance(self.text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+            self.vocab_size = self.text_config.vocab_size
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["llama"]()
+        super().__init__(**kwargs)

model/conversation.py ADDED Viewed

	@@ -0,0 +1,527 @@

+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    SINGLE = auto()
+    TWO = auto()
+    MPT = auto()
+    PLAIN = auto()
+    LLAMA_2 = auto()
+    LLAMA_3 = auto()
+    MFuyu = auto()
+    PHI_3 = auto()
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+    skip_next: bool = False
+    def get_prompt(self):
+        """Render the conversation into a single prompt string according to ``self.sep_style``.
+        A tuple-valued message is unpacked into three items and only the first (the text) is
+        used. A message with empty content emits just the role header, leaving the prompt
+        open for generation.
+        Returns:
+            The assembled prompt string.
+        Raises:
+            ValueError: If ``self.sep_style`` is not one of the handled styles.
+        """
+        messages = self.messages
+        # When the first message carries a tuple payload, rewrite it on a copy of the list:
+        # strip "<image>" from its text and re-insert the placeholder in a fixed position.
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0].replace("<image>", "").strip()
+            if 'mmtag' in self.version:
+                # mmtag versions get a separate tagged image turn followed by a canned "Received." reply.
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                messages.insert(1, (self.roles[1], "Received."))
+            else:
+                messages[0] = (init_role, "<image>" + init_msg)
+        # SINGLE: "role: message" turns, each terminated by the same separator.
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+        # TWO: like SINGLE, but the separator alternates between sep and sep2 by turn index.
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+        # MPT: the role string is concatenated directly with the message, without ": ".
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+        # LLAMA_2: even turns are wrapped in [INST] ... [/INST]; the system prompt is folded into the first turn.
+        elif self.sep_style == SeparatorStyle.LLAMA_2:
+            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
+            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+            ret = ""
+            for i, (role, message) in enumerate(messages):
+                if i == 0:
+                    assert message, "first message should not be none"
+                    assert role == self.roles[0], "first message should come from user"
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    if i == 0: message = wrap_sys(self.system) + message
+                    if i % 2 == 0:
+                        message = wrap_inst(message)
+                        ret += self.sep + message
+                    else:
+                        ret += " " + message + " " + self.sep2
+                else:
+                    ret += ""
+            # NOTE(review): str.lstrip removes any leading characters contained in self.sep, not the exact prefix.
+            ret = ret.lstrip(self.sep)
+        # LLAMA_3: every turn is introduced by a <|start_header_id|>role<|end_header_id|> header.
+        elif self.sep_style == SeparatorStyle.LLAMA_3:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n" + message + self.sep
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+        # MFuyu: like TWO, but the system prompt is followed by a newline instead of a separator.
+        elif self.sep_style == SeparatorStyle.MFuyu:
+            seps = [self.sep, self.sep2]
+            ret = self.system + "\n"
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+        # PLAIN: messages only, without role names, joined by alternating separators.
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += message + seps[i % 2]
+                else:
+                    ret += ""
+        # PHI_3: every turn is introduced by a <|role|> line and closed by the separator plus a newline.
+        elif self.sep_style == SeparatorStyle.PHI_3:
+            ret = self.system + self.sep + '\n'
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += f"<|{role}|>\n" + message + self.sep + '\n'
+                else:
+                    ret += f"<|{role}|>\n"
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+        return ret
+    def generate_keyword_prompt(self, messages=None):
+        """Build a few-shot prompt asking which objects a question refers to and how they look.
+        Args:
+            messages: The question to embed in the prompt. When ``None``, the content of the
+                second-to-last entry of ``self.messages`` is used.
+        Returns:
+            A prompt string with system, user and assistant headers written in the
+            ``<|start_header_id|>...<|end_header_id|>`` form, regardless of ``self.sep_style``.
+        """
+        # NOTE(review): the fallback content is interpolated as-is; if that message is a tuple
+        # payload its repr ends up in the prompt — confirm callers only rely on plain text here.
+        messages = messages if messages is not None else self.messages[-2][1]
+        # Two worked examples define the exact answer format the model is asked to reproduce.
+        system_prompt = """Use the text below as an example to generate your answers to the user's query. Give the answer in the same format.
+Example starts:
+```
+User: What is/are the object(s) that being asked in below question? Also give some useful visual features that best describes each object in a photo.
+'What kind of drink can we buy from that vending machine?'
+Assistant: The object being asked is vending machine. Several visual features of the object are:
+'vending machine':
+* typically have a large, upright, rectangular shape.
+* usually have a large glass or transparent plastic front
+* often feature logos, product images, and labels on their exterior
+* Most are metallic and have a dominant color (often bright or neutral)
+```
+Example ends
+Example starts:
+```
+User: What is/are the object(s) that being asked in below question? Also give some useful visual features that best describes each object in a photo.
+'Is the wallet on the left or right side of the keyboard?'
+Assistant: The objects being asked are wallet and keyboard. Several visual features of the objects are:
+'wallet':
+* typically have a compact, flat, rectangular shape.
+* can be made from various materials including leather, synthetic fabric, or even metal for hard cases.
+* generally small enough to fit in a pocket or a small bag.
+* come in a wide range of colors, from classic black or brown to vibrant hues and patterns.
+'keyboard':
+* typically feature a rectangular array of keys in a grid layout.
+* can be made from plastic, metal, or other materials.
+* come in various colors, although black and white are most common.
+* may have a visible USB cable or may be identified as wireless if there is no cable connected.
+```
+Example ends
+Please generate answer in the SAME FORMAT as shown in the above examples. Your response must have an equal number of features for each object in the question.
+Please ensure to cover all significant visual features.
+"""
+        # The user turn repeats the question wording of the examples and quotes the actual question.
+        user_prompt = f"""What is/are the object(s) that being asked in below question? Also give some useful visual features that best describes each object in a photo.
+'{messages}'"""
+        # The prompt ends on the assistant header so that generation continues from there.
+        prompt = f"""<|start_header_id|>system<|end_header_id|>
+{system_prompt}{self.sep}
+<|start_header_id|>user<|end_header_id|>
+{user_prompt}
+<|start_header_id|>assistant<|end_header_id|>"""
+        return prompt
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+    def get_images(self, return_pil=False):
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    import base64
+                    from io import BytesIO
+                    from PIL import Image
+                    msg, image, image_process_mode = msg
+                    if image_process_mode == "Pad":
+                        def expand2square(pil_img, background_color=(122, 116, 104)):
+                            width, height = pil_img.size
+                            if width == height:
+                                return pil_img
+                            elif width > height:
+                                result = Image.new(pil_img.mode, (width, width), background_color)
+                                result.paste(pil_img, (0, (width - height) // 2))
+                                return result
+                            else:
+                                result = Image.new(pil_img.mode, (height, height), background_color)
+                                result.paste(pil_img, ((height - width) // 2, 0))
+                                return result
+                        image = expand2square(image)
+                    elif image_process_mode in ["Default", "Crop"]:
+                        pass
+                    elif image_process_mode == "Resize":
+                        image = image.resize((336, 336))
+                    else:
+                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+                    max_hw, min_hw = max(image.size), min(image.size)
+                    aspect_ratio = max_hw / min_hw
+                    max_len, min_len = 800, 400
+                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+                    longest_edge = int(shortest_edge * aspect_ratio)
+                    W, H = image.size
+                    if longest_edge != max(image.size):
+                        if H > W:
+                            H, W = longest_edge, shortest_edge
+                        else:
+                            H, W = shortest_edge, longest_edge
+                        image = image.resize((W, H))
+                    if return_pil:
+                        images.append(image)
+                    else:
+                        buffered = BytesIO()
+                        image.save(buffered, format="PNG")
+                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                        images.append(img_b64_str)
+        return images
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    import base64
+                    from io import BytesIO
+                    msg, image, image_process_mode = msg
+                    max_hw, min_hw = max(image.size), min(image.size)
+                    aspect_ratio = max_hw / min_hw
+                    max_len, min_len = 800, 400
+                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+                    longest_edge = int(shortest_edge * aspect_ratio)
+                    W, H = image.size
+                    if H > W:
+                        H, W = longest_edge, shortest_edge
+                    else:
+                        H, W = shortest_edge, longest_edge
+                    image = image.resize((W, H))
+                    buffered = BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
+                    msg = img_str + msg.replace('<image>', '').strip()
+                    ret.append([msg, None])
+                else:
+                    ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+    def dict(self):
+        if len(self.get_images()) > 0:
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+# Template with "Human"/"Assistant" roles, a single "###" separator and one seeded
+# example exchange. offset=2 makes to_gradio_chatbot() skip the two seeded messages.
+conv_vicuna_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+        ("Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# "USER"/"ASSISTANT" template using two separators: " " and "</s>".
+conv_vicuna_v1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Text-only template using the LLAMA_2 separator style with "<s>" / "</s>".
+conv_llama_2 = Conversation(
+    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# Same separators as conv_llama_2, with a system prompt that mentions visual content.
+conv_llava_llama_2 = Conversation(
+    system="You are a helpful language and vision assistant. "
+           "You are able to understand the visual content that the user provides, "
+           "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# Template whose role strings and separator are "<|im_start|>" / "<|im_end|>" markers.
+conv_mpt = Conversation(
+    system="""<|im_start|>system
+A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# Template with no system prompt and empty role names; turns are separated by a newline.
+conv_llava_plain = Conversation(
+    system="",
+    roles=("", ""),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+# Same system prompt, roles and separator as conv_vicuna_v0, but without the seeded example.
+conv_llava_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# "Human"/"Assistant" template whose system prompt announces the <Image>...</Image> format.
+# NOTE(review): the second and third system-prompt literals are joined without a space
+# ("...natural language.The visual content..."). Left unchanged because the prompt text is
+# runtime behaviour -- confirm against the prompt used in training before editing.
+conv_llava_v0_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+    version="v0_mmtag",
+)
+# "USER"/"ASSISTANT" template with the " " / "</s>" separator pair.
+conv_llava_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# "USER"/"ASSISTANT" variant of the <Image>...</Image> prompt with the " " / "</s>" pair.
+# NOTE(review): same missing space between the second and third literals as in
+# conv_llava_v0_mmtag; left unchanged for the same reason.
+conv_llava_v1_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+    version="v1_mmtag",
+)
+# Template using the MFuyu separator style; this is the one bound to `default_conversation`.
+conv_mfuyu_v1 = Conversation(
+    system="You are a helpful language and vision assistant. "
+           "You are able to understand the visual content that the user provides, "
+           "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MFuyu,
+    sep="<0x04>", # begin of answer token
+    sep2="|ENDOFTEXT|",
+) # copied from conv_vicuna_v1
+# Multi-image variant of the <Image>...</Image> prompt; single "</s>" separator.
+# NOTE(review): the second and third system-prompt literals are joined without a space
+# ("...natural language.Each visual content..."). Left unchanged because the prompt text
+# is runtime behaviour.
+conv_mllava_v1_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the multiple visual contents that the user provides, and assist the user with a variety of tasks using natural language."
+           "Each visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="</s>",
+    version="v1_mmtag",
+)
+# "USER"/"ASSISTANT" template with a single "</s>" separator.
+conv_mllava_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="</s>",
+)
+# Template whose system prompt starts with the "<|start_header_id|>system<|end_header_id|>"
+# header and whose turns end with "<|eot_id|>".
+conv_llama_3 = Conversation(
+    system="<|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("user", "assistant"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_3,
+    sep="<|eot_id|>",
+)
+# Template using "<|system|>" / "<|user|>" / "<|assistant|>" markers and the "<|end|>" separator.
+conv_phi_3 = Conversation(
+    system='<s><|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions.',
+    roles=('<|user|>', '<|assistant|>'),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.PHI_3,
+    sep='<|end|>'
+)
+# Module-level default template.
+# NOTE(review): this is conv_mfuyu_v1, whereas conv_templates["default"] below maps to
+# conv_vicuna_v0, and conv_mfuyu_v1 itself is not registered in conv_templates -- confirm
+# that this asymmetry is intended.
+default_conversation = conv_mfuyu_v1
+# Registry mapping template names to the Conversation objects defined above.
+# Several names are aliases for the same object (e.g. "default"/"v0", "v1"/"vicuna_v1").
+conv_templates = {
+    "default": conv_vicuna_v0,
+    "v0": conv_vicuna_v0,
+    "v1": conv_vicuna_v1,
+    "vicuna_v1": conv_vicuna_v1,
+    "llama_2": conv_llama_2,
+    "plain": conv_llava_plain,
+    "v0_plain": conv_llava_plain,
+    "llava_v0": conv_llava_v0,
+    "v0_mmtag": conv_llava_v0_mmtag,
+    "llava_v1": conv_llava_v1,
+    "v1_mmtag": conv_llava_v1_mmtag,
+    "llava_llama_2": conv_llava_llama_2,
+    "llama_3": conv_llama_3,
+    "mllava_v1": conv_mllava_v1,
+    "mllava_v1_mmtag": conv_mllava_v1_mmtag,
+    "phi_3": conv_phi_3,
+    "mpt": conv_mpt,
+}
+# When run as a script, print the prompt produced by the default template.
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())

model/modelling_gecko.py ADDED Viewed

	@@ -0,0 +1,760 @@

+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union, Dict
+from copy import deepcopy
+import re
+import math
+import torch
+import torch.utils.checkpoint
+from torch import nn
+import matplotlib.pyplot as plt
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import ModelOutput
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from transformers.models.auto import AutoModel, AutoModelForCausalLM
+from .configuration_gecko import GeckoConfig
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "GeckoConfig"
+@dataclass
+class GeckoCausalLMOutputWithPast(ModelOutput):
+    """
+    Output container for Gecko causal language model (autoregressive) outputs.
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size,
+            num_images, sequence_length, hidden_size)`.
+            Image hidden states produced by the vision encoder.
+            NOTE(review): the documented shape is carried over from the original docstring and is not
+            established by this class -- confirm against `forward`.
+    """
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+class GeckoPreTrainedModel(PreTrainedModel):
+    config_class = GeckoConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GeckoVisionAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    def _init_weights(self, module):
+        std = (
+            self.config.intializer_range if hasattr(self.config, "intializer_range") else self.config.text_config.initializer_range
+        )
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    @property
+    def _supports_sdpa(self):
+        return self.language_model._supports_sdpa
+class PositionalEncoding2D(nn.Module):
+    def __init__(self, config: GeckoConfig):
+        """
+        :param channels: The last dimension of the tensor you want to apply pos emb to.
+        """
+        super(PositionalEncoding2D, self).__init__()
+        if config.positional_information == "2d_before":
+            channels = config.vision_config.hidden_size
+        else:
+            channels = config.text_config.hidden_size
+        self.org_channels = channels
+        channels = int(math.ceil(channels / 4) * 2)
+        self.channels = channels
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
+        self.register_buffer("inv_freq", inv_freq)
+        self.register_buffer("cached_penc", None, persistent=False)
+    def get_emb(self, sin_inp):
+        """
+        Gets a base embedding for one dimension with sin and cos intertwined
+        """
+        emb = torch.stack((sin_inp.sin(), sin_inp.cos()), dim=-1)
+        return torch.flatten(emb, -2, -1)
+    def forward(self, tensor):
+        """
+        :param tensor: A 4d tensor of size (x, y, num_tokens, ch)
+        :return: Positional Encoding Matrix of size (x, y, num_tokens, ch)
+        """
+        if len(tensor.shape) != 4:
+            raise RuntimeError("The input tensor has to be 4d!")
+        if self.cached_penc is not None and self.cached_penc.shape == tensor.shape:
+            return self.cached_penc
+        self.cached_penc = None
+        x, y, num_tokens, orig_ch = tensor.shape
+        pos_x = torch.arange(x, device=tensor.device, dtype=self.inv_freq.dtype)
+        pos_y = torch.arange(y, device=tensor.device, dtype=self.inv_freq.dtype)
+        sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
+        sin_inp_y = torch.einsum("i,j->ij", pos_y, self.inv_freq)
+        emb_x = self.get_emb(sin_inp_x).unsqueeze(1)
+        emb_y = self.get_emb(sin_inp_y)
+        emb = torch.zeros(
+            (x, y, self.channels * 2),
+            device=tensor.device,
+            dtype=tensor.dtype,
+        )
+        emb[:, :, : self.channels] = emb_x
+        emb[:, :, self.channels : 2 * self.channels] = emb_y
+        self.cached_penc = emb[:, :, None, :orig_ch].repeat(1, 1, num_tokens, 1)
+        return self.cached_penc
+class GeckoMultiModalProjector(nn.Module):
+    def __init__(self, config: GeckoConfig):
+        super().__init__()
+        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+class GeckoForConditionalGeneration(GeckoPreTrainedModel):
+    def __init__(self, config: GeckoConfig, vision_tower=None, language_model=None, multimodal_projector=None):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config) if vision_tower is None else vision_tower
+        self.positional_encoding = PositionalEncoding2D(config) if '2d' in config.positional_information else None
+        self.multi_modal_projector = GeckoMultiModalProjector(config)
+        self.vocab_size = config.vocab_size
+        self.language_model = AutoModelForCausalLM.from_config(
+            config.text_config, attn_implementation=config._attn_implementation
+        ) if language_model is None else language_model
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()
+    def load_text_encoder(self, processor):
+        self.tokenizer = processor.tokenizer
+        self.clip_tokenizer = processor.clip_tokenizer
+        self.eos_token_id = [self.tokenizer.eos_token_id, self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+        self.encoder_type = self.config.vision_config.model_type
+        if 'clip' in self.encoder_type:
+            self.encoder = AutoModel.from_pretrained('openai/clip-vit-large-patch14-336', torch_dtype=self.dtype, device_map=self.device)
+        elif 'siglip' in self.encoder_type:
+            self.encoder = AutoModel.from_pretrained("google/siglip-so400m-patch14-384", torch_dtype=self.dtype, device_map=self.device)
+        else:
+            raise ValueError(f"Vision model {self.config.vision_config.model_type} is not supported.")
+    # The accessors below forward unchanged to the wrapped language model.
+    def get_input_embeddings(self):
+        """Return the language model's input embeddings."""
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        """Replace the language model's input embeddings with `value`."""
+        self.language_model.set_input_embeddings(value)
+    def get_output_embeddings(self):
+        """Return the language model's output embeddings."""
+        return self.language_model.get_output_embeddings()
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the language model's output embeddings with `new_embeddings`."""
+        self.language_model.set_output_embeddings(new_embeddings)
+    def set_decoder(self, decoder):
+        """Replace the language model's decoder with `decoder`."""
+        self.language_model.set_decoder(decoder)
+    def get_decoder(self):
+        """Return the language model's decoder."""
+        return self.language_model.get_decoder()
+    def tie_weights(self):
+        """Delegate weight tying to the language model and return its result."""
+        return self.language_model.tie_weights()
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        # update vocab size
+        self.config.text_config.vocab_size = model_embeds.num_embeddings
+        self.config.vocab_size = model_embeds.num_embeddings
+        self.vocab_size = model_embeds.num_embeddings
+        return model_embeds
+    # def _get_highest_similarity(self, cls_token, keyword_hidden_states, top_patches):
+    #     num_patches, embed_dim = cls_token.shape
+    #     batch_size, sequence_length, hidden_size = keyword_hidden_states.shape
+    #     assert embed_dim == hidden_size, f"The embedding dimension of cls token and keyword hidden states do not match. Dimension of cls token: {embed_dim} and dimension of keyword hidden states: {hidden_size}."
+    #     keyword_hidden_states = keyword_hidden_states.squeeze(0)
+    #     # calculate the similarity between the cls token and the keyword hidden states
+    #     similarity_score = torch.matmul(cls_token, keyword_hidden_states.T) # shape: (num_patches, sequence_length)
+    #     similarity_score = similarity_score.mean(dim=1) # shape: (num_patches)
+    #     # take the index of the patch with the highest similarity score
+    #     patch_index = torch.topk(similarity_score, top_patches).indices
+    #     return patch_index
+    # def _select_patches(self, image_features, keyword_hidden_states, top_patches=1):
+    #     selected_patches = []
+    #     # iterate through each image
+    #     for image in image_features:
+    #         if keyword_hidden_states is not None:
+    #             # take the first token of each patch
+    #             cls_token = image[:, 0, :].squeeze(1)
+    #             # get the index of the patch with the highest similarity score
+    #             patch_index = self._get_highest_similarity(cls_token, keyword_hidden_states, top_patches)
+    #         else:
+    #             top_patches = image.shape[0]
+    #             patch_index = torch.arange(top_patches)
+    #         # select the patch with the highest similarity score
+    #         if self.multimodal_projector == 'mlp':
+    #             image = image[patch_index, 1:, :].reshape(-1, image.shape[-1]).type(self.vision_tower.dtype)
+    #         elif self.multimodal_projector == 'perceiver':
+    #             image = image[patch_index, :, :].reshape(-1, image.shape[-1]).type(self.vision_tower.dtype)
+    #         else:
+    #             raise ValueError(f"Multimodal projector {self.multimodal_projector} is not supported.")
+    #         selected_patches.append(image)
+    #     return selected_patches # shape: list with shape of num_images, each element of shape (num_tokens * num_patches_i, embed_dim)
+    # def _input_to_vision_tower(self, pixel_values):
+    #     output = []
+    #     for i in range(len(pixel_values)):
+    #         num_patches = pixel_values[i].shape[0]
+    #         pixel_batch_size = 2
+    #         processed_pixel_values
+    # def _input_to_multimodal_projector(self, selected_image_features):
+    #     output = []
+    #     for selected_image in selected_image_features:
+    #         selected_image = self.multi_modal_projector(selected_image)
+    #         output.append(selected_image)
+    #     return output # shape: list with shape of num_images, each element of shape (num_patches_i, num_tokens, embed_dim) where i is the index of the image
+    # def _process_keyword_input(self, keyword_input_ids, maximum_keyword_tokens=10):
+    #     self.language_model.eval()
+    #     with torch.no_grad():
+    #         output_ids = self.language_model.generate(input_ids=keyword_input_ids, return_dict_in_generate=True, max_new_tokens=maximum_keyword_tokens)
+    #         output_ids = output_ids.sequences[:, keyword_input_ids.shape[-1]:]
+    #     self.language_model.train()
+    #     # conditions
+    #     if output_ids[0, 0:2].tolist() == [35581, 25]: # condition where the output is in the form Keyword: <keyword>
+    #         keyword_ids = output_ids[:, 2:-1]
+    #         if keyword_ids[0, 0].item() == 482:
+    #             return None
+    #         return self.get_input_embeddings()(keyword_ids)
+    #     else: # output
+    #         return None
+    def generate_keywords(self, keywords_text, criteria='template'):
+        keywords_text = keywords_text.lstrip('\n')
+        first_sentence = keywords_text.split('.')[0] + '.'
+        if re.search(r'are (.+?)\.', first_sentence):
+            objects = re.search(r'are (.+?)\.', first_sentence).group(1).split(' and ')
+        elif re.search(r'is (.+?)\.', first_sentence):
+            objects = [re.search(r'is (.+?)\.', first_sentence).group(1)]
+        else:
+            objects = []
+        def generate_template(object, description):
+            if object[0] in ['a', 'e', 'i', 'o', 'u']:
+                return f'An {object}, which {description}'
+            else:
+                return f'A {object}, which {description}'
+        descriptions = []
+        keywords = []
+        for i, obj in enumerate(objects):
+            keywords.append(obj)
+            if criteria == 'word':
+                descriptions.append([obj])
+            elif criteria == 'template':
+                descriptions.append([f'a photo of {obj}'])
+            elif criteria == 'description':
+                # pattern = rf"'{obj}':(.*?)('|\Z)"
+                # match = re.search(pattern, keywords_text, re.DOTALL)
+                # if match:
+                #     # Extract the feature keywords_text and clean it up
+                #     feature_text = match.group(1).strip()
+                #     # Split on new lines and strip each line
+                #     feature_list = [generate_template(obj, line.strip('* ').strip()) for line in feature_text.split('\n') if line.strip()]
+                #     descriptions.append(feature_list)
+                # The problem of the above code is that it does not work for the case where the object is not found in the text
+                # make it more general
+                features = re.findall(r"\* (.+)", keywords_text, re.MULTILINE)
+                descriptions.append([generate_template(obj, feature) for feature in features[i * len(features) // len(objects): (i + 1) * len(features) // len(objects)]])
+            else:
+                raise ValueError(f'invalid criteria: {criteria}')
+        return keywords, descriptions
+    def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
+        """Splice image features into the text embeddings at the image placeholder tokens.
+
+        Each position where ``input_ids == config.image_token_index`` is expanded into as many
+        slots as the matching entry of ``image_features`` has rows; text embeddings, attention
+        mask and labels are moved to their shifted positions.
+
+        Args:
+            image_features: sequence of per-image tensors; ``x.shape[0]`` is taken as the number
+                of tokens of image ``x`` and ``x.shape[-1]`` as the embedding size.
+            inputs_embeds: text embeddings, indexed as ``[batch, position]``.
+            input_ids: 2D token ids ``(batch_size, sequence_length)``.
+            attention_mask: mask indexed like ``input_ids``.
+            labels: optional labels indexed like ``input_ids``.
+
+        Returns:
+            ``(final_embedding, final_attention_mask, final_labels, position_ids)``;
+            ``final_labels`` is ``None`` when ``labels`` is ``None``.
+
+        Raises:
+            ValueError: if the number of free slots does not equal the total number of image tokens.
+
+        NOTE(review): the per-image token counts are written into row 0 only (see below), so
+        this looks like it assumes batch size 1 -- confirm against callers.
+        """
+        num_images = len(image_features)
+        num_image_tokens = torch.tensor([x.shape[0] for x in image_features], device=self.vision_tower.device, dtype=torch.int64) # total image tokens
+        embed_dim = image_features[0].shape[-1]
+        batch_size, sequence_length = input_ids.shape
+        # Left padding is assumed when no sequence ends with the pad token.
+        left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
+        # 1. Create a mask to know where special image tokens are
+        special_image_token_mask = input_ids == self.config.image_token_index
+        # num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
+        # Compute the maximum embed dimension
+        # max_embed_dim = (num_special_image_tokens.max() * (num_image_tokens - 1)) + sequence_length
+        # Every placeholder token is replaced by its image's tokens, hence "- num_images".
+        max_embed_dim = torch.sum(num_image_tokens) - num_images + sequence_length
+        batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)
+        _, image_indices = torch.where(input_ids == self.config.image_token_index)
+        # 2. Compute the positions where text should be written
+        # Calculate new positions for text tokens in merged image-text sequence.
+        # `image_token_mask` holds, at each placeholder, the number of extra slots that image needs
+        # (its token count minus one) and 0 elsewhere.
+        # `torch.cumsum` computes how each image token shifts subsequent text token positions.
+        # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
+        image_token_mask = special_image_token_mask * 1
+        # NOTE(review): only row 0 is written here, and `num_image_tokens` lives on the vision
+        # tower's device while the mask lives on `input_ids`' device -- confirm both are intended.
+        image_token_mask[0, image_indices] = num_image_tokens - 1
+        # for i, index in enumerate(image_indices):
+        #     special_image_token_mask[0, index] = num_image_tokens[i] - 1
+        new_token_positions = torch.cumsum((image_token_mask) + 1, -1) - 1
+        # new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+        # Slots left unused at the end of each row once all tokens are placed.
+        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
+        if left_padding:
+            new_token_positions += nb_image_pad[:, None]  # offset for left padding
+        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
+        # 3. Create the full embedding, already padded to the maximum position
+        final_embedding = torch.zeros(
+            batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+        final_attention_mask = torch.zeros(
+            batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+        )
+        if labels is not None:
+            final_labels = torch.full(
+                (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device
+            )
+        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
+        # set the corresponding tensors into their correct target device.
+        target_device = inputs_embeds.device
+        batch_indices, non_image_indices, text_to_overwrite = (
+            batch_indices.to(target_device),
+            non_image_indices.to(target_device),
+            text_to_overwrite.to(target_device),
+        )
+        attention_mask = attention_mask.to(target_device)
+        # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
+        # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
+        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
+        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+        if labels is not None:
+            final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
+        # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling
+        image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
+        # Skip the first `nb_image_pad` all-zero slots of each row; those are padding, not image slots.
+        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+        if image_to_overwrite.sum() != torch.sum(num_image_tokens):
+            raise ValueError(
+                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
+                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
+            )
+        # Image tokens are written in the order the images appear in `image_features`.
+        final_embedding[image_to_overwrite] = torch.cat([image_patches for image_patches in image_features], dim=0).to(target_device)
+        # final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        final_attention_mask |= image_to_overwrite
+        # Positions count attended tokens only; masked-out slots get the placeholder value 1.
+        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+        if labels is None:
+            final_labels = None
+        return final_embedding, final_attention_mask, final_labels, position_ids
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: List[torch.FloatTensor] = None,
+        coords: List[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        keyword_prompt_input_ids: torch.LongTensor = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        vision_feature_layer: Optional[int] = None,
+        patch_picking_strategy: Optional[str] = None,
+        topk: Optional[int] = None,
+        keyword_criteria: Optional[str] = None,
+        positional_information: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        visualize_patches: Optional[bool] = None,
+        visualize_topk_patches: Optional[bool] = None,
+        print_keyword: Optional[bool] = None,
+        print_topk_patches: Optional[bool] = None,
+    ) -> Union[Tuple, GeckoCausalLMOutputWithPast]:
+        """
+        Parameters:
+            text_inputs: Dict
+                Output of tokenizer for text data. A dictionary containing the following keys:
+                    - input_ids: torch.LongTensor of shape (batch_size, sequence_length)
+                    - attention_mask: torch.LongTensor of shape (batch_size, sequence_length)
+                    - token_type_ids: torch.LongTensor of shape (batch_size, sequence_length)
+            keyword_inputs: Dict
+                Output of tokenizer for keyword data. A dictionary containing the following keys:
+                    - input_ids: torch.LongTensor of shape (batch_size, sequence_length)
+                    - attention_mask: torch.LongTensor of shape (batch_size, sequence_length)
+                    - token_type_ids: torch.LongTensor of shape (batch_size, sequence_length)
+            image_inputs: Dict
+                Output of ImageProcessor for image data. A dictionary containing the following keys:
+                    - pixel_values: torch.FloatTensor of shape (num_images, num_patches, num_tokens, embed_dim)
+                    - coords: List of shape (batch_size, num_images)
+        """
+        # processing image and text inputs
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy if vision_feature_select_strategy is not None else self.config.vision_feature_select_strategy
+        )
+        patch_picking_strategy = patch_picking_strategy if patch_picking_strategy is not None else self.config.patch_picking_strategy
+        topk = topk if topk is not None else self.config.topk
+        keyword_criteria = keyword_criteria if keyword_criteria is not None else self.config.keyword_criteria
+        positional_information = positional_information if positional_information is not None else self.config.positional_information
+        visualize_patches = visualize_patches if visualize_patches is not None else self.config.visualize_patches
+        visualize_topk_patches = visualize_topk_patches if visualize_topk_patches is not None else self.config.visualize_topk_patches
+        print_keyword = print_keyword if print_keyword is not None else self.config.print_keyword
+        print_topk_patches = print_topk_patches if print_topk_patches is not None else self.config.print_topk_patches
+        if inputs_embeds is None:
+            # 1. Extra the input embeddings
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            # 2. Merge text and images
+            if pixel_values is not None and input_ids.shape[1] != 1:
+                with torch.no_grad():
+                    keyword_input_ids = self.language_model.generate(keyword_prompt_input_ids, return_dict_in_generate=True, max_new_tokens=1024, eos_token_id=self.eos_token_id)
+                    keyword_input_ids = keyword_input_ids.sequences[:, keyword_prompt_input_ids.shape[-1]:]
+                keyword_text = self.tokenizer.decode(keyword_input_ids[0], skip_special_tokens=True)
+                # print(keyword_text)
+                generated_keywords, generated_descriptions = self.generate_keywords(keyword_text, criteria=keyword_criteria)
+                all_text_features = []
+                for descriptions in generated_descriptions:
+                    one_text_features = []
+                    for description in descriptions:
+                        keyword_ids = self.clip_tokenizer(description, return_tensors='pt')
+                        keyword_ids = {k: v.to(self.device) for k, v in keyword_ids.items()}
+                        text_features = self.encoder.get_text_features(**keyword_ids)
+                        one_text_features.append(text_features / text_features.norm(p=2, dim=-1, keepdim=True))
+                    all_text_features.append(torch.cat(one_text_features, dim=0))
+                pixel_values = [pixel_value.to(self.vision_tower.device, dtype=self.vision_tower.dtype) for pixel_value in pixel_values]
+                selected_image_features = []
+                selected_coords = []
+                for p, pixel_value in enumerate(pixel_values): # iterate through each image
+                    print_keyword_text = f'Keywords (criteria: {keyword_criteria}):\n'
+                    all_hidden_states = self.vision_tower(pixel_value, output_hidden_states=True).hidden_states # tuple of size (num_layers, num_patch, num_tokens, vison_embed_dim)
+                    if patch_picking_strategy == 'last_layer':
+                        hidden_states = [all_hidden_states[-1]]
+                    elif patch_picking_strategy == 'across_layers':
+                        hidden_states = deepcopy(all_hidden_states)
+                    top_patches = [0]
+                    for i, text_feature in enumerate(all_text_features):
+                        print_keyword_text += f'  {i+1}: ' + "\n     ".join(generated_descriptions[i]) + '\n'
+                        top_index = []
+                        for hidden_state in hidden_states: # iterate through each layer
+                            if 'clip' in self.encoder_type:
+                                if vision_feature_select_strategy == 'cls':
+                                    image_features = self.encoder.visual_projection(self.encoder.vision_model.post_layernorm(hidden_state[1:, 0, :])) # (num_patch-1, embed_dim)
+                                elif vision_feature_select_strategy == 'image_features':
+                                    image_features = self.encoder.visual_projection(self.encoder.vision_model.post_layernorm(hidden_state[1:, 1:, :])) # (num_patch-1 * num_tokens, embed_dim)
+                                num_tokens = hidden_state.shape[1] - 1
+                            elif 'siglip' in self.encoder_type:
+                                if vision_feature_select_strategy == 'cls':
+                                    image_features = self.encoder.vision_model.head(self.encoder.vision_model.post_layernorm(hidden_state[1:, :, :])) # (num_patch-1, embed_dim)
+                                elif vision_feature_select_strategy == 'image_features':
+                                    image_features = self.encoder.vision_model.post_layernorm(hidden_state[1:, :, :]) # (num_patch-1 * num_tokens, embed_dim)
+                                num_tokens = hidden_state.shape[1]
+                            image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
+                            if vision_feature_select_strategy == 'cls':
+                                similarity_score = torch.matmul(image_features, text_feature.T).mean(dim=1) # (num_patch-1)
+                                if patch_picking_strategy == 'across_layers':
+                                    index = torch.topk(similarity_score, 1).indices
+                                    top_index.append(index.item()+1)
+                                elif patch_picking_strategy == 'last_layer':
+                                    index = torch.topk(similarity_score, math.ceil(topk / len(all_text_features))).indices + 1 # take top k patches
+                                    top_index += index.tolist()
+                            elif vision_feature_select_strategy == 'image_features':
+                                image_features = image_features.flatten(0, 1)
+                                similarity_score = torch.matmul(image_features, text_feature.T).mean(dim=1) # (num_patch-1 * num_tokens)
+                                index = torch.topk(similarity_score, 100).indices # take top 100 tokens
+                                patch_index = torch.floor(index / num_tokens) # get the patch index
+                                count = torch.nn.functional.one_hot(patch_index.to(torch.int64)).sum(dim=0) # count the occurrences of each patch
+                                if patch_picking_strategy == 'across_layers':
+                                    top_count = torch.topk(count, 1).indices # take top 1
+                                    top_index.append(top_count.item()+1)
+                                elif patch_picking_strategy == 'last_layer':
+                                    top_count = torch.topk(count, math.ceil(topk / len(all_text_features))).indices + 1
+                                    top_index += top_count.tolist()
+                        if visualize_patches and patch_picking_strategy == 'across_layers':
+                            if 'clip' in self.encoder_type:
+                                (x, y) = (5, 5)
+                            elif 'siglip' in self.encoder_type:
+                                (x, y) = (7, 4)
+                            fig, axs = plt.subplots(y, x, figsize=(x * 2, y * 2))
+                            fig.suptitle(f'keyword: {generated_keywords[i]}')
+                            for k, index in enumerate(top_index):
+                                axs[k // x, k % x].imshow(pixel_value[index].to(torch.float32).cpu().numpy().transpose(1, 2, 0))
+                                axs[k // x, k % x].set_title(f'Layer {k}')
+                                axs[k // x, k % x].axis('off')
+                            plt.show()
+                        if patch_picking_strategy == 'across_layers':
+                            top_patches += torch.topk(torch.bincount(torch.tensor(top_index, dtype=torch.int64)), math.ceil(topk / len(all_text_features))).indices.to(dtype=torch.int64).tolist()
+                        elif patch_picking_strategy == 'last_layer':
+                            top_patches += top_index
+                    topk_patches = list(set(top_patches))
+                    if visualize_topk_patches:
+                        fig, axs = plt.subplots(1, len(topk_patches), figsize=(len(topk_patches) * 2, 2))
+                        fig.suptitle(f'top-{len(topk_patches)} patches')
+                        for m, topk_patch in enumerate(topk_patches):
+                            axs[m].imshow(pixel_value[topk_patch].to(torch.float32).cpu().numpy().transpose(1, 2, 0))
+                            axs[m].axis('off')
+                        plt.show()
+                    if 'clip' in self.encoder_type:
+                        selected_image_features.append(all_hidden_states[vision_feature_layer][topk_patches, 1:, :])
+                    elif 'siglip' in self.encoder_type:
+                        selected_image_features.append(all_hidden_states[vision_feature_layer][topk_patches, :, :])
+                    selected_coords.append([coords[p][q-1] for q in topk_patches[1:]])
+                # if isinstance(pixel_values, list):
+                #     pixel_values = torch.cat([x for x in pixel_values if x is not None], dim=0)
+                if print_keyword:
+                    print(print_keyword_text)
+                multimodal_projector_features = []
+                for x, (selected_image_feature, selected_coord) in enumerate(zip(selected_image_features, selected_coords)):
+                    print(f'image {x+1}: {selected_coord}')
+                    if '2d' in positional_information:
+                        max_width = max(selected_coord, key= lambda x: x[0])[0] + 1
+                        max_height = max(selected_coord, key= lambda x: x[1])[1] + 1
+                        positional_encoding = self.positional_encoding(torch.ones((max_width, max_height, selected_image_feature.shape[1], self.positional_encoding.org_channels), dtype=self.dtype, device=self.device))
+                    accumulate = []
+                    for i, top_patch in enumerate(selected_image_feature):
+                        if positional_information == '2d_before' and i != 0:
+                            top_patch += positional_encoding[selected_coord[i-1][0], selected_coord[i-1][1], :, :]
+                        aligned_image_feature = self.multi_modal_projector(top_patch)
+                        if positional_information == '2d_after' and i != 0:
+                            aligned_image_feature += positional_encoding[selected_coord[i-1][0], selected_coord[i-1][1], :, :]
+                        accumulate.append(aligned_image_feature)
+                        if i == 0:
+                            accumulate.append(self.get_input_embeddings()(self.tokenizer(', ', padding=False, truncation=False, max_length=None, return_tensors='pt')['input_ids'].to(device=self.device)[0, 1:]))
+                            continue
+                        if positional_information == 'explicit':
+                            accumulate.append(self.get_input_embeddings()(self.tokenizer(f' at {str(selected_coord[i-1])}, ', padding=False, truncation=False, max_length=None, return_tensors='pt')['input_ids'].to(device=self.device)[0, 1:]))
+                        else:
+                            accumulate.append(self.get_input_embeddings()(self.tokenizer(f', ', padding=False, truncation=False, max_length=None, return_tensors='pt')['input_ids'].to(device=self.device)[0, 1:]))
+                    multimodal_projector_features.append(torch.cat(accumulate, dim=0)) # dimension of (num_selected_patch * num_tokens-1 + num_selected_patch * sep_len - 1) -> (num_selected_patch * num_tokens - 1) as sep_len = 1
+                assert len(selected_image_features) == len(multimodal_projector_features), f"The number of selected image features and image features do not match. Dimension of selected image features: {len(selected_image_features)} and dimension of image features: {len(multimodal_projector_features)}."
+                # print(multimodal_projector_features[0].shape)
+                inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+                    multimodal_projector_features, inputs_embeds, input_ids, attention_mask, labels
+                )
+                if labels is None:
+                    labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
+            else:
+                # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+                # generation with cache
+                if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                    # Retrieve the first layer to inspect the logits and mask out the hidden states
+                    # that are set to 0
+                    first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+                    # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+                    # Get the target length
+                    target_seqlen = first_layer_past_key_value.shape[-1] + 1
+                    extended_attention_mask = torch.ones(
+                        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    # Filter out only the tokens that can be un-attended, this can happen
+                    # if one uses Llava + Fused modules where the cache on the
+                    # first iteration is already big enough, or if one passes custom cache
+                    valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                    new_batch_index = batch_index[valid_indices]
+                    new_non_attended_tokens = non_attended_tokens[valid_indices]
+                    # Zero-out the places where we don't need to attend
+                    extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+                    attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                    position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = outputs[0]
+        batch_shift = 100
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:]
+                logits_shape = logits.shape
+                labels_shape = labels.shape
+                shift_attention_mask_shape = shift_attention_mask.shape
+                for i in range(0, shift_attention_mask.shape[1], batch_shift):
+                    shift_logits = logits[..., i:min(i+batch_shift, logits_shape[1]-1), :][shift_attention_mask[..., i:min(i+batch_shift, shift_attention_mask_shape[1])].to(logits.device) != 0].contiguous()
+                    shift_labels = labels[..., i+1:min(i+batch_shift+1, labels_shape[1])][shift_attention_mask[..., i:min(i+batch_shift, shift_attention_mask_shape[1])].to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return GeckoCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, keyword_prompt_input_ids=None, coords=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            elif self.config.image_token_index in input_ids:
+                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
+            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
+            # older attention values, as their corresponding values are not part of the input.
+            if cache_length < past_length and attention_mask is not None:
+                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,
+                "keyword_prompt_input_ids": keyword_prompt_input_ids,
+                "coords": coords,
+                "topk": kwargs.get("topk"),
+                "vision_feature_select_strategy": kwargs.get("vision_feature_select_strategy"),
+                "vision_feature_layer": kwargs.get("vision_feature_layer"),
+                "patch_picking_strategy": kwargs.get("patch_picking_strategy"),
+                "keyword_criteria": kwargs.get("keyword_criteria"),
+                "positional_information": kwargs.get("positional_information"),
+                "visualize_patches": kwargs.get("visualize_patches"),
+                "visualize_topk_patches": kwargs.get("visualize_topk_patches"),
+                "print_keyword": kwargs.get("print_keyword"),
+                "print_topk_patches": kwargs.get("print_topk_patches"),
+            }
+        )
+        return model_inputs
+    def _reorder_cache(self, *args, **kwargs):
+        return self.language_model._reorder_cache(*args, **kwargs)

model/multimodal_encoder.py ADDED Viewed

	@@ -0,0 +1,172 @@

+# This code is referenced from https://github.com/dhansmair/flamingo-mini
+import torch
+from einops import rearrange, repeat
+from einops_exts import rearrange_many
+from torch import einsum, nn
+import math
+import torch.nn.functional as F
+from .configuration_gecko import GeckoConfig
+from transformers.activations import ACT2FN
+from torch.nn.init import trunc_normal_
+from functools import partial
+def feed_forward_layer(dim: int, mult: int = 4, activation: str = 'gelu'):
+    """Feed forward layer with given activation function"""
+    activations = dict(gelu=nn.GELU, relu=nn.ReLU)
+    assert activation in activations, f'activation can only be one of {activations.keys()}'
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        activations[activation](),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+class PerceiverAttentionLayer(nn.Module):
+    """Perceiver Attention Layer"""
+    def __init__(self, dim: int, dim_head: int = 64, heads: int = 8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.dim_head = dim_head
+        inner_dim = dim_head * heads
+        # trainable components of PerceiverAttentionLayer
+        self.norm_media = nn.LayerNorm(dim)
+        self.norm_latents = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(dim, inner_dim, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+    def forward(self, features, latents):
+        """Latent vectors are cross-attending to the visual features x
+        Args:
+            features: Batch of visual features with shape (batch_size, n_tokens, dim)
+            latents: Latent learnt vectors which are used to compute queries with shape (batch_size, n_latents, dim)
+        Returns:
+            Attention score with shape (batch_size, n_latents, dim)
+        """
+        assert features.ndim == 3
+        assert latents.ndim == 3
+        assert features.shape[0] == latents.shape[0]
+        assert features.shape[2] == latents.shape[2]
+        n_heads = self.heads
+        n_batch, n_features, dim = features.shape
+        n_queries = latents.shape[1]
+        # Layer normalization
+        x = self.norm_media(features)
+        latents = self.norm_latents(latents)
+        # Compute the queries from the latents, for all attention heads simultaneously
+        q = self.to_q(latents)
+        q = rearrange(q, 'b q (h d) -> b h q d', h=n_heads)
+        assert q.shape == torch.Size([n_batch, n_heads, n_queries, self.dim_head])
+        # Keys and values for all attention heads
+        kv_input = torch.cat((x, latents), dim=-2)
+        n_features_latents = n_features + n_queries
+        k = self.to_k(kv_input)
+        v = self.to_v(kv_input)
+        k, v = rearrange_many((k, v), 'b f (h d) -> b h f d', h=n_heads)
+        assert v.shape == torch.Size([n_batch, n_heads, n_features_latents, self.dim_head])
+        q = q * self.scale
+        # Attention scores
+        sim = einsum('b h q d, b h f d -> b h q f', q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        alphas = sim.softmax(dim=-1)
+        out = einsum('b h q f, b h f v -> b h q v', alphas, v)
+        out = rearrange(out, 'b h q v -> b q (h v)')
+        return self.to_out(out)
+class GeckoResamplerProjector(nn.Module):
+    """Perceiver Resampler with multi-head attention layer"""
+    def __init__(
+        self,
+        config: GeckoConfig,
+        num_queries: int = 64,
+        depth: int = 2,
+        dim_head: int = 32,
+        heads: int = 4,
+        ff_mult: int = 2,
+    ):
+        super().__init__()
+        self.dim = config.text_config.hidden_size
+        self.num_queries = num_queries
+        self.latents = nn.Parameter(torch.randn(self.num_queries, self.dim))  # type: ignore[reportPrivateUsage]
+        self.linear = nn.Linear(config.vision_config.hidden_size, self.dim)
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttentionLayer(dim=self.dim, dim_head=dim_head, heads=heads),
+                        feed_forward_layer(dim=self.dim, mult=ff_mult, activation=config.projector_hidden_act),
+                    ]
+                )
+            )
+        # Layer normalization takes as input the query vector length
+        self.norm = nn.LayerNorm(self.dim)
+    def forward(self, x_f: torch.Tensor):
+        """Run perceiver resampler on the input visual embeddings
+        Args:
+            x_f: Input visual embeddings of shape (batch_size, num_tokens, d_visual)
+        Returns:
+            Resampler features of shape (batch_size, num_queries, d_visual)
+        """
+        assert x_f.ndim == 3
+        x_f = self.linear(x_f)
+        batch_size, num_tokens, dim = x_f.shape
+        assert dim == self.dim
+        # Copy the latents for every element in the batch
+        x = repeat(self.latents, 'q d -> b q d', b=batch_size)
+        # Apply attention and feed forward layer
+        for attn, ffw in self.layers:
+            x = x + attn(x_f, x)
+            x = x + ffw(x)
+        assert x.shape == torch.Size([batch_size, self.num_queries, self.dim])
+        norm = self.norm(x)
+        return norm
+class GeckoMLPProjector(nn.Module):
+    def __init__(self, config: GeckoConfig):
+        super().__init__()
+        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size)
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states

model/processing_gecko.py ADDED Viewed

	@@ -0,0 +1,487 @@

+import math
+from typing import List, Optional, Union, Dict
+import torch
+from PIL import Image
+import logging
+import os
+import json
+import re
+from transformers.feature_extraction_sequence_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers import ProcessorMixin, ImageProcessingMixin, AutoImageProcessor, AutoTokenizer, AutoProcessor
+from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from transformers.utils import TensorType
+from transformers.processing_utils import transformers_module
+from transformers.utils.hub import is_remote_url, download_url, cached_file, is_offline_mode
+from transformers.utils import IMAGE_PROCESSOR_NAME
+logger = logging.getLogger(__name__)
+class GeckoProcessor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("CLIPImageProcessor", "SiglipImageProcessor")
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast", "PreTrainedTokenizerFast")
+    def __init__(self, image_processor=None, tokenizer=None, use_keyword=False, crop_size=336, cropping_method='dynamic', **kwargs):
+        super().__init__(image_processor, tokenizer)
+        self.crop_size = crop_size if crop_size is not None else int(image_processor.size['height'])
+        self.use_keyword = use_keyword
+        self.image_token_index = None
+        self.cropping_method = cropping_method
+        self.load_clip_tokenizer()
+    def load_clip_tokenizer(self):
+        if 'clip' in self.image_processor.image_processor_type.lower():
+            self.clip_tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-large-patch14-336')
+        elif 'siglip' in self.image_processor.image_processor_type.lower():
+            self.clip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-so400m-patch14-384")
+        else:
+            raise ValueError(f"Invalid image processor type: {self.image_processor.image_processor_type}")
+    def process_images(self, images: List[Image.Image]):
+        # create documentation
+        """
+        Parameters:
+            images: List[Image.Image]
+                List of PIL images to be processed
+        Returns:
+            Dict[str, torch.Tensor]:
+                pixel_values: List[torch.Tensor]
+                    Pixel values of the images. Has shape (num_images, num_patches, num_channels, height, width)
+                coords: List[List[List[int]]]
+                    Coordinates of the cropped images. Has shape (num_images, num_patches, 2)
+                """
+        pixel_values = []
+        coords = []
+        for image in images:
+            outputs, coord = self.dynamic_preprocess(image)
+            pixel_values.append(outputs)
+            coords.append(coord)
+        return {"pixel_values": pixel_values, "coords": coords}
+    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
+        best_ratio_diff = float('inf')
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+    def dynamic_preprocess(self, image):
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+        if self.cropping_method == 'dynamic':
+            max_num = math.ceil(orig_width / self.crop_size) * math.ceil(orig_height / self.crop_size)
+            # calculate the existing image aspect ratio
+            target_ratios = set(
+                (i, j) for n in range(1, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+                i * j <= max_num and i * j >= 1)
+            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+            # find the closest aspect ratio to the target
+            target_aspect_ratio = self.find_closest_aspect_ratio(
+                aspect_ratio, target_ratios, orig_width, orig_height, self.crop_size)
+            # if target_aspect_ratio[0] * target_aspect_ratio[1] <= 25:
+            #     target_aspect_ratio = (int(1.5 * target_aspect_ratio[0]), int(1.5 * target_aspect_ratio[1]))
+        elif self.cropping_method == 'naive':
+            target_aspect_ratio = (orig_width // self.crop_size, orig_height // self.crop_size)
+            # print(target_aspect_ratio)
+            # if target_aspect_ratio[0] * target_aspect_ratio[1] <= 25:
+            #     target_aspect_ratio = (2 * orig_width // self.crop_size, 2 * orig_height // self.crop_size)
+            # print(target_aspect_ratio)
+        else:
+            raise ValueError(f"Invalid cropping method: {self.cropping_method}")
+        # calculate the target width and height
+        target_width = self.crop_size * target_aspect_ratio[0]
+        target_height = self.crop_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+        # add whole image
+        processed_images = []
+        processed_images.append(image.resize((self.crop_size, self.crop_size)))
+        coords = []
+        if blocks == 1:
+            return self.image_processor(images=processed_images, return_tensors='pt')['pixel_values'], coords
+        # resize the image
+        resized_img = image.resize((target_width, target_height))
+        for i in range(blocks):
+            x0 = (i % (target_width // self.crop_size))
+            y0 = (i // (target_width // self.crop_size))
+            x1 = ((i % (target_width // self.crop_size)) + 1)
+            y1 = ((i // (target_width // self.crop_size)) + 1)
+            box = (
+                x0 * self.crop_size,
+                y0 * self.crop_size,
+                x1 * self.crop_size,
+                y1 * self.crop_size
+            )
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+            coords.append([x0, y0])
+            # box = (
+            #     (i % (target_width // self.crop_size)) * self.crop_size,
+            #     (i // (target_width // self.crop_size)) * self.crop_size,
+            #     ((i % (target_width // self.crop_size)) + 1) * self.crop_size,
+            #     ((i // (target_width // self.crop_size)) + 1) * self.crop_size
+            # )
+            # split the image
+        assert len(processed_images) == blocks + 1
+        return self.image_processor(images=processed_images, return_tensors='pt')['pixel_values'], coords
+    def preprocess_interleaved_images_and_text(
+        self,
+        text,
+        images=None,
+    ):
+        """
+        Args:
+            text (`str`, `List[str]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+                text can contain <image> tokens as the placeholder for the image(s) to be inserted.
+            images (`PIL.Image.Image`, `List[PIL.Image.Image]`, `List[List[PIL.Image.Image]]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+                the number of the images should match the number of <image> tokens in the text.
+        """
+        assert text is not None, "text cannot be None."
+        if images is not None:
+            if isinstance(images, Image.Image):
+                images = [images]
+            if isinstance(images, list) and isinstance(images[0], Image.Image):
+                if isinstance(text, str):
+                    images = [images]
+                elif isinstance(text, list):
+                    if len(text) != len(images):
+                        raise ValueError("Invalid input text. Number of texts does not match number of images.")
+                    images = [[image] for image in images]
+            if isinstance(text, str):
+                num_images = len(images[0])
+                num_image_tokens = text.count("<image>")
+                if num_image_tokens < num_images:
+                    # prepend empty image tokens to text
+                    if "USER:" in text:
+                        text = text.replace("USER:", "USER:" + "<image>" * (num_images - num_image_tokens), 1)
+                    elif "Human:" in text:
+                        text = text.replace("Human:", "Human:" + "<image>" * (num_images - num_image_tokens), 1)
+                    elif "HUMAN:" in text:
+                        text = text.replace("HUMAN:", "HUMAN:" + "<image>" * (num_images - num_image_tokens), 1)
+                    else:
+                        text = "<image>" * (num_images - num_image_tokens) + text
+                    # logger.warning("Image Tokens <image> are not provided in the text. Automatically prepending them before the text. This might cause model to behave unexpectedly.")
+                elif num_image_tokens > num_images:
+                    text = text.split("<image>")
+                    for i, t in enumerate(text):
+                        if i < num_images:
+                            text[i] = t + "<image>"
+                    text = "".join(text)
+                    logger.warning(f"Number of <image> tokens: {num_image_tokens} exceeds number of images: {num_images}. Automatically removing extra tokens at the end of the text.")
+                    # raise ValueError("Invalid input text. Number of <image> tokens exceeds number of images.")
+                texts = [text]
+            elif isinstance(text, list):
+                if not isinstance(text[0], str):
+                    raise ValueError("Invalid input text. Each element of text must be a string.")
+                for i, t in enumerate(text):
+                    num_image_tokens = t.count("<image>")
+                    num_images = len(images[i])
+                    if num_image_tokens < num_images:
+                        # prepend empty image tokens to text
+                        if "USER:" in t:
+                            t = t.replace("USER:", "USER:" + "<image>" * (num_images - num_image_tokens), 1)
+                        elif "Human:" in t:
+                            t = t.replace("Human:", "Human:" + "<image>" * (num_images - num_image_tokens), 1)
+                        elif "HUMAN:" in t:
+                            t = t.replace("HUMAN:", "HUMAN:" + "<image>" * (num_images - num_image_tokens), 1)
+                        else:
+                            t = "<image>" * (num_images - num_image_tokens) + t
+                        # logger.warning("Image Tokens <image> are not provided in the text. Automatically prepending them before the text. This might cause model to behave unexpectedly.")
+                    elif num_image_tokens > num_images:
+                        t = t.split("<image>")
+                        for j, s in enumerate(t):
+                            if j < num_images:
+                                t[j] = s + "<image>"
+                        t = "".join(t)
+                        logger.warning(f"Number of <image> tokens: {num_image_tokens} exceeds number of images: {num_images}. Automatically removing extra tokens at the end of the text.")
+                        # raise ValueError("Invalid input text. Number of <image> tokens exceeds number of images.")
+                    text[i] = t
+                texts = text
+            else:
+                raise ValueError("Invalid input text. text must be a string or a list of strings.")
+            assert all([t.count("<image>") == len(images_per_text) for t, images_per_text in zip(texts, images)]), "Number of <image> tokens in text does not match number of images."
+            # add image denotation in text before each <image> as "(image {i}: <image>)"
+            for i, t in enumerate(texts):
+                for j in range(len(images[i])):
+                    t = t.replace("<image>", f"(image {j+1}: <Image><IMAGE></Image>)", 1)
+                t = t.replace("<IMAGE>", "<image>")
+                texts[i] = t
+        else:
+            if isinstance(text, str):
+                texts = [text]
+            elif isinstance(text, list):
+                if not isinstance(text[0], str):
+                    raise ValueError("Invalid input text. Each element of text must be a string.")
+                texts = text
+            else:
+                raise ValueError("Invalid input text. text must be a string or a list of strings.")
+        return texts, images
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        keywords_text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        add_image_ids: bool = True,
+        cropping_method: str = None,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. Have shape of (num_images, num_patches, num_tokens, embed_dim)
+            - **coords** -- Coordinates of the cropped images. Returned when `images` is not `None`. Have shape of (num_images, num_patches, 2)
+        """
+        if cropping_method is not None:
+            self.cropping_method = cropping_method
+        if not self.image_token_index:
+            self.image_token_index = self.tokenizer.convert_tokens_to_ids("<image>")
+        if add_image_ids:
+            text, images = self.preprocess_interleaved_images_and_text(text, images)
+        text_inputs = self.tokenizer(
+            text,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+        )
+        if self.use_keyword and keywords_text is not None:
+            keywords_prompt_input_ids = self.tokenizer(keywords_text,
+                                                padding=padding,
+                                                truncation=truncation,
+                                                max_length=max_length,
+                                                return_tensors=return_tensors)['input_ids']
+        else:
+            keywords_prompt_input_ids = None
+        if images is not None:
+            input_ids = text_inputs["input_ids"]
+            num_image_tokens = torch.sum(input_ids == self.image_token_index, dim=-1)
+            for i, num_image_token in enumerate(num_image_tokens):
+                if num_image_token < len(images[i]):
+                    images[i] = images[i][:num_image_token]
+                    print(f"{len(images[i]) - num_image_token} ({len(images[i])} in total) image tokens in the text are truncated due to the max sequence length; removing the extra images.")
+            # flatten images
+            images = [image for images_per_text in images for image in images_per_text]
+            image_inputs = self.process_images(images)
+        else:
+            image_inputs = {"pixel_values": None, "coords": None}
+        return BatchFeature(data={**text_inputs, **image_inputs, "keyword_prompt_input_ids": keywords_prompt_input_ids})
+    def batch_decode(self, *args, **kwargs):
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+    def _right_pad_inputs_with_attention_mask(self, model_inputs: List[Dict]):
+        results = {}
+        assert len(model_inputs) == 1, "This method only supports a single input, but get {} inputs".format(len(model_inputs))
+        for k in model_inputs[0].keys():
+            if k == "pixel_values" or k == "coords":
+                results[k] = model_inputs[0][k] if model_inputs[0][k] is not None else None
+            else:
+                results[k] = torch.cat([model_inputs[0][k]], dim=0) if model_inputs[0][k] is not None else None
+        return results
+    @classmethod
+    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        args = []
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", "")
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+        user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+        if is_offline_mode() and not local_files_only:
+            logger.info("Offline mode: forcing local_files_only=True")
+            local_files_only = True
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        if os.path.isdir(pretrained_model_name_or_path):
+            processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME)
+        if os.path.isfile(pretrained_model_name_or_path):
+            resolved_processor_file = pretrained_model_name_or_path
+            is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            processor_file = pretrained_model_name_or_path
+            resolved_processor_file = download_url(pretrained_model_name_or_path)
+        else:
+            processor_file = IMAGE_PROCESSOR_NAME
+            try:
+                # Load from local folder or from cache or download from model Hub and cache
+                resolved_processor_file = cached_file(
+                    pretrained_model_name_or_path,
+                    processor_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    token=token,
+                    user_agent=user_agent,
+                    revision=revision,
+                    subfolder=subfolder,
+                    _raise_exceptions_for_missing_entries=True,
+                )
+            except EnvironmentError:
+                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
+                # the original exception.
+                raise
+            except Exception:
+                # For any other exception, we throw a generic error.
+                raise EnvironmentError(
+                    f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
+                    " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+                    f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+                    f" directory containing a {IMAGE_PROCESSOR_NAME} file"
+                )
+        # Existing processors on the Hub created before #27761 being merged don't have `processor_config.json` (if not
+        # updated afterward), and we need to keep `from_pretrained` work. So here it fallbacks to the empty dict.
+        # (`cached_file` called using `_raise_exceptions_for_missing_entries=False` to avoid exception)
+        # However, for models added in the future, we won't get the expected error if this file is missing.
+        if resolved_processor_file is None:
+            image_processor_dict = {}
+        try:
+            # Load processor dict
+            with open(resolved_processor_file, "r", encoding="utf-8") as reader:
+                text = reader.read()
+            image_processor_dict = json.loads(text)
+        except json.JSONDecodeError:
+            raise EnvironmentError(
+                f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
+            )
+        for attribute_name in cls.attributes:
+            class_name = getattr(cls, f"{attribute_name}_class")
+            if isinstance(class_name, tuple):
+                if attribute_name == "tokenizer":
+                    classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+                    use_fast = kwargs.get("use_fast", True)
+                    if use_fast and classes[1] is not None:
+                        attribute_class = classes[1]
+                    else:
+                        attribute_class = classes[0]
+                elif attribute_name == "image_processor":
+                    image_processor_type = image_processor_dict.get("image_processor_type", None)
+                    if image_processor_type is not None:
+                        assert image_processor_type in class_name, f"Invalid image processor type: {image_processor_type}"
+                        attribute_class = getattr(transformers_module, image_processor_type)
+                    else:
+                        attribute_class = getattr(transformers_module, class_name[0])
+                else:
+                    raise ValueError(f"Invalid attribute name: {attribute_name}")
+            else:
+                attribute_class = getattr(transformers_module, class_name)
+            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+        return args

model/utils.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import PIL
+import torch
+from .modelling_gecko import GeckoForConditionalGeneration
+from .processing_gecko import GeckoProcessor
+from .conversation import conv_llama_3 as default_conv, conv_templates
+import transformers
+from typing import List, Tuple, Union
+from io import StringIO
+import sys
+class Capturing(list):
+    def __enter__(self):
+        self._stdout = sys.stdout
+        sys.stdout = self._stringio = StringIO()
+        return self
+    def __exit__(self, *args):
+        self.extend(self._stringio.getvalue().splitlines())
+        del self._stringio    # free up some memory
+        sys.stdout = self._stdout
+def chat_gecko(
+    text:str,
+    images: List[Union[PIL.Image.Image, str]],
+    model:GeckoForConditionalGeneration,
+    processor:GeckoProcessor,
+    max_input_length:int=None,
+    history:List[dict]=None,
+    **kwargs) -> Tuple[str, List[dict]]:
+    if "llama-3" in model.language_model.name_or_path.lower():
+        conv = conv_templates['llama_3']
+        terminators = [
+            processor.tokenizer.eos_token_id,
+            processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+    else:
+        conv = default_conv
+        terminators = None
+    kwargs["eos_token_id"] = terminators
+    conv = conv.copy()
+    conv.messages = []
+    if history is not None:
+        for message in history:
+            assert message["role"] in conv.roles
+            conv.append_message(message["role"], message["text"])
+        if text:
+            assert conv.messages[-1][0] == conv.roles[1], "The last message in the history should be the assistant, if the given text is not empty"
+            conv.append_message(conv.roles[0], text)
+            conv.append_message(conv.roles[1], "")
+            history.append({"role": conv.roles[0], "text": text})
+            history.append({"role": conv.roles[1], "text": ""})
+        else:
+            if conv.messages[-1][0] == conv.roles[1]:
+                assert conv.messages[-1][1] == "", "No user message should be provided"
+            else:
+                assert conv.messages[-1][0] == conv.roles[0], "The last message in the history should be the user, if the given text is empty"
+                conv.append_message(conv.roles[0], "")
+                history.append({"role": conv.roles[0], "text": ""})
+    else:
+        history = []
+        history.append({"role": conv.roles[0], "text": text})
+        history.append({"role": conv.roles[1], "text": ""})
+        conv.append_message(conv.roles[0], text)
+        conv.append_message(conv.roles[1], "")
+    assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == "", "Format check"
+    assert history[-1]["role"] == conv.roles[1] and history[-1]["text"] == "", "Format check"
+    keyword_prompt = conv.generate_keyword_prompt(text.split("\n")[len(images)])
+    prompt = conv.get_prompt()
+    if images:
+        for i in range(len(images)):
+            if isinstance(images[i], str):
+                images[i] = PIL.Image.open(images[i]).convert("RGB")
+    inputs = processor(images=images, text=prompt, keywords_text=keyword_prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
+    for k, v in inputs.items():
+        if v is not None:
+            if isinstance(v, torch.Tensor):
+                inputs[k] = v.to(model.device)
+            elif isinstance(v, list):
+                if k == 'coords':
+                    continue
+                inputs[k] = [x.to(model.device) for x in v]
+            elif isinstance(v, transformers.tokenization_utils_base.BatchEncoding) or isinstance(v, dict):
+                for key, value in v.items():
+                    if value is not None:
+                        if isinstance(value, list):
+                            inputs[k][key] = [x.to(model.device) for x in value]
+                        else:
+                            inputs[k][key] = value.to(model.device)
+            else:
+                raise ValueError(f"Invalid input type: {type(v)}")
+    with torch.inference_mode():
+        output_ids = model.generate(**inputs, **kwargs)[0]
+    # remove the input tokens
+    generated_ids = output_ids[inputs["input_ids"].shape[-1]:]
+    generated_text = processor.decode(generated_ids, skip_special_tokens=True)
+    history[-1]["text"] = generated_text
+    return generated_text, history
+def chat_gecko_stream(
+    text:str,
+    images: List[Union[PIL.Image.Image, str]],
+    model:GeckoForConditionalGeneration,
+    processor:GeckoProcessor,
+    max_input_length:int=None,
+    history:List[dict]=None,
+    **kwargs) -> Tuple[str, List[dict]]:
+    if "llama-3" in model.language_model.name_or_path.lower():
+        conv = conv_templates['llama_3']
+        terminators = [
+            processor.tokenizer.eos_token_id,
+            processor.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+    else:
+        conv = default_conv
+        terminators = None
+    kwargs["eos_token_id"] = terminators
+    conv = conv.copy()
+    conv.messages = []
+    if history is not None:
+        for message in history:
+            assert message["role"] in conv.roles
+            conv.append_message(message["role"], message["text"])
+        if text:
+            assert conv.messages[-1][0] == conv.roles[1], "The last message in the history should be the assistant, if the given text is not empty"
+            conv.append_message(conv.roles[0], text)
+            conv.append_message(conv.roles[1], "")
+            history.append({"role": conv.roles[0], "text": text})
+            history.append({"role": conv.roles[1], "text": ""})
+        else:
+            if conv.messages[-1][0] == conv.roles[1]:
+                assert conv.messages[-1][1] == "", "No user message should be provided"
+            else:
+                assert conv.messages[-1][0] == conv.roles[0], "The last message in the history should be the user, if the given text is empty"
+                conv.append_message(conv.roles[0], "")
+                history.append({"role": conv.roles[0], "text": ""})
+    else:
+        history = []
+        history.append({"role": conv.roles[0], "text": text})
+        history.append({"role": conv.roles[1], "text": ""})
+        conv.append_message(conv.roles[0], text)
+        conv.append_message(conv.roles[1], "")
+    assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == "", "Format check"
+    assert history[-1]["role"] == conv.roles[1] and history[-1]["text"] == "", "Format check"
+    if images:
+        for i in range(len(images)):
+            if isinstance(images[i], str):
+                images[i] = PIL.Image.open(images[i])
+        last_prompt = history[-2]['text'].split("?")[0]
+        last_prompt = last_prompt.replace('<image>', '').strip() if '<image>' in last_prompt else last_prompt.strip()
+        keyword_prompt = conv.generate_keyword_prompt(last_prompt.replace('<image>', '').strip()) if '<image>' in last_prompt else conv.generate_keyword_prompt(last_prompt.strip())
+    else:
+        keyword_prompt = None
+    prompt = conv.get_prompt()
+    inputs = processor(images=images, text=prompt, keywords_text=keyword_prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
+    for k, v in inputs.items():
+        if v is not None:
+            if isinstance(v, torch.Tensor):
+                inputs[k] = v.to(model.device)
+            elif isinstance(v, list):
+                if k == 'coords':
+                    continue
+                inputs[k] = [x.to(model.device) for x in v]
+            elif isinstance(v, transformers.tokenization_utils_base.BatchEncoding) or isinstance(v, dict):
+                for key, value in v.items():
+                    if value is not None:
+                        if isinstance(value, list):
+                            inputs[k][key] = [x.to(model.device) for x in value]
+                        else:
+                            inputs[k][key] = value.to(model.device)
+            else:
+                raise ValueError(f"Invalid input type: {type(v)}")
+    from transformers import TextIteratorStreamer
+    from threading import Thread
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    kwargs["streamer"] = streamer
+    inputs.update(kwargs)
+    thread = Thread(target=model.generate, kwargs=inputs)
+    thread.start()
+    generator = []
+    with Capturing() as print_kw:
+        for _output in streamer:
+            history[-1]["text"] += _output
+            generator.append((history[-1]["text"], history))
+            # yield history[-1]["text"], history
+    return generator, print_kw, inputs