cagataydag and ziqingyang committed
Commit e9f3e5c · 0 Parent(s)

Duplicate from hfl/VQA_VLE_LLM

Co-authored-by: Ziqing Yang <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,38 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ qa9.jpg filter=lfs diff=lfs merge=lfs -text
+ upload4.jpg filter=lfs diff=lfs merge=lfs -text
+ pics/horses.jpg filter=lfs diff=lfs merge=lfs -text
+ pics/fish.jpg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: VQA with VLE and LLM
+ emoji: 📚
+ colorFrom: gray
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.19.1
+ app_file: app.py
+ pinned: false
+ license: openrail
+ duplicated_from: hfl/VQA_VLE_LLM
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,245 @@
1
+ import string
2
+ import gradio as gr
3
+ import requests
4
+ import torch
5
+ from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
6
+ from PIL import Image
7
+
8
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
9
+ print("device:",device)
10
+ model_name="hfl/vle-base-for-vqa"
11
+ model = VLEForVQA.from_pretrained(model_name)
12
+ vle_processor = VLEProcessor.from_pretrained(model_name)
13
+ vqa_pipeline = VLEForVQAPipeline(model=model, device=device, vle_processor=vle_processor)
14
+
15
+
16
+ from transformers import BlipProcessor, BlipForConditionalGeneration
17
+
18
+ cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
19
+ cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
20
+ print("cap_model device:",cap_model.device)
21
+ cap_model.to(device)
22
+ print("cap_model device:",cap_model.device)
23
+
24
+
25
+ def caption(input_image):
26
+ inputs = cap_processor(input_image, return_tensors="pt").to(device)
27
+ # inputs["num_beams"] = 1 # no num_beams use greedy search
28
+ # inputs['num_return_sequences'] =1
29
+ out = cap_model.generate(**inputs)
30
+ return "\n".join(cap_processor.batch_decode(out, skip_special_tokens=True))
31
+ import openai
32
+ import os
33
+ openai.api_key= os.getenv('openai_appkey')
34
+ def gpt3_short(question,vqa_answer,caption):
35
+ vqa_answer,vqa_score=vqa_answer
36
+ prompt="This is the caption of a picture: "+caption+". Question: "+question+" VQA model predicts:"+"A: "+vqa_answer[0]+", score: "+f"{vqa_score[0]:.2f}"+\
37
+ "; B: "+vqa_answer[1]+", score: "+f"{vqa_score[1]:.2f}"+"; C: "+vqa_answer[2]+", score: "+f"{vqa_score[2]:.2f}"+\
38
+ "; D: "+vqa_answer[3]+", score: "+f"{vqa_score[3]:.2f}"+\
39
+ ". Choose A if A is not in conflict with the description of the picture, otherwise A might be incorrect, and choose the B, C or D based on the description. Answer with A or B or C or D."
40
+
41
+ # prompt=caption+"\n"+question+"\n"+vqa_answer+"\n Tell me the right answer."
42
+ response = openai.Completion.create(
43
+ engine="text-davinci-003",
44
+ prompt=prompt,
45
+ max_tokens=30,
46
+ n=1,
47
+ stop=None,
48
+ temperature=0.7,
49
+ )
50
+ answer = response.choices[0].text.strip()
51
+
52
+ llm_ans=answer
53
+ choice=set(["A","B","C","D"])
54
+ llm_ans=llm_ans.replace("\n"," ").replace(":"," ").replace("."," " ).replace(","," ")
55
+ sllm_ans=llm_ans.split(" ")
56
+ for cho in sllm_ans:
57
+ if cho in choice:
58
+ llm_ans=cho
59
+ break
60
+ if llm_ans not in choice:
61
+ llm_ans="A"
62
+ llm_ans=vqa_answer[ord(llm_ans)-ord("A")]
63
+ answer=llm_ans
64
+
65
+ return answer
66
+ def gpt3_long(question,vqa_answer,caption):
67
+ vqa_answer,vqa_score=vqa_answer
68
+ # prompt="prompt: This is the caption of a picture: "+caption+". Question: "+question+" VQA model predicts:"+"A: "+vqa_answer[0]+"score:"+str(vqa_score[0])+\
69
+ # " B: "+vqa_answer[1]+" score:"+str(vqa_score[1])+" C: "+vqa_answer[2]+" score:"+str(vqa_score[2])+\
70
+ # " D: "+vqa_answer[3]+'score:'+str(vqa_score[3])+\
71
+ # "Tell me the right answer with a long sentence."
72
+
73
+ prompt="This is the caption of a picture: "+caption+". Question: "+question+" VQA model predicts:"+" "+vqa_answer[0]+", score:"+f"{vqa_score[0]:.2f}"+\
74
+ "; "+vqa_answer[1]+", score:"+f"{vqa_score[1]:.2f}"+"; "+vqa_answer[2]+", score:"+f"{vqa_score[2]:.2f}"+\
75
+ "; "+vqa_answer[3]+', score:'+f"{vqa_score[3]:.2f}"+\
76
+ ". Answer the question with a sentence without mentioning the VQA model and the score."
77
+
78
+ # prompt="prompt: This is the caption of a picture: "+caption+". Question: "+question+" VQA model predicts:"+" "+vqa_answer[0]+" score:"+str(vqa_score[0])+\
79
+ # " "+vqa_answer[1]+" score:"+str(vqa_score[1])+" "+vqa_answer[2]+" score:"+str(vqa_score[2])+\
80
+ # " "+vqa_answer[3]+'score:'+str(vqa_score[3])+\
81
+ # "Tell me the right answer with a long sentence."
82
+ # prompt=caption+"\n"+question+"\n"+vqa_answer+"\n Tell me the right answer."
83
+ response = openai.Completion.create(
84
+ engine="text-davinci-003",
85
+ prompt=prompt,
86
+ max_tokens=50,
87
+ n=1,
88
+ stop=None,
89
+ temperature=0.7,
90
+ )
91
+ answer = response.choices[0].text.strip()
92
+ return answer
93
+ def gpt3(question,vqa_answer,caption):
94
+ prompt=caption+"\n"+question+"\n"+vqa_answer+"\n Tell me the right answer."
95
+ response = openai.Completion.create(
96
+ engine="text-davinci-003",
97
+ prompt=prompt,
98
+ max_tokens=50,
99
+ n=1,
100
+ stop=None,
101
+ temperature=0.7,
102
+ )
103
+ answer = response.choices[0].text.strip()
104
+ # return "input_text:\n"+prompt+"\n\n output_answer:\n"+answer
105
+ return answer
106
+
107
+ def vle(input_image,input_text):
108
+ vqa_answers = vqa_pipeline({"image":input_image, "question":input_text}, top_k=4)
109
+ # return [" ".join([str(value) for key,value in vqa.items()] )for vqa in vqa_answers]
110
+ return [vqa['answer'] for vqa in vqa_answers],[vqa['score'] for vqa in vqa_answers]
111
+ def inference_chat(input_image,input_text):
112
+ input_text=input_text[:200]
113
+ input_text=" ".join(input_text.split(" ")[:60])
114
+ cap=caption(input_image)
115
+ # inputs = processor(images=input_image, text=input_text,return_tensors="pt")
116
+ # inputs["max_length"] = 10
117
+ # inputs["num_beams"] = 5
118
+ # inputs['num_return_sequences'] =4
119
+ # out = model_vqa.generate(**inputs)
120
+ # out=processor.batch_decode(out, skip_special_tokens=True)
121
+ print("Caption:",cap)
122
+
123
+ out=vle(input_image,input_text)
124
+
125
+ print("VQA: ",out)
126
+ # vqa="\n".join(out[0])
127
+ # gpt3_out=gpt3(input_text,vqa,cap)
128
+ gpt3_out=gpt3_long(input_text,out,cap)
129
+ # gpt3_out1=gpt3_short(input_text,out,cap)
130
+ return out[0][0], gpt3_out #,gpt3_out1
131
+
132
+ title = """<h1 align="center">VQA with VLE and LLM</h1>"""
133
+ # description = """We demonstrate three visual question answering systems built with VLE and LLM:
134
+
135
+ # 1. VQA: The image and the question are fed into a VQA model (VLEForVQA) and the model predicts the answer.
136
+ # 2. VQA+LLM: The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to generate the most reasonable answer.
137
+
138
+ # The outputs from VQA+LLM may vary due to the decoding strategy of the LLM. For more details about VLE and the VQA pipeline, see [http://vle.hfl-rc.com](http://vle.hfl-rc.com)"""
139
+
140
+ description_main="""**VLE** (Vision-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See [https://github.com/iflytek/VLE](https://github.com/iflytek/VLE) for more details.
141
+
142
+ We demonstrate visual question answering systems built with VLE and LLM."""
143
+
144
+ description_detail="""**VQA**: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.
145
+
146
+ **VQA+LLM**: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""
147
+
148
+ with gr.Blocks(
149
+ css="""
150
+ .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
151
+ #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
152
+ """
153
+ ) as iface:
154
+ state = gr.State([])
155
+ #caption_output = None
156
+ gr.Markdown(title)
157
+ gr.Markdown(description_main)
158
+ #gr.Markdown(article)
159
+
160
+ with gr.Row():
161
+ with gr.Column(scale=1):
162
+ image_input = gr.Image(type="pil",label="VQA Image Input")
163
+ with gr.Row():
164
+ with gr.Column(scale=1):
165
+ chat_input = gr.Textbox(lines=1, label="VQA Question Input")
166
+ with gr.Row():
167
+ # clear_button = gr.Button(value="Clear", interactive=True)
168
+ submit_button = gr.Button(
169
+ value="Submit", interactive=True, variant="primary"
170
+ )
171
+ '''
172
+ cap_submit_button = gr.Button(
173
+ value="Submit_CAP", interactive=True, variant="primary"
174
+ )
175
+ gpt3_submit_button = gr.Button(
176
+ value="Submit_GPT3", interactive=True, variant="primary"
177
+ )
178
+ '''
179
+ with gr.Column():
180
+ gr.Markdown(description_detail)
181
+ caption_output = gr.Textbox(lines=0, label="VQA ")
182
+ gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM")
183
+
184
+
185
+ # image_input.change(
186
+ # lambda: ("", [],"","",""),
187
+ # [],
188
+ # [ caption_output, state,caption_output,gpt3_output_v1,caption_output_v1],
189
+ # queue=False,
190
+ # )
191
+ chat_input.submit(
192
+ inference_chat,
193
+ [
194
+ image_input,
195
+ chat_input,
196
+ ],
197
+ [ caption_output,gpt3_output_v1],
198
+ )
199
+ # clear_button.click(
200
+ # lambda: ("", [],"","",""),
201
+ # [],
202
+ # [chat_input, state,caption_output,gpt3_output_v1,caption_output_v1],
203
+ # queue=False,
204
+ # )
205
+ submit_button.click(
206
+ inference_chat,
207
+ [
208
+ image_input,
209
+ chat_input,
210
+ ],
211
+ [caption_output,gpt3_output_v1],
212
+ )
213
+ '''
214
+ cap_submit_button.click(
215
+ caption,
216
+ [
217
+ image_input,
218
+
219
+ ],
220
+ [caption_output_v1],
221
+ )
222
+ gpt3_submit_button.click(
223
+ gpt3,
224
+ [
225
+ chat_input,
226
+ caption_output ,
227
+ caption_output_v1,
228
+ ],
229
+ [gpt3_output_v1],
230
+ )
231
+ '''
232
+ examples=[['pics/men.jpg',"How many people are there?","3","There are two people in the picture: a man and the driver of the truck."],
233
+ ['pics/dogs.png',"Where are the huskies?","on grass","The huskies are sitting on the grass."],
234
+ ['pics/horses.jpg',"What are the horses doing?",'walking','The horses are walking and pulling a sleigh through the snow.'],
235
+ ['pics/fish.jpg',"What is in the man's hand?","fish","The man in the hat is holding a fishing pole."],
236
+ ['pics/tower.jpg',"Where is the photo taken?","paris","The photo appears to have been taken in Paris, near the Eiffel Tower."],
237
+ ['pics/traffic.jpg',"What is this man doing?","looking","The man appears to be looking around the street."],
238
+ ['pics/chicking.jpg',"What did this animal hatch from?","farm","The animal likely hatched from a farm, ground, tree, or nest."]
239
+ ]
240
+ examples = gr.Examples(
241
+ examples=examples,inputs=[image_input, chat_input,caption_output,gpt3_output_v1],
242
+ )
243
+
244
+ iface.queue(concurrency_count=1, api_open=False, max_size=10)
245
+ iface.launch(enable_queue=True)
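The flow above (caption → VQA → prompt → LLM) can also be exercised without the Gradio UI. Below is a minimal sketch that reuses the `vle()` and `caption()` helpers defined in app.py and assumes `pics/dogs.png` is available locally; it builds the same kind of prompt that `gpt3_long()` sends to `text-davinci-003`, but only prints it rather than calling the OpenAI API.

```python
# Sketch only: assumes the definitions from app.py are already loaded.
from PIL import Image

image = Image.open("pics/dogs.png")
question = "Where are the huskies?"

answers, scores = vle(image, question)   # top-4 VQA answers and their scores
prompt = (
    "This is the caption of a picture: " + caption(image) +
    ". Question: " + question +
    " VQA model predicts: " +
    "; ".join(f"{a}, score:{s:.2f}" for a, s in zip(answers, scores)) +
    ". Answer the question with a sentence without mentioning the VQA model and the score."
)
print(prompt)   # in inference_chat(), a prompt like this is sent to the LLM
```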
models/VLE/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from .modeling_vle import (
+     VLEModel,
+     VLEForVQA,
+     VLEForITM,
+     VLEForMLM,
+     VLEForPBC
+ )
+
+ from .configuration_vle import VLEConfig
+ from .processing_vle import VLEProcessor
+ from .pipeline_vle import VLEForVQAPipeline, VLEForITMPipeline, VLEForPBCPipeline
models/VLE/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (498 Bytes).
 
models/VLE/__pycache__/configuration_vle.cpython-39.pyc ADDED
Binary file (4.27 kB).
 
models/VLE/__pycache__/modeling_vle.cpython-39.pyc ADDED
Binary file (18.5 kB).
 
models/VLE/__pycache__/pipeline_vle.cpython-39.pyc ADDED
Binary file (6.38 kB).
 
models/VLE/__pycache__/processing_vle.cpython-39.pyc ADDED
Binary file (6.16 kB).
 
models/VLE/configuration_vle.py ADDED
@@ -0,0 +1,143 @@
1
+ # coding=utf-8
2
+ # Copyright The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ VLE model configuration"""
16
+
17
+ import copy
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+ from transformers.models.auto.configuration_auto import AutoConfig
22
+ from transformers.models.clip.configuration_clip import CLIPVisionConfig
23
+ from typing import Union, Dict
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ class VLEConfig(PretrainedConfig):
29
+ r"""
30
+ [`VLEConfig`] is the configuration class to store the configuration of a
31
+ [`VLEModel`]. It is used to instantiate [`VLEModel`] model according to the
32
+ specified arguments, defining the text model and vision model configs.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+ Args:
38
+ text_config (`dict`):
39
+ Dictionary of configuration options that defines text model config.
40
+ vision_config (`dict`):
41
+ Dictionary of configuration options that defines vison model config.
42
+ #TODO
43
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
44
+ The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation.
45
+ kwargs (*optional*):
46
+ Dictionary of keyword arguments.
47
+
48
+ Examples:
49
+
50
+ ```python
51
+ >>> from transformers import ViTConfig, BertConfig
52
+ >>> from configuration_vle import VLEConfig
53
+ >>> from modeling_vle import VLEModel
54
+ >>> # Initializing a BERT and ViT configuration
55
+ >>> config_vision = ViTConfig()
56
+ >>> config_text = BertConfig()
57
+
58
+ >>> config = VLEConfig.from_vision_text_configs(config_vision, config_text) #TODO
59
+
60
+ >>> # Initializing a BERT and ViT model (with random weights)
61
+ >>> model = VLEModel(config=config)
62
+
63
+ >>> # Accessing the model configuration
64
+ >>> config_vision = model.config.vision_config
65
+ >>> config_text = model.config.text_config
66
+
67
+ >>> # Saving the model, including its configuration
68
+ >>> model.save_pretrained("vit-bert")
69
+
70
+ >>> # loading model and config from pretrained folder
71
+ >>> vision_text_config = VLEConfig.from_pretrained("vit-bert")
72
+ >>> model = VLEModel.from_pretrained("vit-bert", config=vision_text_config)
73
+ ```"""
74
+
75
+ model_type = "vle"
76
+ is_composition = True
77
+
78
+ def __init__(
79
+ self,
80
+ text_config: Union[PretrainedConfig, Dict],
81
+ vision_config: Union[PretrainedConfig, Dict],
82
+ num_token_types=2,
83
+ hidden_size=768,
84
+ num_hidden_layers=6,
85
+ num_attention_heads=12,
86
+ intermediate_size=3072,
87
+ hidden_act="gelu",
88
+ hidden_dropout_prob=0.1,
89
+ attention_probs_dropout_prob=0.1,
90
+ initializer_range=0.02,
91
+ layer_norm_eps=1e-12,
92
+ classifier_dropout=None,
93
+ **kwargs):
94
+ super().__init__(**kwargs)
95
+
96
+ if not isinstance(text_config,PretrainedConfig):
97
+ text_model_type = text_config.pop('model_type')
98
+ text_config = AutoConfig.for_model(text_model_type, **text_config)
99
+ self.text_config = text_config
100
+
101
+ if not isinstance(vision_config, PretrainedConfig):
102
+ vision_model_type = vision_config.pop('model_type')
103
+ if vision_model_type == "clip":
104
+ vision_config = AutoConfig.for_model(vision_model_type, **vision_config).vision_config
105
+ elif vision_model_type == "clip_vision_model":
106
+ vision_config = CLIPVisionConfig(**vision_config)
107
+ else:
108
+ vision_config = AutoConfig.for_model(vision_model_type, **vision_config)
109
+ self.vision_config = vision_config
110
+ else:
111
+ vision_model_type = vision_config.model_type
112
+ if vision_model_type== "clip":
113
+ vision_config = vision_config.vision_config
114
+ self.vision_config = vision_config
115
+
116
+
117
+
118
+ # co-attention
119
+ self.num_token_types=num_token_types
120
+ self.hidden_size=hidden_size
121
+ self.num_hidden_layers=num_hidden_layers
122
+ self.num_attention_heads=num_attention_heads
123
+ self.intermediate_size=intermediate_size
124
+ self.hidden_act=hidden_act
125
+ self.hidden_dropout_prob=hidden_dropout_prob
126
+ self.attention_probs_dropout_prob=attention_probs_dropout_prob
127
+ self.initializer_range=initializer_range
128
+ self.layer_norm_eps=layer_norm_eps
129
+ self.classifier_dropout=classifier_dropout
130
+
131
+
132
+ def to_dict(self):
133
+ """
134
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
135
+
136
+ Returns:
137
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
138
+ """
139
+ output = copy.deepcopy(self.__dict__)
140
+ output["vision_config"] = self.vision_config.to_dict()
141
+ output["text_config"] = self.text_config.to_dict()
142
+ output["model_type"] = self.__class__.model_type
143
+ return output
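As a quick illustration of how `VLEConfig.__init__` above accepts either plain dicts or `PretrainedConfig` objects for its two sub-configs, here is a minimal sketch; the choice of a DeBERTa-v2 text config and a CLIP vision config is an assumption for illustration only.

```python
# Sketch: build a VLEConfig from ready-made config objects and serialize it.
from transformers import CLIPVisionConfig, DebertaV2Config
from models.VLE import VLEConfig

config = VLEConfig(
    text_config=DebertaV2Config(),     # taken as-is (already a PretrainedConfig)
    vision_config=CLIPVisionConfig(),  # model_type "clip_vision_model" is kept directly
)

d = config.to_dict()                   # nested configs are serialized too
print(d["model_type"], d["text_config"]["model_type"], d["vision_config"]["model_type"])
```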
models/VLE/modeling_vle.py ADDED
@@ -0,0 +1,709 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch VLE model."""
16
+
17
+
18
+ from typing import Optional, Tuple, Union
19
+
20
+ import torch
21
+ from torch import nn
22
+
23
+ from transformers.modeling_utils import PreTrainedModel
24
+ from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, ModelOutput
25
+ from transformers.models.auto.configuration_auto import AutoConfig
26
+ from transformers.models.auto.modeling_auto import AutoModel
27
+
28
+ from transformers.models.bert.modeling_bert import BertAttention, BertIntermediate, BertOutput, apply_chunking_to_forward
29
+ from transformers.models.clip.modeling_clip import CLIPOutput, CLIPVisionConfig, CLIPVisionModel
30
+ from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2OnlyMLMHead
31
+ from .configuration_vle import VLEConfig
32
+ from dataclasses import dataclass
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+ _CONFIG_FOR_DOC = "VLEConfig"
37
+
38
+
39
+ @dataclass
40
+ class VLEModelOutput(ModelOutput):
41
+
42
+ pooler_output: torch.FloatTensor = None
43
+ text_embeds: torch.FloatTensor = None
44
+ image_embeds: torch.FloatTensor = None
45
+
46
+
47
+ @dataclass
48
+ class VLEForITMOutput(ModelOutput):
49
+
50
+ loss: torch.FloatTensor = None
51
+ logits: torch.FloatTensor = None
52
+
53
+ @dataclass
54
+ class VLEForPBCOutput(ModelOutput):
55
+
56
+ loss: torch.FloatTensor = None
57
+ logits: torch.FloatTensor = None
58
+
59
+ @dataclass
60
+ class VLEForMLMOutput(ModelOutput):
61
+
62
+ loss: torch.FloatTensor = None
63
+ logits: torch.FloatTensor = None
64
+
65
+ @dataclass
66
+ class VLEForVQAOutput(ModelOutput):
67
+
68
+ loss : torch.FloatTensor = None
69
+ logits: torch.FloatTensor = None
70
+
71
+ class ITMHead(nn.Module):
72
+ def __init__(self, hidden_size):
73
+ super().__init__()
74
+ self.fc = nn.Linear(hidden_size, 2)
75
+
76
+ def forward(self, x):
77
+ x = self.fc(x)
78
+ return x
79
+
80
+
81
+ def extend_position_embedding(state_dict, patch_size, after):
82
+ """
83
+ modify state_dict in-place for longer position embeddings
84
+ """
85
+ keys = {}
86
+ for k,v in state_dict.items():
87
+ if k.endswith('vision_model.embeddings.position_embedding.weight'):
88
+ assert k not in keys
89
+ keys['pe'] = (k,v)
90
+ if k.endswith('vision_model.embeddings.position_ids'):
91
+ assert k not in keys
92
+ keys['pi'] = (k,v)
93
+
94
+ pe_weight = keys['pe'][1]
95
+ position_length_before = pe_weight.shape[0]
96
+ embed_dim = pe_weight.shape[1]
97
+ grid_before = position_length_before - 1
98
+ position_length_after = (after // patch_size) ** 2 + 1
99
+ grid_after = position_length_after - 1
100
+
101
+ new_pe_weight = pe_weight[1:].reshape((grid_before,grid_before,-1))
102
+ new_pe_weight = torch.nn.functional.interpolate(
103
+ new_pe_weight.permute(2,0,1).unsqueeze(0),
104
+ size = (grid_after,grid_after), mode = 'bicubic')
105
+ new_pe_weight = new_pe_weight.squeeze(0).permute(1,2,0).reshape(grid_after*grid_after, -1)
106
+ new_pe_weight = torch.cat((pe_weight[0:1],new_pe_weight), dim=0)
107
+ assert new_pe_weight.shape == (grid_after*grid_after + 1, embed_dim)
108
+
109
+ state_dict[keys['pe'][0]] = new_pe_weight
110
+ state_dict[keys['pi'][0]] = torch.arange(grid_after*grid_after + 1).unsqueeze(0)
111
+ return state_dict
112
+
113
+
114
+ class Pooler(nn.Module):
115
+ def __init__(self, hidden_size):
116
+ super().__init__()
117
+ self.dense = nn.Linear(hidden_size, hidden_size)
118
+ self.activation = nn.Tanh()
119
+
120
+ def forward(self, hidden_states):
121
+ first_token_tensor = hidden_states[:, 0]
122
+ pooled_output = self.dense(first_token_tensor)
123
+ pooled_output = self.activation(pooled_output)
124
+ return pooled_output
125
+
126
+
127
+ class BertCrossLayer(nn.Module):
128
+ def __init__(self, config):
129
+ super().__init__()
130
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
131
+ self.seq_len_dim = 1
132
+ self.attention = BertAttention(config)
133
+ self.is_decoder = config.is_decoder
134
+ self.add_cross_attention = config.add_cross_attention
135
+ self.crossattention = BertAttention(config)
136
+ self.intermediate = BertIntermediate(config)
137
+ self.output = BertOutput(config)
138
+
139
+ def forward(
140
+ self,
141
+ hidden_states,
142
+ encoder_hidden_states,
143
+ attention_mask=None,
144
+ encoder_attention_mask=None,
145
+ output_attentions=False,
146
+ ):
147
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
148
+ self_attn_past_key_value = None #past_key_value[:2] if past_key_value is not None else None
149
+ self_attention_outputs = self.attention(
150
+ hidden_states,
151
+ attention_mask,
152
+ head_mask=None,
153
+ output_attentions=output_attentions,
154
+ past_key_value=None,
155
+ )
156
+ attention_output = self_attention_outputs[0]
157
+
158
+ # if decoder, the last output is tuple of self-attn cache
159
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
160
+
161
+ cross_attn_present_key_value = None
162
+ cross_attention_outputs = self.crossattention(
163
+ attention_output,
164
+ attention_mask,
165
+ None,
166
+ encoder_hidden_states,
167
+ encoder_attention_mask,
168
+ None,
169
+ output_attentions,
170
+ )
171
+ attention_output = cross_attention_outputs[0]
172
+ outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights
173
+
174
+ layer_output = apply_chunking_to_forward(
175
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
176
+ )
177
+ outputs = (layer_output,) + outputs
178
+
179
+ return outputs
180
+
181
+ def feed_forward_chunk(self, attention_output):
182
+ intermediate_output = self.intermediate(attention_output)
183
+ layer_output = self.output(intermediate_output, attention_output)
184
+ return layer_output
185
+
186
+
187
+ class VLEPreTrainedModel(PreTrainedModel):
188
+ """
189
+ An abstract class to handle weights initialization.
190
+ """
191
+
192
+ config_class = VLEConfig
193
+ base_model_prefix = "vle"
194
+ supports_gradient_checkpointing = False
195
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
196
+
197
+ def _init_weights(self, module):
198
+ """Initialize the weights"""
199
+ if isinstance(module, nn.Linear):
200
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
201
+ if module.bias is not None:
202
+ module.bias.data.zero_()
203
+ elif isinstance(module, nn.Embedding):
204
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
205
+ if module.padding_idx is not None:
206
+ module.weight.data[module.padding_idx].zero_()
207
+ elif isinstance(module, nn.LayerNorm):
208
+ module.bias.data.zero_()
209
+ module.weight.data.fill_(1.0)
210
+ ''' TODO checkpointing
211
+ def _set_gradient_checkpointing(self, module, value=False):
212
+ if isinstance(module, BertEncoder):
213
+ module.gradient_checkpointing = value
214
+ '''
215
+
216
+ class VLEModel(VLEPreTrainedModel):
217
+ def __init__(
218
+ self,
219
+ config: Optional[VLEConfig] = None,
220
+ vision_model: Optional[PreTrainedModel] = None,
221
+ text_model: Optional[PreTrainedModel] = None,
222
+ ):
223
+
224
+ if config is None and (vision_model is None or text_model is None):
225
+ raise ValueError("Either a configuration or an vision and a text model has to be provided")
226
+
227
+ if config is None:
228
+ config = VLEConfig(text_config=text_model.config, vision_config=vision_model.config)
229
+ else:
230
+ if not isinstance(config, self.config_class):
231
+ raise ValueError(f"config: {config} has to be of type {self.config_class}")
232
+
233
+ # initialize with config
234
+ super().__init__(config)
235
+
236
+ if vision_model is None:
237
+ if isinstance(config.vision_config, CLIPVisionConfig):
238
+ vision_model = CLIPVisionModel(config.vision_config)
239
+ else:
240
+ vision_model = AutoModel.from_config(config.vision_config)
241
+
242
+ if text_model is None:
243
+ text_model = AutoModel.from_config(config.text_config)
244
+
245
+ self.vision_model = vision_model
246
+ self.text_model = text_model
247
+
248
+ # make sure that the individual model's config refers to the shared config
249
+ # so that the updates to the config will be synced
250
+ self.vision_model.config = self.config.vision_config
251
+ self.text_model.config = self.config.text_config
252
+
253
+ self.vision_embed_dim = config.vision_config.hidden_size
254
+ self.text_embed_dim = config.text_config.hidden_size
255
+ self.coattention_dim = config.hidden_size
256
+
257
+ # add projection layers
258
+ self.text_projection_layer = nn.Linear(self.text_embed_dim, self.coattention_dim)
259
+ self.image_projection_layer = nn.Linear(self.vision_embed_dim, self.coattention_dim)
260
+
261
+ #self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
262
+ self.token_type_embeddings = nn.Embedding(config.num_token_types, config.hidden_size)
263
+
264
+ self.cross_modal_image_layers = nn.ModuleList([BertCrossLayer(config) for _ in range(config.num_hidden_layers)])
265
+ self.cross_modal_text_layers = nn.ModuleList([BertCrossLayer(config) for _ in range(config.num_hidden_layers)])
266
+ self.cross_modal_image_pooler = Pooler(config.hidden_size)
267
+ self.cross_modal_text_pooler = Pooler(config.hidden_size)
268
+
269
+ # Initialize weights and apply final processing
270
+ self.token_type_embeddings.apply(self._init_weights)
271
+ self.cross_modal_image_layers.apply(self._init_weights)
272
+ self.cross_modal_text_layers.apply(self._init_weights)
273
+ self.cross_modal_image_pooler.apply(self._init_weights)
274
+ self.cross_modal_text_pooler.apply(self._init_weights)
275
+ if hasattr(self,"text_projection_layer"):
276
+ self.text_projection_layer.apply(self._init_weights)
277
+ if hasattr(self,"image_projection_layer"):
278
+ self.image_projection_layer.apply(self._init_weights)
279
+
280
+
281
+ def forward(
282
+ self,
283
+ input_ids: Optional[torch.LongTensor] = None,
284
+ pixel_values: Optional[torch.FloatTensor] = None,
285
+ attention_mask: Optional[torch.Tensor] = None,
286
+ position_ids: Optional[torch.LongTensor] = None,
287
+ token_type_ids: Optional[torch.LongTensor] = None,
288
+ patch_ids = None,
289
+ return_loss: Optional[bool] = None,
290
+ return_dict: Optional[bool] = None,
291
+ ) -> Union[Tuple[torch.Tensor], VLEModelOutput]:
292
+
293
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
294
+
295
+ vision_outputs = self.vision_model(
296
+ pixel_values=pixel_values,
297
+ return_dict=return_dict,
298
+ )
299
+
300
+ text_outputs = self.text_model(
301
+ input_ids=input_ids,
302
+ attention_mask=attention_mask,
303
+ token_type_ids=token_type_ids,
304
+ position_ids=position_ids,
305
+ return_dict=return_dict,
306
+ )
307
+
308
+ image_embeds = self.vision_model.vision_model.post_layernorm(vision_outputs[0]) # last_hidden_state
309
+ image_embeds = self.image_projection_layer(image_embeds)
310
+
311
+ text_embeds = text_outputs[0] # last_hidden_state
312
+ text_embeds = self.text_projection_layer(text_embeds)
313
+
314
+ if patch_ids is not None:
315
+ raise NotImplementedError #TODO
316
+
317
+ image_masks = torch.ones((image_embeds.size(0), image_embeds.size(1)), dtype=torch.long, device=image_embeds.device)
318
+ extend_image_masks = self.text_model.get_extended_attention_mask(image_masks, image_masks.size())
319
+ image_embeds = image_embeds + self.token_type_embeddings(torch.full_like(image_masks, 1)) # image_token_type_idx=1 TODO use_vcr_token_type_embedding
320
+
321
+ extend_text_masks = self.text_model.get_extended_attention_mask(attention_mask, attention_mask.size())
322
+ text_embeds = text_embeds + self.token_type_embeddings(torch.zeros_like(attention_mask))
323
+
324
+ x, y = text_embeds, image_embeds
325
+ for text_layer, image_layer in zip(self.cross_modal_text_layers, self.cross_modal_image_layers):
326
+ x1 = text_layer(x, y, extend_text_masks, extend_image_masks)
327
+ y1 = image_layer(y, x, extend_image_masks, extend_text_masks)
328
+ x, y = x1[0], y1[0]
329
+
330
+ text_embeds, image_embeds = x, y
331
+ text_pooler_output = self.cross_modal_text_pooler(x)
332
+ image_pooler_output = self.cross_modal_image_pooler(y)
333
+ pooler_output = torch.cat([text_pooler_output, image_pooler_output], dim=-1)
334
+
335
+ if not return_dict:
336
+ output = (pooler_output, text_embeds, image_embeds)
337
+ return output
338
+ return VLEModelOutput(
339
+ pooler_output = pooler_output,
340
+ text_embeds = text_embeds,
341
+ image_embeds = image_embeds
342
+ )
343
+
344
+
345
+ @classmethod
346
+ def from_pretrained(cls, *args, **kwargs):
347
+ # At the moment fast initialization is not supported
348
+ # for composite models
349
+ kwargs["_fast_init"] = False
350
+ return super().from_pretrained(*args, **kwargs)
351
+
352
+ @classmethod
353
+ def from_vision_text_pretrained(
354
+ cls,
355
+ vision_model_name_or_path: str = None,
356
+ text_model_name_or_path: str = None,
357
+ *model_args,
358
+ **kwargs,
359
+ ) -> PreTrainedModel:
360
+
361
+ kwargs_vision = {
362
+ argument[len("vision_") :]: value for argument, value in kwargs.items() if argument.startswith("vision_")
363
+ }
364
+
365
+ kwargs_text = {
366
+ argument[len("text_") :]: value for argument, value in kwargs.items() if argument.startswith("text_")
367
+ }
368
+
369
+ # remove vision, text kwargs from kwargs
370
+ for key in kwargs_vision.keys():
371
+ del kwargs["vision_" + key]
372
+ for key in kwargs_text.keys():
373
+ del kwargs["text_" + key]
374
+
375
+ # Load and initialize the vision and text model
376
+ vision_model = kwargs_vision.pop("model", None)
377
+ if vision_model is None:
378
+ if vision_model_name_or_path is None:
379
+ raise ValueError(
380
+ "If `vision_model` is not defined as an argument, a `vision_model_name_or_path` has to be defined"
381
+ )
382
+
383
+ if "config" not in kwargs_vision:
384
+ vision_config = AutoConfig.from_pretrained(vision_model_name_or_path)
385
+
386
+ if vision_config.model_type == "clip":
387
+ kwargs_vision["config"] = vision_config.vision_config
388
+ vision_model = CLIPVisionModel.from_pretrained(vision_model_name_or_path, *model_args, **kwargs_vision)
389
+ else:
390
+ kwargs_vision["config"] = vision_config
391
+ vision_model = AutoModel.from_pretrained(vision_model_name_or_path, *model_args, **kwargs_vision)
392
+
393
+ text_model = kwargs_text.pop("model", None)
394
+ if text_model is None:
395
+ if text_model_name_or_path is None:
396
+ raise ValueError(
397
+ "If `text_model` is not defined as an argument, a `text_model_name_or_path` has to be defined"
398
+ )
399
+
400
+ if "config" not in kwargs_text:
401
+ text_config = AutoConfig.from_pretrained(text_model_name_or_path)
402
+ kwargs_text["config"] = text_config
403
+
404
+ text_model = AutoModel.from_pretrained(text_model_name_or_path, *model_args, **kwargs_text)
405
+
406
+ # instantiate config with corresponding kwargs
407
+ config = VLEConfig(text_config=text_model.config, vision_config=vision_model.config, **kwargs)
408
+
409
+ # init model
410
+ model = cls(config=config, vision_model=vision_model, text_model=text_model)
411
+
412
+ # the projection layers are always newly initialized when loading the model
413
+ # using pre-trained vision and text model.
414
+ logger.warning(
415
+ "The coattention layers and projection layers are newly initialized. You should probably TRAIN this model on a down-stream task to be"
416
+ " able to use it for predictions and inference."
417
+ )
418
+ return model
419
+
420
+
421
+ def get_text_features(
422
+ self,
423
+ input_ids=None,
424
+ attention_mask=None,
425
+ position_ids=None,
426
+ token_type_ids=None,
427
+ output_attentions=None,
428
+ output_hidden_states=None,
429
+ return_dict=None,
430
+ ):
431
+ text_outputs = self.text_model(
432
+ input_ids=input_ids,
433
+ attention_mask=attention_mask,
434
+ position_ids=position_ids,
435
+ token_type_ids=token_type_ids,
436
+ #output_attentions=output_attentions,
437
+ #output_hidden_states=output_hidden_states,
438
+ return_dict=return_dict,
439
+ )
440
+ return text_outputs[0] # last_hidden_state
441
+
442
+ def get_image_features(
443
+ self,
444
+ pixel_values=None,
445
+ output_attentions=None,
446
+ output_hidden_states=None,
447
+ return_dict=None,
448
+ ):
449
+ r"""
450
+ Returns:
451
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
452
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
453
+
454
+ Examples:
455
+
456
+ ```python
457
+ >>> from PIL import Image
458
+ >>> import requests
459
+ >>> from transformers import VLEModel, AutoImageProcessor
460
+
461
+ >>> model = VLEModel.from_pretrained("clip-italian/clip-italian")
462
+ >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
463
+
464
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
465
+ >>> image = Image.open(requests.get(url, stream=True).raw)
466
+
467
+ >>> inputs = image_processor(images=image, return_tensors="pt")
468
+
469
+ >>> image_features = model.get_image_features(**inputs)
470
+ ```"""
471
+ vision_outputs = self.vision_model(
472
+ pixel_values=pixel_values,
473
+ #output_attentions=output_attentions,
474
+ #output_hidden_states=output_hidden_states,
475
+ return_dict=return_dict,
476
+ )
477
+ last_hidden_state = self.vision_model.vision_model.post_layernorm(vision_outputs[0])
478
+ return last_hidden_state
479
+ def get_input_embeddings(self):
480
+ return self.text_model.embeddings.word_embeddings
481
+
482
+ def set_input_embeddings(self, new_embeddings):
483
+ self.text_model.embeddings.word_embeddings = new_embeddings
484
+
485
+ class VLEForVQA(VLEPreTrainedModel):
486
+ def __init__(
487
+ self,
488
+ config: Optional[VLEConfig] = None,
489
+ vision_model: Optional[PreTrainedModel] = None,
490
+ text_model: Optional[PreTrainedModel] = None,
491
+ ):
492
+ super().__init__(config)
493
+ self.vle = VLEModel(config, vision_model, text_model)
494
+
495
+ hidden_size = config.hidden_size
496
+ self.num_vqa_labels = len(self.config.id2label)
497
+ self.vqa_classifier = nn.Sequential(
498
+ nn.Linear(hidden_size * 2, hidden_size * 2),
499
+ nn.LayerNorm(hidden_size * 2),
500
+ nn.GELU(),
501
+ nn.Linear(hidden_size * 2, self.num_vqa_labels),
502
+ )
503
+ self.vqa_classifier.apply(self._init_weights)
504
+
505
+ def forward(self,
506
+ input_ids: Optional[torch.LongTensor],
507
+ pixel_values: Optional[torch.FloatTensor],
508
+ attention_mask: Optional[torch.Tensor] = None,
509
+ position_ids: Optional[torch.LongTensor] = None,
510
+ token_type_ids: Optional[torch.LongTensor] = None,
511
+ patch_ids = None,
512
+ vqa_labels = None,
513
+ vqa_scores = None,
514
+ return_loss: Optional[bool] = None,
515
+ return_dict: Optional[bool] = None,
516
+ ) -> Union[Tuple[torch.Tensor], VLEForVQAOutput]:
517
+
518
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
519
+
520
+ vle_output = self.vle(
521
+ input_ids = input_ids,
522
+ pixel_values = pixel_values,
523
+ attention_mask = attention_mask,
524
+ position_ids = position_ids,
525
+ token_type_ids = token_type_ids,
526
+ patch_ids = patch_ids,)
527
+ pooler_output = vle_output[0]
528
+ vqa_logits = self.vqa_classifier(pooler_output)
529
+
530
+
531
+ vqa_loss = None
532
+ if return_loss and vqa_labels is not None and vqa_scores is not None:
533
+ vqa_targets = torch.zeros(len(vqa_logits), self.num_vqa_labels,device=vqa_logits.device)
534
+ for i, (_label, _score) in enumerate(zip(vqa_labels, vqa_scores)):
535
+ for l, s in zip(_label, _score):
536
+ vqa_targets[i, l] = s
537
+ vqa_loss = nn.functional.binary_cross_entropy_with_logits(vqa_logits, vqa_targets) * vqa_targets.shape[1]
538
+ # https://github.com/jnhwkim/ban-vqa/blob/master/train.py#L19
539
+
540
+ if not return_dict:
541
+ output = (vqa_logits,)
542
+ return ((vqa_loss,) + output) if vqa_loss is not None else output
543
+ return VLEForVQAOutput(
544
+ loss = vqa_loss,
545
+ logits = vqa_logits
546
+ )
547
+
548
+
549
+ class VLEForITM(VLEPreTrainedModel):
550
+ def __init__(
551
+ self,
552
+ config: Optional[VLEConfig] = None,
553
+ vision_model: Optional[PreTrainedModel] = None,
554
+ text_model: Optional[PreTrainedModel] = None,
555
+ ):
556
+ super().__init__(config)
557
+ self.vle = VLEModel(config, vision_model, text_model)
558
+
559
+ hidden_size = config.hidden_size
560
+ self.itm_score = ITMHead(hidden_size*2)
561
+ self.itm_score.apply(self._init_weights)
562
+
563
+ def forward(self,
564
+ input_ids: Optional[torch.LongTensor],
565
+ pixel_values: Optional[torch.FloatTensor],
566
+ attention_mask: Optional[torch.Tensor] = None,
567
+ position_ids: Optional[torch.LongTensor] = None,
568
+ token_type_ids: Optional[torch.LongTensor] = None,
569
+ patch_ids = None,
570
+ itm_labels = None,
571
+ return_loss: Optional[bool] = None,
572
+ return_dict: Optional[bool] = None,
573
+ ) -> Union[Tuple[torch.Tensor], VLEForITMOutput]:
574
+
575
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
576
+
577
+ vle_output = self.vle(
578
+ input_ids = input_ids,
579
+ pixel_values = pixel_values,
580
+ attention_mask = attention_mask,
581
+ position_ids = position_ids,
582
+ token_type_ids = token_type_ids,
583
+ patch_ids = patch_ids,)
584
+ pooler_output = vle_output[0]
585
+
586
+ itm_logits = self.itm_score(pooler_output)
587
+ itm_loss = None
588
+ if return_loss and itm_labels is not None:
589
+ itm_loss = nn.functional.cross_entropy(itm_logits, torch.tensor(itm_labels).long().to(itm_logits.device))
590
+ if not return_dict:
591
+ output = (itm_logits,)
592
+ return ((itm_loss,) + output) if itm_loss is not None else output
593
+ return VLEForITMOutput(loss = itm_loss, logits = itm_logits)
594
+
595
+
596
+ class VLEForPBC(VLEPreTrainedModel):
597
+ def __init__(
598
+ self,
599
+ config: Optional[VLEConfig] = None,
600
+ vision_model: Optional[PreTrainedModel] = None,
601
+ text_model: Optional[PreTrainedModel] = None,
602
+ ):
603
+ super().__init__(config)
604
+ self.vle = VLEModel(config, vision_model, text_model)
605
+
606
+ hidden_size = config.hidden_size
607
+ self.pbc_classifier = nn.Sequential(
608
+ nn.Linear(hidden_size, hidden_size),
609
+ nn.LayerNorm(hidden_size),
610
+ nn.GELU(),
611
+ nn.Linear(hidden_size, 2),
612
+ )
613
+ self.pbc_classifier.apply(self._init_weights)
614
+
615
+ def forward(self,
616
+ input_ids: Optional[torch.LongTensor],
617
+ pixel_values: Optional[torch.FloatTensor],
618
+ attention_mask: Optional[torch.Tensor] = None,
619
+ position_ids: Optional[torch.LongTensor] = None,
620
+ token_type_ids: Optional[torch.LongTensor] = None,
621
+ patch_ids = None,
622
+ pbc_labels = None,
623
+ return_loss: Optional[bool] = None,
624
+ return_dict: Optional[bool] = None,
625
+ ) -> Union[Tuple[torch.Tensor], VLEForPBCOutput]:
626
+
627
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
628
+
629
+ vle_output = self.vle(
630
+ input_ids = input_ids,
631
+ pixel_values = pixel_values,
632
+ attention_mask = attention_mask,
633
+ position_ids = position_ids,
634
+ token_type_ids = token_type_ids,
635
+ patch_ids = patch_ids,)
636
+ image_embeds = vle_output['image_embeds']
637
+ pbc_logits = self.pbc_classifier(image_embeds[:,1:,:])
638
+
639
+ pbc_loss = None
640
+ if return_loss and pbc_labels is not None:
641
+ pbc_loss = nn.functional.cross_entropy(pbc_logits, torch.tensor(pbc_labels).long().to(pbc_logits.device))
642
+
643
+ if not return_dict:
644
+ output = (pbc_logits,)
645
+ return ((pbc_loss,) + output) if pbc_loss is not None else output
646
+ return VLEForPBCOutput(loss = pbc_loss, logits = pbc_logits)
647
+
648
+
649
+ class VLEForMLM(VLEPreTrainedModel):
650
+ _keys_to_ignore_on_load_missing = [r"mlm_score.1.predictions.decoder.weight",r"mlm_score.1.predictions.decoder.bias"]
651
+ def __init__(
652
+ self,
653
+ config: Optional[VLEConfig] = None,
654
+ vision_model: Optional[PreTrainedModel] = None,
655
+ text_model: Optional[PreTrainedModel] = None,
656
+ ):
657
+ super().__init__(config)
658
+ self.vle = VLEModel(config, vision_model, text_model)
659
+
660
+ hidden_size = config.hidden_size
661
+ mlm_head = DebertaV2OnlyMLMHead(self.config.text_config)
662
+ mlm_transform = nn.Linear(hidden_size, self.config.text_config.hidden_size)
663
+ self.mlm_score = nn.Sequential(
664
+ mlm_transform,
665
+ mlm_head,
666
+ )
667
+
668
+ def forward(self,
669
+ input_ids: Optional[torch.LongTensor],
670
+ pixel_values: Optional[torch.FloatTensor],
671
+ attention_mask: Optional[torch.Tensor] = None,
672
+ position_ids: Optional[torch.LongTensor] = None,
673
+ token_type_ids: Optional[torch.LongTensor] = None,
674
+ patch_ids = None,
675
+ mlm_labels = None,
676
+ return_loss: Optional[bool] = None,
677
+ return_dict: Optional[bool] = None,
678
+ ) -> Union[Tuple[torch.Tensor], VLEForMLMOutput]:
679
+
680
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
681
+
682
+ vle_output = self.vle(
683
+ input_ids = input_ids,
684
+ pixel_values = pixel_values,
685
+ attention_mask = attention_mask,
686
+ position_ids = position_ids,
687
+ token_type_ids = token_type_ids,
688
+ patch_ids = patch_ids,)
689
+ text_feats = vle_output.text_embeds
690
+
691
+ mlm_logits = self.mlm_score(text_feats)
692
+ mlm_loss = None
693
+ if return_loss and mlm_labels is not None:
694
+ mlm_loss = nn.functional.cross_entropy(
695
+ mlm_logits.view(-1, self.config.text_config.vocab_size),
696
+ mlm_labels.view(-1),
697
+ ignore_index=-100,
698
+ )
699
+ if not return_dict:
700
+ output = (mlm_logits,)
701
+ return ((mlm_loss,) + output) if mlm_loss is not None else output
702
+ return VLEForMLMOutput(loss = mlm_loss, logits = mlm_logits)
703
+
704
+
705
+ def get_output_embeddings(self):
706
+ return self.mlm_score[1].predictions.decoder
707
+
708
+ def set_output_embeddings(self, new_embeddings):
709
+ self.mlm_score[1].predictions.decoder = new_embeddings
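`VLEModel.from_vision_text_pretrained` above stitches a pretrained vision encoder and a pretrained text encoder together and leaves the co-attention and projection layers randomly initialized (hence the warning it logs). A minimal sketch follows; the two encoder checkpoint names are illustrative assumptions, not necessarily the ones used to train VLE.

```python
# Sketch: assemble a fresh VLEModel from two separately pretrained encoders.
from models.VLE import VLEModel, VLEForVQA

model = VLEModel.from_vision_text_pretrained(
    "openai/clip-vit-base-patch16",   # vision: model_type "clip" -> CLIPVisionModel branch
    "microsoft/deberta-v3-base",      # text: loaded via AutoModel
)

# The co-attention/projection layers above are untrained; for inference,
# load a full checkpoint instead, as app.py does:
vqa_model = VLEForVQA.from_pretrained("hfl/vle-base-for-vqa")
```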
models/VLE/pipeline_vle.py ADDED
@@ -0,0 +1,166 @@
1
+ import torch
2
+ from transformers import Pipeline
3
+ from PIL import Image
4
+ from typing import Union
5
+ from copy import deepcopy
6
+ import matplotlib.pyplot as plt
7
+ import io
8
+
9
+ class VLEForVQAPipeline(Pipeline):
10
+
11
+ def __init__(self, vle_processor, *args, **kwargs):
12
+ self.vle_processor = vle_processor
13
+ super().__init__(*args, **kwargs)
14
+
15
+ def _sanitize_parameters(self, top_k=None, **kwargs):
16
+ preprocess_params, forward_params, postprocess_params = {}, {}, {}
17
+ if top_k is not None:
18
+ postprocess_params["top_k"] = top_k
19
+ return preprocess_params, forward_params, postprocess_params
20
+
21
+ def __call__(self, image: Union["Image.Image", str], question: str = None, **kwargs):
22
+
23
+ if isinstance(image, (Image.Image, str)) and isinstance(question, str):
24
+ inputs = {"image": image, "question": question}
25
+ else:
26
+ """
27
+ Supports the following format
28
+ - {"image": image, "question": question}
29
+ - [{"image": image, "question": question}]
30
+ - Generator and datasets
31
+ """
32
+ inputs = image
33
+ results = super().__call__(inputs, **kwargs)
34
+ return results
35
+
36
+ def preprocess(self, inputs):
37
+ model_inputs = self.vle_processor(text=inputs['question'], images=inputs['image'], return_tensors="pt",padding=True)
38
+ return model_inputs
39
+
40
+ def _forward(self, model_inputs):
41
+ model_outputs = self.model(**model_inputs)
42
+ return model_outputs
43
+
44
+ def postprocess(self, model_outputs, top_k=1):
45
+ if top_k > self.model.num_vqa_labels:
46
+ top_k = self.model.num_vqa_labels
47
+ probs = torch.softmax(model_outputs['logits'], dim=-1)
48
+ probs, preds = torch.sort(probs, descending=True)
49
+ probs = probs[:,:top_k].tolist()[0]
50
+ preds = preds[:,:top_k].tolist()[0]
51
+
52
+ return [{"score": score, "answer": self.model.config.id2label[pred]} for score, pred in zip(probs, preds)]
53
+
54
+
55
+
56
+ class VLEForPBCPipeline(Pipeline):
57
+ def __init__(self, vle_processor, *args, **kwargs):
58
+ self.vle_processor = vle_processor
59
+ self.id2label = {0:"False",1:"True"}
60
+ super().__init__(*args, **kwargs)
61
+
62
+ def _sanitize_parameters(self, **kwargs):
63
+ preprocess_params, forward_params, postprocess_params = {}, {}, {}
64
+ return preprocess_params, forward_params, postprocess_params
65
+
66
+ def __call__(self, image: Union["Image.Image", str], text: str = None, **kwargs):
67
+ if isinstance(image, (Image.Image, str)) and isinstance(text, str):
68
+ inputs = {"image": image, "text": text}
69
+ else:
70
+ """
71
+ Supports the following format
72
+ - {"image": image, "text": text}
73
+ - [{"image": image, "text": text}]
74
+ - Generator and datasets
75
+ """
76
+ inputs = image
77
+ results = super().__call__(inputs, **kwargs)
78
+ return results
79
+
80
+ def preprocess(self, inputs):
81
+ model_inputs = self.vle_processor(text=inputs['text'], images=inputs['image'], return_tensors="pt",padding=True)
82
+ return model_inputs, inputs['image']
83
+
84
+ def _forward(self, model_inputs):
85
+ model_outputs = self.model(**model_inputs[0])
86
+ return model_outputs, model_inputs[1]
87
+
88
+ def postprocess(self, model_outputs):
89
+ probs = torch.softmax(model_outputs[0]['logits'], dim=-1)
90
+ probs = probs.tolist()[0]
91
+ new_image = self.paint_in_image(model_outputs[0]['logits'], model_outputs[1])
92
+ return {"score": probs, "image": new_image}
93
+
94
+ def paint_in_image(self, logits, raw_image):
95
+ image_back = deepcopy(raw_image)
96
+ raw_image_size = image_back.size
97
+ resized_image_size = self.model.config.vision_config.image_size
98
+ patch_size = self.model.config.vision_config.patch_size
99
+ probs = torch.softmax(logits.detach()[0,:,1].to('cpu'),dim=-1).numpy().reshape(-1, resized_image_size//patch_size)
100
+
101
+ plt.close('all')
102
+ plt.axis('off')
103
+ plt.imshow(probs, cmap='gray', interpolation='None', vmin=(probs.max()-probs.min())*2/5+probs.min(),alpha=0.7)
104
+ plt.xticks([])
105
+ plt.yticks([])
106
+ buf = io.BytesIO()
107
+ plt.savefig(buf, dpi=100, transparent=True, bbox_inches='tight', pad_inches=0)
108
+ image_front = Image.open(buf)
109
+
110
+ def filter_image_front(img: Image.Image):
111
+ width, height = img.width, img.height
112
+ for x in range(width):
113
+ for y in range(height):
114
+ r,g,b,a = img.getpixel((x,y))
115
+ a = int (a * (1-r/255))
116
+ img.putpixel((x,y), (r,g,b,a))
117
+ return img
118
+
119
+ image_front = filter_image_front(image_front).resize(raw_image_size)
120
+ image_back.paste(image_front, (0,0), image_front)
121
+ mixed_image = image_back.resize(raw_image_size)
122
+ buf.close()
123
+
124
+ return mixed_image
125
+
126
+
127
+
128
+ class VLEForITMPipeline(Pipeline):
129
+ def __init__(self, vle_processor, *args, **kwargs):
130
+ self.vle_processor = vle_processor
131
+ self.id2label = {0:"False",1:"True"}
132
+ super().__init__(*args, **kwargs)
133
+
134
+ def _sanitize_parameters(self, **kwargs):
135
+ preprocess_params, forward_params, postprocess_params = {}, {}, {}
136
+ return preprocess_params, forward_params, postprocess_params
137
+
138
+ def __call__(self, image: Union["Image.Image", str], text: str = None, **kwargs):
139
+ if isinstance(image, (Image.Image, str)) and isinstance(text, str):
140
+ inputs = {"image": image, "text": text}
141
+ else:
142
+ """
143
+ Supports the following format
144
+ - {"image": image, "text": text}
145
+ - [{"image": image, "text": text}]
146
+ - Generator and datasets
147
+ """
148
+ inputs = image
149
+ results = super().__call__(inputs, **kwargs)
150
+ return results
151
+
152
+ def preprocess(self, inputs):
153
+ model_inputs = self.vle_processor(text=inputs['text'], images=inputs['image'], return_tensors="pt",padding=True)
154
+ return model_inputs
155
+
156
+ def _forward(self, model_inputs):
157
+ model_outputs = self.model(**model_inputs)
158
+ return model_outputs
159
+
160
+ def postprocess(self, model_outputs):
161
+ probs = torch.softmax(model_outputs['logits'], dim=-1)
162
+ preds = torch.argmax(probs, dim=-1)
163
+ probs = probs.tolist()[0]
164
+ preds = self.id2label[preds.tolist()[0]]
165
+
166
+ return {"score": probs, "match": preds}
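`VLEForVQAPipeline.__call__` above accepts either a direct `(image, question)` pair or a dict (or list of dicts) with `image`/`question` keys. A minimal usage sketch, assuming the same checkpoint and example images that app.py uses:

```python
# Sketch: the two call conventions accepted by VLEForVQAPipeline.
import torch
from PIL import Image
from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline

model = VLEForVQA.from_pretrained("hfl/vle-base-for-vqa")
processor = VLEProcessor.from_pretrained("hfl/vle-base-for-vqa")
vqa = VLEForVQAPipeline(model=model, vle_processor=processor, device=torch.device("cpu"))

image = Image.open("pics/men.jpg")

print(vqa(image, "How many people are there?", top_k=2))                 # positional call
print(vqa({"image": image, "question": "How many people are there?"}))   # dict input
```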
models/VLE/processing_vle.py ADDED
@@ -0,0 +1,149 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for VLE
17
+ """
18
+
19
+ import warnings
20
+
21
+ from transformers.processing_utils import ProcessorMixin
22
+ from transformers.tokenization_utils_base import BatchEncoding
23
+
24
+
25
+ class VLEProcessor(ProcessorMixin):
26
+ r"""
27
+ Constructs a VLE processor which wraps an image processor and a tokenizer into a single
28
+ processor.
29
+
30
+ [`VLEProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`AutoTokenizer`].
31
+ See the [`~VLEProcessor.__call__`] and [`~VLEProcessor.decode`] for more
32
+ information.
33
+
34
+ Args:
35
+ image_processor ([`AutoImageProcessor`]):
36
+ The image processor is a required input.
37
+ tokenizer ([`PreTrainedTokenizer`]):
38
+ The tokenizer is a required input.
39
+ """
40
+ attributes = ["image_processor", "tokenizer"]
41
+ image_processor_class = "CLIPImageProcessor"
42
+ tokenizer_class = "DebertaV2Tokenizer"
43
+
44
+ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
45
+ if "feature_extractor" in kwargs:
46
+ warnings.warn(
47
+ "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
48
+ " instead.",
49
+ FutureWarning,
50
+ )
51
+ feature_extractor = kwargs.pop("feature_extractor")
52
+
53
+ image_processor = image_processor if image_processor is not None else feature_extractor
54
+ if image_processor is None:
55
+ raise ValueError("You need to specify an `image_processor`.")
56
+ if tokenizer is None:
57
+ raise ValueError("You need to specify a `tokenizer`.")
58
+
59
+ super().__init__(image_processor, tokenizer)
60
+ self.current_processor = self.image_processor
61
+
62
+ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): #TODO more specific args?
63
+ """
64
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
65
+ and `kwargs` arguments to VLETokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
66
+ `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
67
+ AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
68
+ of the above two methods for more information.
69
+
70
+ Args:
71
+ text (`str`, `List[str]`, `List[List[str]]`):
72
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
73
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
74
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
75
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
76
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
77
+ tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
78
+ number of channels, H and W are image height and width.
79
+
80
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
81
+ If set, will return tensors of a particular framework. Acceptable values are:
82
+
83
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
84
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
85
+ - `'np'`: Return NumPy `np.ndarray` objects.
86
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
87
+
88
+ Returns:
89
+ [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
90
+
91
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
92
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
93
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
94
+ `None`).
95
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
96
+ """
97
+
98
+ if text is None and images is None:
99
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
100
+
101
+ if text is not None:
102
+ encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
103
+
104
+ if images is not None:
105
+ image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
106
+
107
+ if text is not None and images is not None:
108
+ encoding["pixel_values"] = image_features.pixel_values
109
+ return encoding
110
+ elif text is not None:
111
+ return encoding
112
+ else:
113
+ return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
114
+
115
+ def batch_decode(self, *args, **kwargs):
116
+ """
117
+ This method forwards all its arguments to VLETokenizer's
118
+ [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
119
+ """
120
+ return self.tokenizer.batch_decode(*args, **kwargs)
121
+
122
+ def decode(self, *args, **kwargs):
123
+ """
124
+ This method forwards all its arguments to VLETokenizer's [`~PreTrainedTokenizer.decode`].
125
+ Please refer to the docstring of this method for more information.
126
+ """
127
+ return self.tokenizer.decode(*args, **kwargs)
128
+
129
+ @property
130
+ def model_input_names(self):
131
+ tokenizer_input_names = self.tokenizer.model_input_names
132
+ image_processor_input_names = self.image_processor.model_input_names
133
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
134
+
135
+ @property
136
+ def feature_extractor_class(self):
137
+ warnings.warn(
138
+ "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
139
+ FutureWarning,
140
+ )
141
+ return self.image_processor_class
142
+
143
+ @property
144
+ def feature_extractor(self):
145
+ warnings.warn(
146
+ "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
147
+ FutureWarning,
148
+ )
149
+ return self.image_processor
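`VLEProcessor.__call__` above tokenizes the text and preprocesses the image in one step. A minimal sketch, assuming the same checkpoint used in app.py and an image from this repo's pics/ folder:

```python
# Sketch: prepare joint text+image inputs with VLEProcessor.
from PIL import Image
from models.VLE import VLEProcessor

processor = VLEProcessor.from_pretrained("hfl/vle-base-for-vqa")
image = Image.open("pics/tower.jpg")

inputs = processor(text="Where is the photo taken?", images=image,
                   return_tensors="pt", padding=True)
# Expect input_ids / attention_mask (and token_type_ids) from the tokenizer,
# plus pixel_values from the image processor.
print(sorted(inputs.keys()))
```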
pics/birds.jpg ADDED
pics/chicking.jpg ADDED
pics/dogs.png ADDED
pics/fish.jpg ADDED

Git LFS Details

  • SHA256: e36c2109bfc8219adc780223da5605186a07426f47a159d8cd30c66209ab8e5b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.65 MB
pics/horses.jpg ADDED

Git LFS Details

  • SHA256: 6dad6a3b0e1d3c46dc7e601add373a27614b2efaedebe9bf65719655719c4312
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
pics/men.jpg ADDED
pics/tower.jpg ADDED
pics/traffic.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ git+https://github.com/huggingface/transformers.git@main
+ torch
+ openai
+ sentencepiece