truong-xuan-linh committed on
Commit 1d3d5c8
1 Parent(s): b2813ce

update visualize
app.py CHANGED
@@ -2,11 +2,13 @@ import glob
 import streamlit as st
 
 from streamlit_image_select import image_select
+import streamlit.components.v1 as components
 
-#Trick to not init function multitime
+# Trick to not init function multitime
 if "model" not in st.session_state:
     print("INIT MODEL")
     from src.model import Model
+
     st.session_state.model = Model()
     print("DONE INIT MODEL")
 
@@ -16,17 +18,25 @@ hide_menu_style = """
         footer {visibility: hidden;}
         </style>
         """
-st.markdown(hide_menu_style, unsafe_allow_html= True)
+st.markdown(hide_menu_style, unsafe_allow_html=True)
 
 mapper = {
-    "images/000000000645.jpg": "Đây là đâu",
-    "images/000000000661.jpg": "Tốc độ tối đa trên đoạn đường này là bao nhiêu",
-    "images/000000000674.jpg": "Còn bao xa nữa là tới Huế",
-    "images/000000000706.jpg": "Cầu này dài bao nhiêu",
-    "images/000000000777.jpg": "Chè khúc bạch giá bao nhiêu"
+    "images/000000000645.jpg": "Đây là đâu",
+    "images/000000000661.jpg": "Tốc độ tối đa trên đoạn đường này là bao nhiêu",
+    "images/000000000674.jpg": "Còn bao xa nữa là tới Huế",
+    "images/000000000706.jpg": "Cầu này dài bao nhiêu",
+    "images/000000000777.jpg": "Chè khúc bạch giá bao nhiêu",
 }
 
-image = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png", "webp", ])
+image = st.file_uploader(
+    "Choose an image file",
+    type=[
+        "jpg",
+        "jpeg",
+        "png",
+        "webp",
+    ],
+)
 example = image_select("Examples", glob.glob("images/*.jpg"))
 
 if image:
@@ -40,10 +50,27 @@ else:
     st.session_state.question = mapper[example]
     st.session_state.image = example
 
-if 'image' in st.session_state:
+if "image" in st.session_state:
     st.image(st.session_state.image)
     question = st.text_input("**Question:** ", value=st.session_state.question)
+    visualize = True
     if question:
-        answer = st.session_state.model.inference(st.session_state.image, question)
+        answer, text_attention_html, images_visualize = (
+            st.session_state.model.inference(
+                st.session_state.image, question, visualize
+            )
+        )
         st.write(f"**Answer:** {answer}")
-
+
+        if visualize:
+            st.write("**Explanation**")
+            col1, col2 = st.columns([1, 2])
+            # st.markdown(text_attention_html, unsafe_allow_html=True)
+            with col1:
+                st.write("*Text Attention*")
+                components.html(text_attention_html, height=960, scrolling=True)
+
+            with col2:
+                st.write("*Image Attention*")
+                for image_visualize in images_visualize:
+                    st.image(image_visualize)
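For reference, a minimal sketch of the rendering contract the updated app.py relies on: with the visualize flag set, inference returns an (answer, attention HTML, attention image paths) triple. FakeModel below is a hypothetical stand-in so the sketch runs without the real checkpoints; the real app calls st.session_state.model as in the diff above.

# Hypothetical stand-in for src.model.Model, used only to keep this sketch
# runnable without the VQA checkpoints.
import streamlit as st
import streamlit.components.v1 as components


class FakeModel:
    def inference(self, image, question, visualize=True):
        # Mirrors the new three-value return: answer text, bertviz HTML,
        # and a list of saved attention-overlay image paths.
        return "demo answer", "<div>attention view</div>", []


answer, text_attention_html, images_visualize = FakeModel().inference(
    "images/000000000645.jpg", "Đây là đâu", True
)
st.write(f"**Answer:** {answer}")
components.html(text_attention_html, height=960, scrolling=True)
for path in images_visualize:
    st.image(path)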
pre-requirements.txt CHANGED
@@ -6,3 +6,5 @@ torchvision==0.18.0
 streamlit==1.35.0
 transformers==4.41.2
 streamlit-image-select==0.6.0
+bertviz==1.4.0
+ipython==8.18.1
src/feature_extraction.py CHANGED
@@ -1,4 +1,3 @@
-
 import torch
 import requests
 from PIL import Image, ImageFont, ImageDraw, ImageTransform
@@ -9,7 +8,9 @@ from src.ocr import OCRDetector
 
 class ViT:
     def __init__(self) -> None:
-        self.processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+        self.processor = AutoImageProcessor.from_pretrained(
+            "google/vit-base-patch16-224-in21k"
+        )
         self.model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
         self.model.to(Config.device)
 
@@ -23,7 +24,9 @@ class ViT:
         with torch.no_grad():
             outputs = self.model(**inputs)
         last_hidden_states = outputs.last_hidden_state
-        attention_mask = torch.ones((last_hidden_states.shape[0], last_hidden_states.shape[1]))
+        attention_mask = torch.ones(
+            (last_hidden_states.shape[0], last_hidden_states.shape[1])
+        )
 
         return last_hidden_states.to(Config.device), attention_mask.to(Config.device)
 
@@ -34,16 +37,20 @@ class ViT:
             image_outputs = self.model(**image_inputs)
             image_pooler_output = image_outputs.pooler_output
             image_pooler_output = torch.unsqueeze(image_pooler_output, 0)
-            image_attention_mask = torch.ones((image_pooler_output.shape[0], image_pooler_output.shape[1]))
+            image_attention_mask = torch.ones(
+                (image_pooler_output.shape[0], image_pooler_output.shape[1])
+            )
+
+        return image_pooler_output.to(Config.device), image_attention_mask.to(
+            Config.device
+        )
 
-        return image_pooler_output.to(Config.device), image_attention_mask.to(Config.device)
 
 class OCR:
     def __init__(self) -> None:
         self.ocr_detector = OCRDetector()
 
     def extraction(self, image_dir):
-
         ocr_results = self.ocr_detector.text_detector(image_dir)
         if not ocr_results:
             print("NOT OCR1")
@@ -53,7 +60,6 @@ class OCR:
         ocrs = self.post_process(ocr_results)
 
         if not ocrs:
-
             return "", [], []
 
         ocrs.reverse()
@@ -74,10 +80,9 @@ class OCR:
         ocr_content = " ".join(ocr_content.split())
         ocr_content = "<extra_id_0>" + ocr_content
 
-
         return ocr_content, groups_box, paragraph_boxes
 
-    def post_process(self,ocr_results):
+    def post_process(self, ocr_results):
         ocrs = []
         for result in ocr_results:
             text = result["text"]
@@ -96,10 +101,7 @@ class OCR:
             # if w*h < 300:
             #     continue
 
-            ocrs.append(
-                {"text": text.lower(),
-                "box": box}
-            )
+            ocrs.append({"text": text.lower(), "box": box})
         return ocrs
 
     @staticmethod
@@ -107,87 +109,96 @@ class OCR:
         (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
         w = x2 - x1
         h = y4 - y1
-        scl = h//7
-        new_box = [max(x1-scl,0), max(y1 - scl, 0)], [x2+scl, y2-scl], [x3+scl, y3+scl], [x4-scl, y4+scl]
+        scl = h // 7
+        new_box = (
+            [max(x1 - scl, 0), max(y1 - scl, 0)],
+            [x2 + scl, y2 - scl],
+            [x3 + scl, y3 + scl],
+            [x4 - scl, y4 + scl],
+        )
         (x1, y1), (x2, y2), (x3, y3), (x4, y4) = new_box
         # Define 8-tuple with x,y coordinates of top-left, bottom-left, bottom-right and top-right corners and apply
         transform = [x1, y1, x4, y4, x3, y3, x2, y2]
-        result = image.transform((w,h), ImageTransform.QuadTransform(transform))
+        result = image.transform((w, h), ImageTransform.QuadTransform(transform))
         return result
 
-
     @staticmethod
     def check_point_in_rectangle(box, point, padding_devide):
-        (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
-        x_min = min(x1, x4)
-        x_max = max(x2, x3)
+        (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
+        x_min = min(x1, x4)
+        x_max = max(x2, x3)
 
-        padding = (x_max-x_min)//padding_devide
-        x_min = x_min - padding
-        x_max = x_max + padding
+        padding = (x_max - x_min) // padding_devide
+        x_min = x_min - padding
+        x_max = x_max + padding
 
-        y_min = min(y1, y2)
-        y_max = max(y3, y4)
+        y_min = min(y1, y2)
+        y_max = max(y3, y4)
 
-        y_min = y_min - padding
-        y_max = y_max + padding
+        y_min = y_min - padding
+        y_max = y_max + padding
 
-        x, y = point
+        x, y = point
 
-        if x >= x_min and x <= x_max and y >= y_min and y <= y_max:
-            return True
+        if x >= x_min and x <= x_max and y >= y_min and y <= y_max:
+            return True
 
-        return False
+        return False
 
     @staticmethod
    def check_rectangle_overlap(rec1, rec2, padding_devide):
-        for point in rec1:
-            if OCR.check_point_in_rectangle(rec2, point, padding_devide):
-                return True
+        for point in rec1:
+            if OCR.check_point_in_rectangle(rec2, point, padding_devide):
+                return True
 
-        for point in rec2:
-            if OCR.check_point_in_rectangle(rec1, point, padding_devide):
-                return True
+        for point in rec2:
+            if OCR.check_point_in_rectangle(rec1, point, padding_devide):
+                return True
 
-        return False
+        return False
 
     @staticmethod
     def group_boxes(boxes, texts):
-        groups = []
-        groups_text = []
-        paragraph_boxes = []
-        processed = []
-        boxes_cp = boxes.copy()
-        for i, (box, text) in enumerate(zip(boxes_cp, texts)):
-            (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
-
-            if i not in processed:
-                processed.append(i)
-            else:
-                continue
-
-            groups.append([box])
-            groups_text.append([text])
-            for j, (box2, text2) in enumerate(zip(boxes_cp[i+1:], texts[i+1:])):
-                if j+i+1 in processed:
-                    continue
-                padding_devide = len(groups[-1])*4
-                is_overlap = OCR.check_rectangle_overlap(box, box2, padding_devide)
-                if is_overlap:
-                    (xx1, yy1), (xx2, yy2), (xx3, yy3), (xx4, yy4) = box2
-                    processed.append(j+i+1)
-                    groups[-1].append(box2)
-                    groups_text[-1].append(text2)
-                    new_x1 = min(x1, xx1)
-                    new_y1 = min(y1, yy1)
-                    new_x2 = max(x2, xx2)
-                    new_y2 = min(y2, yy2)
-                    new_x3 = max(x3, xx3)
-                    new_y3 = max(y3, yy3)
-                    new_x4 = min(x4, xx4)
-                    new_y4 = max(y4, yy4)
-
-                    box = [(new_x1, new_y1), (new_x2, new_y2), (new_x3, new_y3), (new_x4, new_y4)]
-
-            paragraph_boxes.append(box)
-        return groups, groups_text, paragraph_boxes
+        groups = []
+        groups_text = []
+        paragraph_boxes = []
+        processed = []
+        boxes_cp = boxes.copy()
+        for i, (box, text) in enumerate(zip(boxes_cp, texts)):
+            (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
+
+            if i not in processed:
+                processed.append(i)
+            else:
+                continue
+
+            groups.append([box])
+            groups_text.append([text])
+            for j, (box2, text2) in enumerate(zip(boxes_cp[i + 1 :], texts[i + 1 :])):
+                if j + i + 1 in processed:
+                    continue
+                padding_devide = len(groups[-1]) * 4
+                is_overlap = OCR.check_rectangle_overlap(box, box2, padding_devide)
+                if is_overlap:
+                    (xx1, yy1), (xx2, yy2), (xx3, yy3), (xx4, yy4) = box2
+                    processed.append(j + i + 1)
+                    groups[-1].append(box2)
+                    groups_text[-1].append(text2)
+                    new_x1 = min(x1, xx1)
+                    new_y1 = min(y1, yy1)
+                    new_x2 = max(x2, xx2)
+                    new_y2 = min(y2, yy2)
+                    new_x3 = max(x3, xx3)
+                    new_y3 = max(y3, yy3)
+                    new_x4 = min(x4, xx4)
+                    new_y4 = max(y4, yy4)
+
+                    box = [
+                        (new_x1, new_y1),
+                        (new_x2, new_y2),
+                        (new_x3, new_y3),
+                        (new_x4, new_y4),
+                    ]
+
+            paragraph_boxes.append(box)
+        return groups, groups_text, paragraph_boxes
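As context for the hunk above, a standalone toy of the box-grouping idea behind OCR.group_boxes: two quadrilaterals are merged into one paragraph when any corner of one falls inside the padded bounding rectangle of the other. This is a simplified re-implementation for illustration only; the real method also merges the recognized texts and grows the padding with group size.

# Toy overlap test mirroring check_point_in_rectangle / check_rectangle_overlap;
# box corners are ordered top-left, top-right, bottom-right, bottom-left.
def point_in_rect(box, point, padding_divide=4):
    (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
    x_min, x_max = min(x1, x4), max(x2, x3)
    y_min, y_max = min(y1, y2), max(y3, y4)
    pad = (x_max - x_min) // padding_divide
    return x_min - pad <= point[0] <= x_max + pad and y_min - pad <= point[1] <= y_max + pad


def boxes_overlap(a, b):
    return any(point_in_rect(b, p) for p in a) or any(point_in_rect(a, p) for p in b)


box_a = [(0, 0), (10, 0), (10, 5), (0, 5)]
box_b = [(9, 1), (20, 1), (20, 6), (9, 6)]
box_c = [(100, 100), (110, 100), (110, 105), (100, 105)]
print(boxes_overlap(box_a, box_b))  # True -> would be grouped into one paragraph box
print(boxes_overlap(box_a, box_c))  # False -> stays a separate paragraph box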
src/image_visualization.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+
3
+
4
+ # Show attention
5
+ def plot_attention(img, result, attention_plot, image_dir):
6
+ # img = img.numpy().transpose((1, 2, 0))
7
+ temp_image = img
8
+
9
+ fig = plt.figure(figsize=(15, 15))
10
+
11
+ len_result = len(result)
12
+ for l in range(len_result):
13
+ temp_att = attention_plot[l][1:].reshape(14, 14)
14
+ # temp_att = np.resize(attention_plot[l].detach().numpy(),(98,98))
15
+ ax = fig.add_subplot(len_result // 2, len_result // 2, l + 1)
16
+ ax.set_title(result[l], fontsize=18)
17
+ img = ax.imshow(temp_image)
18
+ ax.imshow(temp_att, alpha=0.6, cmap="jet", extent=img.get_extent())
19
+
20
+ plt.tight_layout()
21
+ plt.savefig(image_dir)
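A hedged usage sketch for the new helper: the 14 x 14 reshape assumes the ViT-base/16 patch grid (224 / 16 = 14, i.e. 196 patch tokens plus a CLS token that the [1:] slice drops), which matches the -197: slice used in src/model.py. Random weights and a solid-color image stand in for real data here.

# Sketch only: random attention in place of the model's cross-attention weights.
import numpy as np
from PIL import Image

from src.image_visualization import plot_attention

tokens = ["tok_1", "tok_2", "tok_3", "tok_4"]       # panel titles (decoder tokens)
attention = np.random.rand(len(tokens), 197)        # 1 CLS + 196 patch weights per token
image = Image.new("RGB", (224, 224), "gray")        # placeholder 224x224 input image
plot_attention(image, tokens, attention, "visualization/demo_attention.jpg")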
src/model.py CHANGED
@@ -8,12 +8,16 @@ from typing import *
 from transformers import T5ForConditionalGeneration, AutoTokenizer
 from utils.config import Config
 from src.feature_extraction import ViT, OCR
+from bertviz import model_view, head_view
+from src.image_visualization import plot_attention
+import numpy as np
+from PIL import Image
 
 _CONFIG_FOR_DOC = "T5Config"
 _CHECKPOINT_FOR_DOC = "google-t5/t5-small"
 
-class CustomT5Stack(T5Stack):
 
+class CustomT5Stack(T5Stack):
     def forward(
         self,
         input_ids=None,
@@ -35,11 +39,19 @@ class CustomT5Stack(T5Stack):
             torch.cuda.set_device(self.first_device)
             self.embed_tokens = self.embed_tokens.to(self.first_device)
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if input_ids is not None and inputs_embeds is not None:
             err_msg_prefix = "decoder_" if self.is_decoder else ""
@@ -53,11 +65,15 @@ class CustomT5Stack(T5Stack):
             input_shape = inputs_embeds.size()[:-1]
         else:
             err_msg_prefix = "decoder_" if self.is_decoder else ""
-            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
+            raise ValueError(
+                f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds"
+            )
 
         if inputs_embeds is None:
             if self.embed_tokens is None:
-                raise ValueError("You have to initialize the model with valid token embeddings")
+                raise ValueError(
+                    "You have to initialize the model with valid token embeddings"
+                )
             inputs_embeds = self.embed_tokens(input_ids)
         if not self.is_decoder and images_embeds is not None:
             inputs_embeds = torch.concat([inputs_embeds, images_embeds], dim=1)
@@ -66,33 +82,47 @@ class CustomT5Stack(T5Stack):
         batch_size, seq_length = input_shape
 
         # required mask seq length can be calculated via length of past
-        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length
+        mask_seq_length = (
+            past_key_values[0][0].shape[2] + seq_length
+            if past_key_values is not None
+            else seq_length
+        )
 
         if use_cache is True:
             if not self.is_decoder:
-                raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")
+                raise ValueError(
+                    f"`use_cache` can only be set to `True` if {self} is used as a decoder"
+                )
 
         # initialize past_key_values with `None` if past does not exist
         if past_key_values is None:
             past_key_values = [None] * len(self.block)
 
         if attention_mask is None:
-            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+            attention_mask = torch.ones(
+                batch_size, mask_seq_length, device=inputs_embeds.device
+            )
 
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, input_shape
+        )
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.is_decoder and encoder_hidden_states is not None:
-            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_batch_size, encoder_sequence_length, _ = (
+                encoder_hidden_states.size()
+            )
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(
                     encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
                 )
-            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask
+            )
         else:
             encoder_extended_attention_mask = None
 
@@ -105,7 +135,9 @@ class CustomT5Stack(T5Stack):
 
         # Prepare head mask if needed
         head_mask = self.get_head_mask(head_mask, self.config.num_layers)
-        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
+        cross_attn_head_mask = self.get_head_mask(
+            cross_attn_head_mask, self.config.num_layers
+        )
         present_key_value_states = () if use_cache else None
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
@@ -115,7 +147,9 @@ class CustomT5Stack(T5Stack):
 
         hidden_states = self.dropout(inputs_embeds)
 
-        for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
+        for i, (layer_module, past_key_value) in enumerate(
+            zip(self.block, past_key_values)
+        ):
             layer_head_mask = head_mask[i]
             cross_attn_layer_head_mask = cross_attn_head_mask[i]
             # Model parallel
@@ -127,15 +161,23 @@ class CustomT5Stack(T5Stack):
                 if position_bias is not None:
                     position_bias = position_bias.to(hidden_states.device)
                 if encoder_hidden_states is not None:
-                    encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
+                    encoder_hidden_states = encoder_hidden_states.to(
+                        hidden_states.device
+                    )
                 if encoder_extended_attention_mask is not None:
-                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
+                    encoder_extended_attention_mask = (
+                        encoder_extended_attention_mask.to(hidden_states.device)
+                    )
                 if encoder_decoder_position_bias is not None:
-                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
+                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(
+                        hidden_states.device
+                    )
                 if layer_head_mask is not None:
                     layer_head_mask = layer_head_mask.to(hidden_states.device)
                 if cross_attn_layer_head_mask is not None:
-                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
+                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(
+                        hidden_states.device
+                    )
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
@@ -181,10 +223,14 @@ class CustomT5Stack(T5Stack):
             # (cross-attention position bias), (cross-attention weights)
             position_bias = layer_outputs[2]
             if self.is_decoder and encoder_hidden_states is not None:
-                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
+                encoder_decoder_position_bias = layer_outputs[
+                    4 if output_attentions else 3
+                ]
             # append next layer key value states
             if use_cache:
-                present_key_value_states = present_key_value_states + (present_key_value_state,)
+                present_key_value_states = present_key_value_states + (
+                    present_key_value_state,
+                )
 
             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[3],)
@@ -227,7 +273,9 @@ class CustomT5Stack(T5Stack):
 
 class CustomT5ForConditionalGeneration(T5ForConditionalGeneration):
     @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    @replace_return_docstrings(
+        output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
+    )
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -280,7 +328,9 @@ class CustomT5ForConditionalGeneration(T5ForConditionalGeneration):
         >>> # studies have shown that owning a dog is good for you.
         ```"""
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
 
         # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
         if head_mask is not None and decoder_head_mask is None:
@@ -299,7 +349,7 @@ class CustomT5ForConditionalGeneration(T5ForConditionalGeneration):
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
-                images_embeds=images_embeds
+                images_embeds=images_embeds,
             )
         elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
             encoder_outputs = BaseModelOutput(
@@ -313,7 +363,11 @@ class CustomT5ForConditionalGeneration(T5ForConditionalGeneration):
         if self.model_parallel:
             torch.cuda.set_device(self.decoder.first_device)
 
-        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+        if (
+            labels is not None
+            and decoder_input_ids is None
+            and decoder_inputs_embeds is None
+        ):
             # get decoder inputs from shifting lm labels to the right
             decoder_input_ids = self._shift_right(labels)
 
@@ -326,7 +380,9 @@ class CustomT5ForConditionalGeneration(T5ForConditionalGeneration):
             if attention_mask is not None:
                 attention_mask = attention_mask.to(self.decoder.first_device)
             if decoder_attention_mask is not None:
-                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
+                decoder_attention_mask = decoder_attention_mask.to(
+                    self.decoder.first_device
+                )
 
         # Decode
         decoder_outputs = self.decoder(
@@ -382,64 +438,124 @@ class CustomT5ForConditionalGeneration(T5ForConditionalGeneration):
             encoder_hidden_states=encoder_outputs.hidden_states,
             encoder_attentions=encoder_outputs.attentions,
         )
 
+
 transformers.models.t5.modeling_t5.T5Stack = CustomT5Stack
-transformers.models.t5.modeling_t5.T5ForConditionalGeneration = CustomT5ForConditionalGeneration
+transformers.models.t5.modeling_t5.T5ForConditionalGeneration = (
+    CustomT5ForConditionalGeneration
+)
 transformers.T5ForConditionalGeneration = CustomT5ForConditionalGeneration
+from transformers import T5ForConditionalGeneration
 
 
 class Model:
     def __init__(self) -> None:
         os.makedirs("storage", exist_ok=True)
 
         if not os.path.exists("storage/vlsp_transfomer_vietocr.pth"):
             print("DOWNLOADING model")
-            gdown.download(Config.model_url, output="storage/vlsp_transfomer_vietocr.pth")
+            gdown.download(
+                Config.model_url, output="storage/vlsp_transfomer_vietocr.pth"
+            )
         self.vit5_tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
-        self.model = T5ForConditionalGeneration.from_pretrained("truong-xuan-linh/VQA-vit5",
-                                                                revision=Config.revision,
-                                                                output_attentions=True)
+        self.model = T5ForConditionalGeneration.from_pretrained(
+            "truong-xuan-linh/VQA-vit5",
+            revision=Config.revision,
+            output_attentions=True,
+        )
         self.model.to(Config.device)
 
         self.vit = ViT()
         self.ocr = OCR()
 
     def get_inputs(self, image_dir: str, question: str):
-        #VIT
+        # VIT
         image_feature, image_mask = self.vit.extraction(image_dir)
 
         ocr_content, groups_box, paragraph_boxes = self.ocr.extraction(image_dir)
         print("Input: ", question + " " + ocr_content)
-        #VIT5
-        input_ = self.vit5_tokenizer(question + " " + ocr_content,
-                                     padding="max_length",
-                                     truncation=True,
-                                     max_length=Config.question_maxlen + Config.ocr_maxlen,
-                                     return_tensors="pt")
+        # VIT5
+        input_ = self.vit5_tokenizer(
+            question + " " + ocr_content,
+            padding="max_length",
+            truncation=True,
+            max_length=Config.question_maxlen + Config.ocr_maxlen,
+            return_tensors="pt",
+        )
 
         input_ids = input_.input_ids
         attention_mask = input_.attention_mask
         mask = torch.cat((attention_mask, image_mask), 1)
         return {
-            "input_ids": input_ids,
-            "attention_mask": mask,
-            "images_embeds": image_feature,
-        }
+            "input_ids": input_ids,
+            "attention_mask": mask,
+            "images_embeds": image_feature,
+        }
 
-    def inference(self, image_dir: str, question: str):
+    def inference(self, image_dir: str, question: str, explain: bool = False):
         inputs = self.get_inputs(image_dir, question)
         with torch.no_grad():
             input_ids = inputs["input_ids"]
             attention_mask = inputs["attention_mask"]
             images_embeds = inputs["images_embeds"]
             generated_ids = self.model.generate(
-                input_ids=input_ids, \
-                attention_mask=attention_mask, \
-                images_embeds=images_embeds, \
-                num_beams=2,
-                max_length=Config.answer_maxlen
-            )
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                images_embeds=images_embeds,
+                num_beams=2,
+                max_length=Config.answer_maxlen,
+            )
+
+        pred_answer = self.vit5_tokenizer.decode(
+            generated_ids[0], skip_special_tokens=True
+        )
+        if not explain:
+            return pred_answer, None, None
 
-            pred_answer = self.vit5_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        with self.vit5_tokenizer.as_target_tokenizer():
+            decoder_input_ids = self.vit5_tokenizer(
+                pred_answer, return_tensors="pt", add_special_tokens=True
+            ).input_ids
 
-            return pred_answer
+        with torch.no_grad():
+            outputs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                images_embeds=images_embeds,
+                decoder_input_ids=decoder_input_ids,
+            )
+
+        encoder_text = self.vit5_tokenizer.convert_ids_to_tokens(input_ids[0])
+        decoder_text = self.vit5_tokenizer.convert_ids_to_tokens(decoder_input_ids[0])
+        while "<pad>" in encoder_text:
+            encoder_text.remove("<pad>")
+
+        text_encoder_attentions = [
+            att[:, :, : len(encoder_text), : len(encoder_text)]
+            for att in outputs.encoder_attentions
+        ]
+        text_cross_attentions = [
+            att[:, :, :, : len(encoder_text)] for att in outputs.cross_attentions
+        ]
+
+        html_output = head_view(
+            encoder_attention=text_encoder_attentions,
+            decoder_attention=outputs.decoder_attentions,
+            cross_attention=text_cross_attentions,
+            encoder_tokens=encoder_text[: len(encoder_text)],
+            decoder_tokens=decoder_text,
+            # display_mode="light",
+            html_action="return",
+        )
+
+        img = Image.open(image_dir).convert("RGB")
+        image_dirs = []
+
+        for i in range(len(outputs.cross_attentions[:1])):
+            image_dir = f"visualization/test_image_visualize_{i}.jpg"
+            image_dirs.append(image_dir)
+            attention_plot = np.mean(
+                outputs.cross_attentions[i][0, :, :, -197:].detach().numpy(), axis=0
+            )
+            plot_attention(img, decoder_text, attention_plot, image_dir)
+        return pred_answer, html_output.data, image_dirs
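A hedged end-to-end sketch of the new inference signature, assuming the repository's checkpoints, Config values, and the shipped images/ examples are available locally; with explain=False the two visualization outputs come back as None.

# Sketch of calling the updated API; checkpoint download happens inside Model.__init__.
from src.model import Model

model = Model()
answer, attention_html, attention_images = model.inference(
    "images/000000000706.jpg", "Cầu này dài bao nhiêu", explain=True
)
print("Answer:", answer)
print("Attention images:", attention_images)   # e.g. visualization/test_image_visualize_0.jpg
with open("storage/text_attention.html", "w", encoding="utf-8") as f:
    f.write(attention_html)                    # bertviz head_view HTML for the text tokens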
src/ocr.py CHANGED
@@ -6,74 +6,80 @@ import requests
 import numpy as np
 from PIL import Image, ImageTransform
 
+
 class OCRDetector:
-    def __init__(self) -> None:
-        self.paddle_ocr = PaddleOCR(lang='en',
-                                    use_angle_cls=False,
-                                    use_gpu=True if Config.device == "cpu" else False,
-                                    show_log=False )
-        # config['weights'] = './weights/transformerocr.pth'
+    def __init__(self) -> None:
+        self.paddle_ocr = PaddleOCR(
+            lang="en",
+            use_angle_cls=False,
+            use_gpu=True if Config.device == "cpu" else False,
+            show_log=False,
+        )
+        # config['weights'] = './weights/transformerocr.pth'
 
-        vietocr_config = Cfg.load_config_from_name('vgg_transformer')
-        vietocr_config['weights'] = Config.ocr_path
-        vietocr_config['cnn']['pretrained']=False
-        vietocr_config['device'] = Config.device
-        vietocr_config['predictor']['beamsearch']=False
-        self.viet_ocr = Predictor(vietocr_config)
+        vietocr_config = Cfg.load_config_from_name("vgg_transformer")
+        vietocr_config["weights"] = Config.ocr_path
+        vietocr_config["cnn"]["pretrained"] = False
+        vietocr_config["device"] = Config.device
+        vietocr_config["predictor"]["beamsearch"] = False
+        self.viet_ocr = Predictor(vietocr_config)
 
-    def find_box(self, image):
-        '''Xác định box dựa vào mô hình paddle_ocr'''
-        result = self.paddle_ocr.ocr(image, cls = False, rec=False)
-        result = result[0]
-        # Extracting detected components
-        boxes = result #[res[0] for res in result]
-        boxes = np.array(boxes).astype(int)
+    def find_box(self, image):
+        """Xác định box dựa vào mô hình paddle_ocr"""
+        result = self.paddle_ocr.ocr(image, cls=False, rec=False)
+        result = result[0]
+        # Extracting detected components
+        boxes = result  # [res[0] for res in result]
+        boxes = np.array(boxes).astype(int)
 
-        # scores = [res[1][1] for res in result]
-        return boxes
+        # scores = [res[1][1] for res in result]
+        return boxes
 
-    def cut_image_polygon(self, image, box):
-        (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
-        w = x2 - x1
-        h = y4 - y1
-        scl = h//7
-        new_box = [max(x1-scl,0), max(y1 - scl, 0)], [x2+scl, y2-scl], [x3+scl, y3+scl], [x4-scl, y4+scl]
-        (x1, y1), (x2, y2), (x3, y3), (x4, y4) = new_box
-        # Define 8-tuple with x,y coordinates of top-left, bottom-left, bottom-right and top-right corners and apply
-        transform = [x1, y1, x4, y4, x3, y3, x2, y2]
-        result = image.transform((w,h), ImageTransform.QuadTransform(transform))
-        return result
+    def cut_image_polygon(self, image, box):
+        (x1, y1), (x2, y2), (x3, y3), (x4, y4) = box
+        w = x2 - x1
+        h = y4 - y1
+        scl = h // 7
+        new_box = (
+            [max(x1 - scl, 0), max(y1 - scl, 0)],
+            [x2 + scl, y2 - scl],
+            [x3 + scl, y3 + scl],
+            [x4 - scl, y4 + scl],
+        )
+        (x1, y1), (x2, y2), (x3, y3), (x4, y4) = new_box
+        # Define 8-tuple with x,y coordinates of top-left, bottom-left, bottom-right and top-right corners and apply
+        transform = [x1, y1, x4, y4, x3, y3, x2, y2]
+        result = image.transform((w, h), ImageTransform.QuadTransform(transform))
+        return result
 
-    def vietnamese_text(self, boxes, image):
-        '''Xác định text dựa vào mô hình viet_ocr'''
-        results = []
-        for box in boxes:
-            try:
-                cut_image = self.cut_image_polygon(image, box)
-                # cut_image = Image.fromarray(np.uint8(cut_image))
-                text, score = self.viet_ocr.predict(cut_image, return_prob=True)
-                if score > Config.vietocr_threshold:
-                    results.append({"text": text,
-                                    "score": score,
-                                    "box": box})
-            except:
-                continue
-        return results
+    def vietnamese_text(self, boxes, image):
+        """Xác định text dựa vào mô hình viet_ocr"""
+        results = []
+        for box in boxes:
+            try:
+                cut_image = self.cut_image_polygon(image, box)
+                # cut_image = Image.fromarray(np.uint8(cut_image))
+                text, score = self.viet_ocr.predict(cut_image, return_prob=True)
+                if score > Config.vietocr_threshold:
+                    results.append({"text": text, "score": score, "box": box})
+            except:
+                continue
+        return results
 
-    #Merge
-    def text_detector(self, image_path):
-        if image_path.startswith("https://"):
-            image = Image.open(requests.get(image_path, stream=True).raw).convert("RGB")
-        else:
-            image = Image.open(image_path).convert("RGB")
-        # np_image = np.array(image)
+    # Merge
+    def text_detector(self, image_path):
+        if image_path.startswith("https://"):
+            image = Image.open(requests.get(image_path, stream=True).raw).convert("RGB")
+        else:
+            image = Image.open(image_path).convert("RGB")
+        # np_image = np.array(image)
 
-        boxes = self.find_box(image_path)
-        if not boxes.any():
-            return None
+        boxes = self.find_box(image_path)
+        if not boxes.any():
+            return None
 
-        results = self.vietnamese_text(boxes, image)
-        if results != []:
-            return results
-        else:
-            return None
+        results = self.vietnamese_text(boxes, image)
+        if results != []:
+            return results
+        else:
+            return None
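For context, a hedged sketch of how the reformatted detector is driven elsewhere in the repo: PaddleOCR proposes text boxes, VietOCR reads each crop, and results below Config.vietocr_threshold are dropped. It assumes the paddleocr/vietocr dependencies and the weights path referenced by utils.config.Config are installed.

# Sketch only; requires the PaddleOCR and VietOCR weights configured in utils.config.Config.
from src.ocr import OCRDetector

detector = OCRDetector()
results = detector.text_detector("images/000000000661.jpg")  # None when no text survives
for item in results or []:
    print(item["text"], item["score"], item["box"].tolist())  # recognized text, confidence, 4-point box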
utils/config.py CHANGED
@@ -10,4 +10,4 @@ class Config:
     ocr_maxobj = 10000
     num_ocr = 32
     num_beams = 3
-    revision = "version_2_with_extra_id_0"
+    revision = "version_2_with_extra_id_0"
visualization/.gitkeep ADDED
File without changes