Update inference.py

inference.py CHANGED (+30 -13)
@@ -1,9 +1,13 @@
 import torch
 from PIL import Image
 from conversation import conv_templates
-from builder import load_pretrained_model
+from builder import load_pretrained_model  # Assuming this is your custom model loader
 from functools import partial
 import numpy as np
+DEFAULT_REGION_FEA_TOKEN = "<region_fea>"
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
 
 # define the task categories
 box_in_tasks = ['widgetcaptions', 'taperception', 'ocr', 'icon_recognition', 'widget_classification', 'example_0']
@@ -36,20 +40,20 @@ def generate_mask_for_feature(coor, raw_w, raw_h, mask=None):
     if mask is not None:
         coor_mask = coor_mask * mask
 
-    #
+    # convert to torch tensor and ensure it contains non-zero values
    coor_mask = torch.from_numpy(coor_mask)
     assert len(coor_mask.nonzero()) != 0, "Generated mask is empty :("
 
     return coor_mask
 
 
-def infer_single_prompt(image_path, prompt, model_path, region=None, model_name="ferret_llama", conv_mode="ferret_llama_3"):
+def infer_single_prompt(image_path, prompt, model_path, region=None, model_name="ferret_llama", conv_mode="ferret_llama_3", add_region_feature=False):
     img = Image.open(image_path).convert('RGB')
 
     # this loads the model, image processor and tokenizer
     tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)
-    # define the image size (e.g., 224x224 or 336x336)
+    # define the image size required by clip
     image_size = {"height": 336, "width": 336}
 
     # process the image
@@ -68,17 +72,27 @@ def infer_single_prompt(image_path, prompt, model_path, region=None, model_name=
     conv.append_message(conv.roles[0], prompt)
     conv.append_message(conv.roles[1], None)
     prompt_input = conv.get_prompt()
+
+    # add the special tokens
+    prompt_input = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt_input
 
-    input_ids = tokenizer(prompt_input, return_tensors='pt')['input_ids'].cuda()
-
     # region mask logic (if region is provided)
     region_masks = None
-    if region is not None:
+    if add_region_feature and region is not None:
         raw_w, raw_h = img.size
         region_masks = generate_mask_for_feature(region, raw_w, raw_h).unsqueeze(0).cuda().half()
-        region_masks = [[
+        region_masks = [[region_mask_i.cuda().half() for region_mask_i in region_masks]]
+        prompt_input = prompt_input.replace("<bbox_location0>", f"[{region[0]}, {region[1]}, {region[2]}, {region[3]}] {DEFAULT_REGION_FEA_TOKEN}")
+
+    # tokenize prompt
+    # input_ids = tokenizer(prompt_input, return_tensors='pt')['input_ids'].cuda()
+    inputs = tokenizer(prompt_input, return_tensors='pt', padding=True)
+    input_ids = inputs['input_ids'].cuda()
+    attention_mask = inputs['attention_mask'].cuda()
+
     # generate model output
     with torch.inference_mode():
         # Use region_masks in model's forward call
@@ -87,9 +101,11 @@ def infer_single_prompt(image_path, prompt, model_path, region=None, model_name=
             model.orig_forward,
             region_masks=region_masks
         )
+        # explicitly add the attention mask
         output_ids = model.generate(
             input_ids,
             images=image_tensor,
+            attention_mask=attention_mask,
             max_new_tokens=1024,
             num_beams=1,
             region_masks=region_masks,  # pass the region mask to the model
@@ -102,7 +118,8 @@ def infer_single_prompt(image_path, prompt, model_path, region=None, model_name=
     return output_text.strip()
 
 # We also define a task-specific inference function
-def infer_ui_task(image_path, prompt, model_path, task, region=None):
+def infer_ui_task(image_path, prompt, model_path, task, region=None, add_region_feature=False):
+    # region = torch.tensor(region).cuda()
     """
     Handles task types: box_in_tasks, box_out_tasks, no_box_tasks.
     """
@@ -111,7 +128,7 @@ def infer_ui_task(image_path, prompt, model_path, task, region=None):
 
     if task in box_in_tasks:
         print(f"Processing {task} with bounding box region.")
-        return infer_single_prompt(image_path, prompt, model_path, region)
+        return infer_single_prompt(image_path, prompt, model_path, region, add_region_feature=add_region_feature)
 
     elif task in box_out_tasks:
         print(f"Processing {task} without bounding box region.")
@@ -122,4 +139,4 @@ def infer_ui_task(image_path, prompt, model_path, task, region=None):
         return infer_single_prompt(image_path, prompt, model_path)
 
     else:
-        raise ValueError(f"Unknown task type: {task}")
+        raise ValueError(f"Unknown task type: {task}")