Spaces · Running on T4
Prasanna Sridhar committed
Commit 346623e · 1 parent: c469934

remove unused imports
app.py CHANGED
@@ -1,16 +1,10 @@
 import spaces
 import gradio as gr
-import copy
 import random
 import torch
-import
-from PIL import Image, ImageDraw, ImageFont
-import torchvision.transforms.functional as F
+from PIL import Image
 import numpy as np
 import argparse
-import json
-import plotly.express as px
-import pandas as pd
 from util.slconfig import SLConfig, DictAction
 from util.misc import nested_tensor_from_tensor_list
 import datasets.transforms as T
@@ -258,14 +252,14 @@ def count(image, text, prompts, state, device):
 def count(image, text, prompts, state, device):
 
     keywords = "" # do not handle this for now
-
+
     # Handle no prompt case.
     if prompts is None:
         prompts = {"image": image, "points": []}
     input_image, _ = transform(image, {"exemplars": torch.tensor([])})
     input_image = input_image.unsqueeze(0).to(device)
     exemplars = get_box_inputs(prompts["points"])
-
+
     input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
     input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
     exemplars = [exemplars["exemplars"].to(device)]
@@ -278,7 +272,7 @@ def count(image, text, prompts, state, device):
             [torch.tensor([0]).to(device) for _ in range(len(input_image))],
             captions=[text + " ."] * len(input_image),
         )
-
+
     ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
     logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
     boxes = model_output["pred_boxes"][0]
@@ -288,7 +282,7 @@ def count(image, text, prompts, state, device):
     box_mask = logits.max(dim=-1).values > CONF_THRESH
     logits = logits[box_mask, :].cpu().numpy()
     boxes = boxes[box_mask, :].cpu().numpy()
-
+
     # Plot results.
     (w, h) = image.size
     det_map = np.zeros((h, w))
@@ -327,7 +321,7 @@ def count(image, text, prompts, state, device):
     if len(text.strip()) > 0:
         out_label += " text"
         if exemplars[0].size()[0] == 1:
-            out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
+            out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
         elif exemplars[0].size()[0] > 1:
             out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
         else:
@@ -339,7 +333,7 @@ def count(image, text, prompts, state, device):
         out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
     else:
         out_label = "Nothing specified to detect."
-
+
     return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]), new_submit_btn, gr.Tab(visible=True), step_3, state)
 
 @spaces.GPU
@@ -351,11 +345,11 @@ def count_main(image, text, prompts, device):
     input_image, _ = transform(image, {"exemplars": torch.tensor([])})
     input_image = input_image.unsqueeze(0).to(device)
     exemplars = get_box_inputs(prompts["points"])
-
+
     input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
     input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
     exemplars = [exemplars["exemplars"].to(device)]
-
+
     with torch.no_grad():
         model_output = model(
             nested_tensor_from_tensor_list(input_image),
@@ -364,7 +358,7 @@ def count_main(image, text, prompts, device):
             [torch.tensor([0]).to(device) for _ in range(len(input_image))],
             captions=[text + " ."] * len(input_image),
         )
-
+
     ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
     logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
     boxes = model_output["pred_boxes"][0]
@@ -374,7 +368,7 @@ def count_main(image, text, prompts, device):
     box_mask = logits.max(dim=-1).values > CONF_THRESH
     logits = logits[box_mask, :].cpu().numpy()
     boxes = boxes[box_mask, :].cpu().numpy()
-
+
     # Plot results.
     (w, h) = image.size
     det_map = np.zeros((h, w))
@@ -395,7 +389,7 @@ def count_main(image, text, prompts, device):
     if len(text.strip()) > 0:
         out_label += " text"
         if exemplars[0].size()[0] == 1:
-            out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
+            out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
         elif exemplars[0].size()[0] > 1:
             out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
         else:
@@ -407,7 +401,7 @@ def count_main(image, text, prompts, device):
         out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
     else:
         out_label = "Nothing specified to detect."
-
+
     return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]))
 
 def remove_label(image):
@@ -452,20 +446,20 @@ with gr.Blocks(title="CountGD: Multi-Modal Open-World Counting", theme="soft", h
             with gr.Tab("Step 1", visible=True) as step_1:
                 input_image = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=False, width="30vw")
                 gr.Markdown('# Click "Count" to count the strawberries.')
-
+
             with gr.Column():
                 with gr.Tab("Output Image"):
                     detected_instances = gr.Image(label="Detected Instances", show_label='True', interactive=False, visible=True, width="40vw")
-
+
                 with gr.Row():
                     input_text = gr.Textbox(label="What would you like to count?", value="strawberry", interactive=True)
                     pred_count = gr.Number(label="Predicted Count", visible=False)
                     submit_btn = gr.Button("Count", variant="primary", interactive=True)
-
+
                 submit_btn.click(fn=remove_label, inputs=[detected_instances], outputs=[detected_instances]).then(fn=count, inputs=[input_image, input_text, exemplar_image, state, device], outputs=[detected_instances, pred_count, submit_btn, step_2, step_3, state])
                 exemplar_image.change(check_submit_btn, inputs=[exemplar_image, state], outputs=[submit_btn])
     with gr.Tab("App", visible=True) as main_app:
-
+
         gr.Markdown(
             """
             # <center>CountGD: Multi-Modal Open-World Counting
@@ -476,7 +470,7 @@ with gr.Blocks(title="CountGD: Multi-Modal Open-World Counting", theme="soft", h
            Limitation: this app does not support fine-grained counting based on attributes or visual grounding inputs yet. Note: if the exemplar and text conflict each other, both will be counted.</center>
            """
        )
-
+
        with gr.Row():
            with gr.Column():
                input_image_main = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=True)
@@ -490,6 +484,6 @@ with gr.Blocks(title="CountGD: Multi-Modal Open-World Counting", theme="soft", h
        gr.Examples(label="Examples: click on a row to load the example. Add visual exemplars by drawing boxes on the loaded \"Visual Exemplar Image.\"", examples=examples, inputs=[input_image_main, input_text_main, exemplar_image_main])
        submit_btn_main.click(fn=remove_label, inputs=[detected_instances_main], outputs=[detected_instances_main]).then(fn=count_main, inputs=[input_image_main, input_text_main, exemplar_image_main, device], outputs=[detected_instances_main, pred_count_main])
        clear_btn_main.add([input_image_main, input_text_main, exemplar_image_main, detected_instances_main, pred_count_main])
-
+
 
 demo.queue().launch(allowed_paths=['back-icon.jpg', 'paste-icon.jpg', 'upload-icon.jpg', 'button-legend.jpg'])
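For reference, the modules dropped above (copy, json, plotly.express, pandas, torchvision.transforms.functional, and PIL's ImageDraw/ImageFont) are, per the commit message, no longer referenced anywhere in app.py; only `from PIL import Image` is kept. The sketch below is a minimal, illustrative way to flag such unused top-level imports with Python's standard-library ast module. The script name and the simple "bound name never reappears as a bare identifier" heuristic are assumptions of this example; a real linter such as pyflakes or Ruff (rule F401) covers cases this skips, for example re-exports, `__all__`, and names referenced only inside strings.

# find_unused_imports.py -- illustrative helper, not part of this repo.
# Heuristic: a top-level import is "unused" if the name it binds never
# appears as a bare identifier anywhere else in the module.
import ast
import sys


def unused_top_level_imports(path):
    with open(path, encoding="utf-8") as fh:
        tree = ast.parse(fh.read())

    imported = {}  # bound name -> line number of the import
    for node in tree.body:
        if isinstance(node, ast.Import):
            for alias in node.names:
                # "import plotly.express as px" binds "px"; "import torch" binds "torch".
                imported[(alias.asname or alias.name).split(".")[0]] = node.lineno
        elif isinstance(node, ast.ImportFrom):
            for alias in node.names:
                # "from PIL import Image, ImageDraw" binds "Image" and "ImageDraw".
                imported[alias.asname or alias.name] = node.lineno

    # Every bare name used anywhere in the module (np.zeros(...) counts as "np").
    used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}

    return {name: line for name, line in imported.items() if name not in used}


if __name__ == "__main__":
    for name, line in sorted(unused_top_level_imports(sys.argv[1]).items(),
                             key=lambda item: item[1]):
        print(f"line {line}: '{name}' imported but never used")

Run as, for example, python find_unused_imports.py app.py against the pre-commit file; it should flag the same bindings this commit deletes.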