Spaces:

nikigoli
/

countgd

Sleeping

App Files Files Community

Prasanna Sridhar committed on Jan 21

Commit

aedd89b

1 Parent(s): 2f1d1a1

Refactor app.py - extract reusable functions

Browse files

Files changed (3) hide show

.gitignore +2 -2
app.py +116 -148
requirements.txt +2 -0

.gitignore CHANGED Viewed

@@ -2,7 +2,7 @@
 env/
 __pycache__
 .python-version
 # vim
-*.sw[op]

 env/
 __pycache__
 .python-version
+*.py[od]
 # vim
+*.sw[op]

app.py CHANGED Viewed

@@ -14,11 +14,6 @@ import matplotlib.pyplot as plt
 import io
 from enum import Enum
 import os
-import subprocess
-from subprocess import call
-import shlex
-import shutil
-#os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.getcwd(), "tmp")
 cwd = os.getcwd()
 # Suppress warnings to avoid overflowing the log.
 import warnings
@@ -145,22 +140,6 @@ def build_model_and_transforms(args):
     return model, data_transform
-examples = [
-    ["strawberry.jpg", "strawberry", {"image": "strawberry.jpg"}],
-    ["strawberry.jpg", "blueberry", {"image": "strawberry.jpg"}],
-    ["bird-1.JPG", "bird", {"image": "bird-2.JPG"}],
-    ["fish.jpg", "fish", {"image": "fish.jpg"}],
-    ["women.jpg", "girl", {"image": "women.jpg"}],
-    ["women.jpg", "boy", {"image": "women.jpg"}],
-    ["balloon.jpg", "hot air balloon", {"image": "balloon.jpg"}],
-    ["deer.jpg", "deer", {"image": "deer.jpg"}],
-    ["apple.jpg", "apple", {"image": "apple.jpg"}],
-    ["egg.jpg", "egg", {"image": "egg.jpg"}],
-    ["stamp.jpg", "stamp", {"image": "stamp.jpg"}],
-    ["green-pea.jpg", "green pea", {"image": "green-pea.jpg"}],
-    ["lego.jpg", "lego", {"image": "lego.jpg"}]
-]
 # APP:
 def get_box_inputs(prompts):
     box_inputs = []
@@ -197,6 +176,107 @@ def get_ind_to_filter(text, word_ids, keywords):
     return inds_to_filter
 if __name__ == '__main__':
     parser = argparse.ArgumentParser("Counting Application", parents=[get_args_parser()])
@@ -207,54 +287,15 @@ if __name__ == '__main__':
     @spaces.GPU(duration=120)
     def count(image, text, prompts, state, device):
-        keywords = "" # do not handle this for now
-        # Handle no prompt case.
         if prompts is None:
             prompts = {"image": image, "points": []}
-        input_image, _ = transform(image, {"exemplars": torch.tensor([])})
-        input_image = input_image.unsqueeze(0).to(device)
-        exemplars = get_box_inputs(prompts["points"])
-        input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
-        input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
-        exemplars = [exemplars["exemplars"].to(device)]
-        with torch.no_grad():
-            model_output = model(
-                    nested_tensor_from_tensor_list(input_image),
-                    nested_tensor_from_tensor_list(input_image_exemplars),
-                    exemplars,
-                    [torch.tensor([0]).to(device) for _ in range(len(input_image))],
-                    captions=[text + " ."] * len(input_image),
-                )
-        ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
-        logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
-        boxes = model_output["pred_boxes"][0]
-        if len(keywords.strip()) > 0:
-            box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
-        else:
-            box_mask = logits.max(dim=-1).values > CONF_THRESH
-        logits = logits[box_mask, :].cpu().numpy()
-        boxes = boxes[box_mask, :].cpu().numpy()
-        # Plot results.
-        (w, h) = image.size
-        det_map = np.zeros((h, w))
-        det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
-        det_map = ndimage.gaussian_filter(
-            det_map, sigma=(w // 200, w // 200), order=0
-        )
-        plt.imshow(image)
-        plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
-        plt.axis('off')
-        img_buf = io.BytesIO()
-        plt.savefig(img_buf, format='png', bbox_inches='tight')
-        plt.close()
-        output_img = Image.open(img_buf)
         if AppSteps.TEXT_AND_EXEMPLARS not in state:
             exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
@@ -274,92 +315,19 @@ if __name__ == '__main__':
             main_instructions_comp = gr.Markdown(visible=True)
             step_3 = gr.Tab(visible=True)
-        out_label = "Detected instances predicted with"
-        if len(text.strip()) > 0:
-            out_label += " text"
-            if exemplars[0].size()[0] == 1:
-                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
-            elif exemplars[0].size()[0] > 1:
-                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
-            else:
-                out_label += "."
-        elif exemplars[0].size()[0] > 0:
-            if exemplars[0].size()[0] == 1:
-                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
-            else:
-                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
-        else:
-            out_label = "Nothing specified to detect."
-        return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]), new_submit_btn, gr.Tab(visible=True), step_3, state)
     @spaces.GPU
     def count_main(image, text, prompts, device):
-        keywords = "" # do not handle this for now
-        # Handle no prompt case.
         if prompts is None:
             prompts = {"image": image, "points": []}
-        input_image, _ = transform(image, {"exemplars": torch.tensor([])})
-        input_image = input_image.unsqueeze(0).to(device)
-        exemplars = get_box_inputs(prompts["points"])
-        input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
-        input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
-        exemplars = [exemplars["exemplars"].to(device)]
-        with torch.no_grad():
-            model_output = model(
-                    nested_tensor_from_tensor_list(input_image),
-                    nested_tensor_from_tensor_list(input_image_exemplars),
-                    exemplars,
-                    [torch.tensor([0]).to(device) for _ in range(len(input_image))],
-                    captions=[text + " ."] * len(input_image),
-                )
-        ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
-        logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
-        boxes = model_output["pred_boxes"][0]
-        if len(keywords.strip()) > 0:
-            box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
-        else:
-            box_mask = logits.max(dim=-1).values > CONF_THRESH
-        logits = logits[box_mask, :].cpu().numpy()
-        boxes = boxes[box_mask, :].cpu().numpy()
-        # Plot results.
-        (w, h) = image.size
-        det_map = np.zeros((h, w))
-        det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
-        det_map = ndimage.gaussian_filter(
-            det_map, sigma=(w // 200, w // 200), order=0
-        )
-        plt.imshow(image)
-        plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
-        plt.axis('off')
-        img_buf = io.BytesIO()
-        plt.savefig(img_buf, format='png', bbox_inches='tight')
-        plt.close()
-        output_img = Image.open(img_buf)
-        out_label = "Detected instances predicted with"
-        if len(text.strip()) > 0:
-            out_label += " text"
-            if exemplars[0].size()[0] == 1:
-                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
-            elif exemplars[0].size()[0] > 1:
-                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
-            else:
-                out_label += "."
-        elif exemplars[0].size()[0] > 0:
-            if exemplars[0].size()[0] == 1:
-                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
-            else:
-                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
-        else:
-            out_label = "Nothing specified to detect."
-        return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]))
     def remove_label(image):
         return gr.Image(show_label=False)
@@ -401,12 +369,12 @@ if __name__ == '__main__':
                         with gr.Accordion("Open for Further Information", open=False):
                             gr.Markdown(exemplar_img_drawing_instructions_part_2)
                     with gr.Tab("Step 1", visible=True) as step_1:
-                        input_image = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=False, width="30vw")
                         gr.Markdown('# Click "Count" to count the strawberries.')
                 with gr.Column():
                     with gr.Tab("Output Image"):
-                        detected_instances = gr.Image(label="Detected Instances", show_label='True', interactive=False, visible=True, width="40vw")
             with gr.Row():
                 input_text = gr.Textbox(label="What would you like to count?", value="strawberry", interactive=True)

 import io
 from enum import Enum
 import os
 cwd = os.getcwd()
 # Suppress warnings to avoid overflowing the log.
 import warnings
     return model, data_transform
 # APP:
 def get_box_inputs(prompts):
     box_inputs = []
     return inds_to_filter
+def generate_heatmap(image, boxes):
+    """Render a detection heatmap over ``image`` and return it as a PIL image.
+
+    ``image`` must expose ``.size`` as ``(width, height)``. ``boxes`` is read
+    through ``boxes[:, 0]`` and ``boxes[:, 1]``, which are scaled by the image
+    width and height respectively, so those columns are presumably normalised
+    x / y centres (TODO confirm against the model's box convention).
+    """
+    width, height = image.size
+    # One hot pixel per detection, addressed as (row = y, column = x).
+    rows = (height * boxes[:, 1]).astype(int)
+    cols = (width * boxes[:, 0]).astype(int)
+    heat = np.zeros((height, width))
+    heat[rows, cols] = 1
+    # Blur the impulses; the sigma is derived from the image width on both axes.
+    blur = width // 200
+    heat = ndimage.gaussian_filter(heat, sigma=(blur, blur), order=0)
+    # Draw the photo first, then the heatmap on top as a translucent (h, w, 1) layer.
+    plt.imshow(image)
+    plt.imshow(heat[:, :, None], 'jet', interpolation='none', alpha=0.7)
+    plt.axis('off')
+    # Save the current figure into an in-memory PNG rather than a file on disk.
+    buffer = io.BytesIO()
+    plt.savefig(buffer, format='png', bbox_inches='tight')
+    plt.close()
+    return Image.open(buffer)
+def generate_output_label(text, num_exemplars):
+    """Build the caption describing which inputs were used for the prediction.
+
+    The caption mentions the text prompt when ``text`` is not blank and the
+    number of visual exemplars when ``num_exemplars`` is positive. When neither
+    is provided the caption is "Nothing specified to detect.".
+    """
+    text_given = bool(text.strip())
+    if not text_given and not num_exemplars > 0:
+        return "Nothing specified to detect."
+    parts = ["Detected instances predicted with"]
+    if text_given:
+        parts.append(" text")
+        if not (num_exemplars == 1 or num_exemplars > 1):
+            # Text prompt only: close the sentence right away.
+            parts.append(".")
+            return "".join(parts)
+        parts.append(" and ")
+    else:
+        parts.append(" ")
+    # Singular wording for exactly one exemplar, plural otherwise.
+    noun = " visual exemplar." if num_exemplars == 1 else " visual exemplars."
+    parts.append(str(num_exemplars) + noun)
+    return "".join(parts)
+def preprocess(image, input_prompts=None):
+    """Transform the input image and the exemplar prompts for the model.
+
+    Args:
+        image: Image passed to the module-level ``transform``.
+        input_prompts: Optional dict with an ``"image"`` entry (the exemplar
+            image) and a ``"points"`` entry (raw prompter output handed to
+            ``get_box_inputs``). When ``None``, ``image`` itself is used as the
+            exemplar image with no points.
+
+    Returns:
+        A tuple ``(input_image, input_image_exemplar, exemplar)``: the
+        transformed input image, the transformed exemplar image, and the
+        transformed exemplar boxes taken from the ``"exemplars"`` entry.
+    """
+    # Identity test: ``None`` is a singleton, so ``is`` is the correct check.
+    if input_prompts is None:
+        prompts = {"image": image, "points": []}
+    else:
+        prompts = input_prompts
+    input_image, _ = transform(image, None)
+    exemplar = get_box_inputs(prompts["points"])
+    # Wrapping exemplar in a dictionary to apply only relevant transforms
+    input_image_exemplar, exemplar = transform(prompts['image'], {"exemplars": torch.tensor(exemplar)})
+    exemplar = exemplar["exemplars"]
+    return input_image, input_image_exemplar, exemplar
+def get_boxes_from_prediction(model_output, text, keywords = ""):
+    """Select the predicted boxes whose scores clear ``CONF_THRESH``.
+
+    Reads ``"token"``, ``"pred_logits"`` and ``"pred_boxes"`` from
+    ``model_output`` and only looks at the first entry of each. Returns
+    ``(boxes, logits)`` for the kept predictions as NumPy arrays.
+    """
+    token_inds = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
+    scores = model_output["pred_logits"].sigmoid()[0][:, token_inds]
+    candidates = model_output["pred_boxes"][0]
+    if keywords.strip():
+        # With keywords, every selected token must clear the threshold.
+        keep = (scores > CONF_THRESH).sum(dim=-1) == len(token_inds)
+    else:
+        # Without keywords, the best-scoring selected token decides.
+        keep = scores.max(dim=-1).values > CONF_THRESH
+    return candidates[keep, :].cpu().numpy(), scores[keep, :].cpu().numpy()
+def predict(model, image, text, prompts, device):
+    """Run the counting model on one image and return the kept detections.
+
+    Args:
+        model: Callable invoked with the batched image, the batched exemplar
+            image, the exemplar list, one ``torch.tensor([0])`` per batch item,
+            and ``captions``.
+        image: Input image, forwarded to ``preprocess``.
+        text: Text prompt; ``" ."`` is appended to form the caption.
+        prompts: Exemplar prompts, forwarded to ``preprocess``.
+        device: Device that the tensors are moved to before the forward pass.
+
+    Returns:
+        The ``(boxes, logits)`` pair produced by ``get_boxes_from_prediction``.
+    """
+    keywords = ""  # do not handle this for now
+    input_image, input_image_exemplar, exemplar = preprocess(image, prompts)
+    # Add a leading batch dimension (a batch of one) and move to the device.
+    input_images = input_image.unsqueeze(0).to(device)
+    input_image_exemplars = input_image_exemplar.unsqueeze(0).to(device)
+    exemplars = [exemplar.to(device)]
+    batch_size = len(input_images)
+    # Inference only: no gradient tracking is needed.
+    with torch.no_grad():
+        model_output = model(
+            nested_tensor_from_tensor_list(input_images),
+            nested_tensor_from_tensor_list(input_image_exemplars),
+            exemplars,
+            [torch.tensor([0]).to(device) for _ in range(batch_size)],
+            captions=[text + " ."] * batch_size,
+        )
+    return get_boxes_from_prediction(model_output, text, keywords)
+# Example rows. Each row is [image file name, text prompt, prompter dict with an
+# "image" entry]. Presumably consumed by the Gradio examples widget; the UI code
+# that uses it is not shown here, so confirm there.
+# NOTE(review): the "bird" row pairs bird-1.JPG with bird-2.JPG as the exemplar
+# image, unlike every other row; confirm this is intentional.
+examples = [
+    ["strawberry.jpg", "strawberry", {"image": "strawberry.jpg"}],
+    ["strawberry.jpg", "blueberry", {"image": "strawberry.jpg"}],
+    ["bird-1.JPG", "bird", {"image": "bird-2.JPG"}],
+    ["fish.jpg", "fish", {"image": "fish.jpg"}],
+    ["women.jpg", "girl", {"image": "women.jpg"}],
+    ["women.jpg", "boy", {"image": "women.jpg"}],
+    ["balloon.jpg", "hot air balloon", {"image": "balloon.jpg"}],
+    ["deer.jpg", "deer", {"image": "deer.jpg"}],
+    ["apple.jpg", "apple", {"image": "apple.jpg"}],
+    ["egg.jpg", "egg", {"image": "egg.jpg"}],
+    ["stamp.jpg", "stamp", {"image": "stamp.jpg"}],
+    ["green-pea.jpg", "green pea", {"image": "green-pea.jpg"}],
+    ["lego.jpg", "lego", {"image": "lego.jpg"}]
+]
 if __name__ == '__main__':
     parser = argparse.ArgumentParser("Counting Application", parents=[get_args_parser()])
     @spaces.GPU(duration=120)
     def count(image, text, prompts, state, device):
         if prompts is None:
             prompts = {"image": image, "points": []}
+        boxes, _ = predict(model, image, text, prompts, device)
+        count = len(boxes)
+        output_img = generate_heatmap(image, boxes)
+        num_exemplars = len(get_box_inputs(prompts["points"]))
+        out_label = generate_output_label(text, num_exemplars)
         if AppSteps.TEXT_AND_EXEMPLARS not in state:
             exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
             main_instructions_comp = gr.Markdown(visible=True)
             step_3 = gr.Tab(visible=True)
+        return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=count), new_submit_btn, gr.Tab(visible=True), step_3, state)
     @spaces.GPU
     def count_main(image, text, prompts, device):
         if prompts is None:
             prompts = {"image": image, "points": []}
+        boxes, _ = predict(model, image, text, prompts, device)
+        count = len(boxes)
+        output_img = generate_heatmap(image, boxes)
+        num_exemplars = len(get_box_inputs(prompts["points"]))
+        out_label = generate_output_label(text, num_exemplars)
+        return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=count))
     def remove_label(image):
         return gr.Image(show_label=False)
                         with gr.Accordion("Open for Further Information", open=False):
                             gr.Markdown(exemplar_img_drawing_instructions_part_2)
                     with gr.Tab("Step 1", visible=True) as step_1:
+                        input_image = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=False)
                         gr.Markdown('# Click "Count" to count the strawberries.')
                 with gr.Column():
                     with gr.Tab("Output Image"):
+                        detected_instances = gr.Image(label="Detected Instances", show_label='True', interactive=False, visible=True)
             with gr.Row():
                 input_text = gr.Textbox(label="What would you like to count?", value="strawberry", interactive=True)

requirements.txt CHANGED Viewed

@@ -12,6 +12,8 @@ ushlex
 gradio>=4.0.0,<5
 gradio_image_prompter-0.1.0-py3-none-any.whl
 spaces
 --extra-index-url https://download.pytorch.org/whl/cu121
 torch<2.6
 torchvision

 gradio>=4.0.0,<5
 gradio_image_prompter-0.1.0-py3-none-any.whl
 spaces
+filetype
+tqdm
 --extra-index-url https://download.pytorch.org/whl/cu121
 torch<2.6
 torchvision