Spaces: Running on Zero
Update app.py
Cleanup + cuda init update
app.py CHANGED
@@ -21,16 +21,33 @@ def workaround_fixed_get_imports(filename: str | os.PathLike) -> list[str]:
         imports.remove("flash_attn")
     return imports

-with patch("transformers.dynamic_module_utils.get_imports", workaround_fixed_get_imports):
+def load_model():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    with patch("transformers.dynamic_module_utils.get_imports", workaround_fixed_get_imports):
+        model = AutoModelForCausalLM.from_pretrained(
+            "microsoft/Florence-2-large-ft",
+            trust_remote_code=True
+        ).to(device).eval()
+        processor = AutoProcessor.from_pretrained(
+            "microsoft/Florence-2-large-ft",
+            trust_remote_code=True
+        )
+    return model, processor, device
+
+model = None
+processor = None
+device = None
+
+@spaces.GPU
+def initialize_model():
+    global model, processor, device
+    model, processor, device = load_model()

 def run_example(task_prompt, image, text_input=None):
+    global model, processor, device
+    if model is None or processor is None:
+        initialize_model()
+
     prompt = task_prompt if text_input is None else task_prompt + text_input
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
     with torch.inference_mode():
@@ -38,6 +55,9 @@ def run_example(task_prompt, image, text_input=None):
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
     return processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.size[0], image.size[1]))

+colormap = ['blue', 'orange', 'green', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan', 'red',
+            'lime', 'indigo', 'violet', 'aqua', 'magenta', 'coral', 'gold', 'tan', 'skyblue']
+
 def fig_to_pil(fig):
     buf = io.BytesIO()
     fig.savefig(buf, format='png', dpi=300, bbox_inches='tight')
@@ -85,7 +105,7 @@ def draw_ocr_bboxes(image, prediction):
     bboxes, labels = prediction['quad_boxes'], prediction['labels']
     for box, label in zip(bboxes, labels):
         color = random.choice(colormap)
-        box_array = np.array(box).reshape(-1, 2)
+        box_array = np.array(box).reshape(-1, 2)
         polygon = patches.Polygon(box_array, edgecolor=color, fill=False, linewidth=2)
         ax.add_patch(polygon)
         plt.text(box_array[0, 0], box_array[0, 1], label, color='white', fontsize=10, bbox=dict(facecolor=color, alpha=0.8))
@@ -101,7 +121,7 @@ def plot_bbox(image, data):
         draw.text((x1, y1), label, fill="white")
     return np.array(img_draw)

-@spaces.GPU
+@spaces.GPU
 def process_video(input_video_path, task_prompt):
     cap = cv2.VideoCapture(input_video_path)
     if not cap.isOpened():
@@ -118,7 +138,7 @@ def process_video(input_video_path, task_prompt):

     processed_frames = 0
     frame_results = []
-    color_map = {}
+    color_map = {}

     def get_color(label):
         if label not in color_map:
@@ -229,6 +249,10 @@ def process_video_p(input_video, task, text_input):
         return None, "Error: Video processing failed. Check logs above for info.", str(frame_results)
     return result, result, str(frame_results)

+@spaces.GPU
+def process_image_with_gpu(image, task, text):
+    return process_image(image, task, text)
+
 with gr.Blocks() as demo:
     gr.HTML("<h1><center>Microsoft Florence-2-large-ft</center></h1>")

@@ -300,7 +324,16 @@ with gr.Blocks() as demo:

     video_task_dropdown.change(fn=update_video_text_input, inputs=video_task_dropdown, outputs=video_text_input)

-    submit_btn.click(
+    submit_btn.click(
+        fn=process_image_with_gpu,
+        inputs=[input_img, task_dropdown, text_input],
+        outputs=[output_text, output_image]
+    )
+
+    video_submit_btn.click(
+        fn=process_video_p,
+        inputs=[input_video, video_task_dropdown, video_text_input],
+        outputs=[output_video, output_video, frame_results_output]
+    )

 demo.launch()
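
Note on the pattern: the point of this change is that nothing touches CUDA at import time any more. Device selection and model loading now live in load_model(), which is reached only from @spaces.GPU-decorated entry points (initialize_model, process_video, process_image_with_gpu), and the loaded model/processor/device are cached in module globals for later calls. On a ZeroGPU Space a GPU is only guaranteed to be attached while a decorated function is running, which is why the initialization has to be lazy. The sketch below condenses the same lazy-initialization pattern into a minimal, self-contained demo: the checkpoint name, the flash_attn import workaround, and the shape of load_model() follow the diff, while the Gradio layout, the answer() wrapper, and the generation arguments are illustrative assumptions rather than the app's actual code.

    # Minimal sketch of the lazy ZeroGPU initialization pattern used in this commit.
    # Assumes it runs inside a Hugging Face Space (the `spaces` package is provided
    # there); everything not named in the diff is an illustrative assumption.
    from unittest.mock import patch

    import gradio as gr
    import spaces
    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor
    from transformers.dynamic_module_utils import get_imports

    CHECKPOINT = "microsoft/Florence-2-large-ft"

    # Module level: placeholders only, no CUDA work at import time.
    model = None
    processor = None
    device = None

    def fixed_get_imports(filename):
        # Same idea as workaround_fixed_get_imports in app.py: Florence-2's remote
        # code declares flash_attn, which is not installed, so drop it.
        imports = get_imports(filename)
        if str(filename).endswith("modeling_florence2.py") and "flash_attn" in imports:
            imports.remove("flash_attn")
        return imports

    def load_model():
        # Called lazily from a @spaces.GPU context, never at import time.
        dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
            mdl = AutoModelForCausalLM.from_pretrained(
                CHECKPOINT, trust_remote_code=True
            ).to(dev).eval()
            proc = AutoProcessor.from_pretrained(CHECKPOINT, trust_remote_code=True)
        return mdl, proc, dev

    @spaces.GPU  # ZeroGPU attaches a GPU only while this function runs
    def answer(image, prompt):
        global model, processor, device
        if model is None:  # first call pays the one-time loading cost
            model, processor, device = load_model()
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
        with torch.inference_mode():
            # Generation arguments are assumptions, not taken from the diff.
            ids = model.generate(**inputs, max_new_tokens=256)
        return processor.batch_decode(ids, skip_special_tokens=True)[0]

    with gr.Blocks() as demo:
        img = gr.Image(type="pil", label="Image")
        box = gr.Textbox(label="Task prompt, e.g. <CAPTION>")
        out = gr.Textbox(label="Raw model output")
        gr.Button("Run").click(fn=answer, inputs=[img, box], outputs=out)

    demo.launch()

Because model, processor, and device are cached in module globals, only the first decorated call pays the download-and-load cost; subsequent clicks reuse the already-initialized objects, mirroring the "if model is None or processor is None: initialize_model()" guard the commit adds to run_example.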