Spaces:

not-lain
/

gpu-utils

Paused

App Files Community

not-lain committed on Mar 28

Commit

5919468

1 Parent(s): ffa1375

remove sam2

Browse files

Files changed (2) hide show

app.py +101 -54
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -3,11 +3,11 @@ import spaces
 import torch
 from loadimg import load_img
 from torchvision import transforms
-from transformers import AutoModelForImageSegmentation
 from diffusers import FluxFillPipeline
 from PIL import Image, ImageOps
-from sam2.sam2_image_predictor import SAM2ImagePredictor
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
@@ -134,36 +134,36 @@ def rmbg(image=None, url=None):
     return image
-def mask_generation(image=None, d=None):
-    # use bfloat16 for the entire notebook
-    # torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
-    # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
-    # if torch.cuda.get_device_properties(0).major >= 8:
-    #     torch.backends.cuda.matmul.allow_tf32 = True
-    #     torch.backends.cudnn.allow_tf32 = True
-    d = eval(d)  # convert this to dictionary
-    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
-        predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2.1-hiera-large")
-        predictor.set_image(image)
-        input_point = np.array(d["input_points"])
-        input_label = np.array(d["input_labels"])
-        masks, scores, logits = predictor.predict(
-            point_coords=input_point,
-            point_labels=input_label,
-            multimask_output=True,
-        )
-    sorted_ind = np.argsort(scores)[::-1]
-    masks = masks[sorted_ind]
-    scores = scores[sorted_ind]
-    logits = logits[sorted_ind]
-    out = []
-    for i in range(len(masks)):
-        m = Image.fromarray(masks[i] * 255).convert("L")
-        comp = Image.composite(image, m, m)
-        out.append((comp, f"image {i}"))
-    return out
 def erase(image=None, mask=None):
@@ -173,6 +173,25 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)
 @spaces.GPU(duration=120)
 def main(*args):
     api_num = args[0]
@@ -183,10 +202,12 @@ def main(*args):
         return outpaint(*args)
     elif api_num == 3:
         return inpaint(*args)
-    elif api_num == 4:
-        return mask_generation(*args)
     elif api_num == 5:
         return erase(*args)
 rmbg_tab = gr.Interface(
@@ -241,24 +262,24 @@ inpaint_tab = gr.Interface(
 )
-sam2_tab = gr.Interface(
-    main,
-    inputs=[
-        gr.Number(4, interactive=False),
-        gr.Image(type="pil"),
-        gr.Text(),
-    ],
-    outputs=gr.Gallery(),
-    examples=[
-        [
-            4,
-            "./assets/truck.jpg",
-            '{"input_points": [[500, 375], [1125, 625]], "input_labels": [1, 0]}',
-        ]
-    ],
-    api_name="sam2",
-    cache_examples=False,
-)
 erase_tab = gr.Interface(
     main,
@@ -279,9 +300,35 @@ erase_tab = gr.Interface(
     cache_examples=False,
 )
 demo = gr.TabbedInterface(
-    [rmbg_tab, outpaint_tab, inpaint_tab, sam2_tab, erase_tab],
-    ["remove background", "outpainting", "inpainting", "sam2", "erase"],
     title="Utilities that require GPU",
 )

 import torch
 from loadimg import load_img
 from torchvision import transforms
+from transformers import AutoModelForImageSegmentation, pipeline
 from diffusers import FluxFillPipeline
 from PIL import Image, ImageOps
+# from sam2.sam2_image_predictor import SAM2ImagePredictor
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
     return image
+# def mask_generation(image=None, d=None):
+#     # use bfloat16 for the entire notebook
+#     # torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
+#     # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+#     # if torch.cuda.get_device_properties(0).major >= 8:
+#     #     torch.backends.cuda.matmul.allow_tf32 = True
+#     #     torch.backends.cudnn.allow_tf32 = True
+#     d = eval(d)  # convert this to dictionary
+#     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+#         predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2.1-hiera-large")
+#         predictor.set_image(image)
+#         input_point = np.array(d["input_points"])
+#         input_label = np.array(d["input_labels"])
+#         masks, scores, logits = predictor.predict(
+#             point_coords=input_point,
+#             point_labels=input_label,
+#             multimask_output=True,
+#         )
+#     sorted_ind = np.argsort(scores)[::-1]
+#     masks = masks[sorted_ind]
+#     scores = scores[sorted_ind]
+#     logits = logits[sorted_ind]
+#     out = []
+#     for i in range(len(masks)):
+#         m = Image.fromarray(masks[i] * 255).convert("L")
+#         comp = Image.composite(image, m, m)
+#         out.append((comp, f"image {i}"))
+#     return out
 def erase(image=None, mask=None):
     return simple_lama(image, mask)
+# Initialize Whisper model
+whisper = pipeline(
+    task="automatic-speech-recognition",
+    model="openai/whisper-large-v3",
+    chunk_length_s=30,
+    device="cuda" if torch.cuda.is_available() else "cpu",
+)
+def transcribe(audio, task="transcribe"):
+    if audio is None:
+        raise gr.Error("No audio file submitted!")
+    text = whisper(
+        audio, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True
+    )["text"]
+    return text
 @spaces.GPU(duration=120)
 def main(*args):
     api_num = args[0]
         return outpaint(*args)
     elif api_num == 3:
         return inpaint(*args)
+    # elif api_num == 4:
+    #     return mask_generation(*args)
     elif api_num == 5:
         return erase(*args)
+    elif api_num == 6:
+        return transcribe(*args)
 rmbg_tab = gr.Interface(
 )
+# sam2_tab = gr.Interface(
+#     main,
+#     inputs=[
+#         gr.Number(4, interactive=False),
+#         gr.Image(type="pil"),
+#         gr.Text(),
+#     ],
+#     outputs=gr.Gallery(),
+#     examples=[
+#         [
+#             4,
+#             "./assets/truck.jpg",
+#             '{"input_points": [[500, 375], [1125, 625]], "input_labels": [1, 0]}',
+#         ]
+#     ],
+#     api_name="sam2",
+#     cache_examples=False,
+# )
 erase_tab = gr.Interface(
     main,
     cache_examples=False,
 )
+transcribe_tab = gr.Interface(
+    fn=main,
+    inputs=[
+        gr.Number(6, interactive=False),
+        gr.Audio(source="upload", type="filepath"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs="text",
+    api_name="transcribe",
+    description="Upload an audio file to extract text using Whisper Large V3",
+)
 demo = gr.TabbedInterface(
+    [
+        rmbg_tab,
+        outpaint_tab,
+        inpaint_tab,
+        #  sam2_tab,
+        erase_tab,
+        transcribe_tab,
+    ],
+    [
+        "remove background",
+        "outpainting",
+        "inpainting",
+        #  "sam2",
+        "erase",
+        "transcribe",
+    ],
     title="Utilities that require GPU",
 )

requirements.txt CHANGED Viewed

@@ -19,6 +19,6 @@ kornia
 huggingface_hub
 sentencepiece
 einops
-git+https://github.com/facebookresearch/sam2.git
 matplotlib
 simple-lama-inpainting

 huggingface_hub
 sentencepiece
 einops
+# git+https://github.com/facebookresearch/sam2.git
 matplotlib
 simple-lama-inpainting