Spaces: Running on Zero

update gpu

app.py CHANGED
@@ -3,7 +3,6 @@ from einops import rearrange
 import torch
 import torch.nn.functional as F
 from PIL import Image
-import torchvision.transforms as transforms
 from torch import nn
 import numpy as np
 import os
@@ -17,6 +16,15 @@ USE_CUDA = torch.cuda.is_available()

 print("CUDA is available:", USE_CUDA)

+def transform_images(images, resolution=(1024, 1024)):
+    images = [image.convert("RGB").resize(resolution) for image in images]
+    # Convert to torch tensor
+    images = [torch.tensor(np.array(image).transpose(2, 0, 1)).float() / 255 for image in images]
+    # Normalize
+    images = [(image - 0.5) / 0.5 for image in images]
+    images = torch.stack(images)
+    return images
+
 class MobileSAM(nn.Module):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -139,19 +147,12 @@ mobilesam = MobileSAM()

 def image_mobilesam_feature(
     images,
-    resolution=(1024, 1024),
     node_type="block",
     layer=-1,
 ):

-    transform = transforms.Compose(
-        [
-            transforms.Resize(resolution),
-            transforms.ToTensor(),
-            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
-        ]
-    )
-
+    if USE_CUDA:
+        images = images.cuda()

     feat_extractor = mobilesam
     if USE_CUDA:
@@ -159,12 +160,9 @@ def image_mobilesam_feature(

     # attn_outputs, mlp_outputs, block_outputs = [], [], []
     outputs = []
-    for i, image in enumerate(images):
-        torch_image = transform(image)
-        if USE_CUDA:
-            torch_image = torch_image.cuda()
+    for i in range(images.shape[0]):
         attn_output, mlp_output, block_output = feat_extractor(
-            torch_image.unsqueeze(0)
+            images[i].unsqueeze(0)
         )
         out_dict = {
             "attn": attn_output,
@@ -251,18 +249,12 @@ sam = SAM()

 def image_sam_feature(
     images,
-    resolution=(1024, 1024),
     node_type="block",
     layer=-1,
 ):

-    transform = transforms.Compose(
-        [
-            transforms.Resize(resolution),
-            transforms.ToTensor(),
-            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
-        ]
-    )
+    if USE_CUDA:
+        images = images.cuda()

     feat_extractor = sam
     if USE_CUDA:
@@ -270,12 +262,9 @@ def image_sam_feature(

     # attn_outputs, mlp_outputs, block_outputs = [], [], []
     outputs = []
-    for i, image in enumerate(images):
-        torch_image = transform(image)
-        if USE_CUDA:
-            torch_image = torch_image.cuda()
+    for i in range(images.shape[0]):
         attn_output, mlp_output, block_output = feat_extractor(
-            torch_image.unsqueeze(0)
+            images[i].unsqueeze(0)
         )
         out_dict = {
             "attn": attn_output,
@@ -338,27 +327,20 @@ class DiNOv2(torch.nn.Module):

 dinov2 = DiNOv2()

-def image_dino_feature(images,
+def image_dino_feature(images, node_type="block", layer=-1):

-    transform = transforms.Compose(
-        [
-            transforms.Resize(resolution),
-            transforms.ToTensor(),
-            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
-        ]
-    )
+    if USE_CUDA:
+        images = images.cuda()

     feat_extractor = dinov2
     if USE_CUDA:
         feat_extractor = feat_extractor.cuda()

+    # attn_outputs, mlp_outputs, block_outputs = [], [], []
     outputs = []
-    for i, image in enumerate(images):
-        torch_image = transform(image)
-        if USE_CUDA:
-            torch_image = torch_image.cuda()
+    for i in range(images.shape[0]):
         attn_output, mlp_output, block_output = feat_extractor(
-            torch_image.unsqueeze(0)
+            images[i].unsqueeze(0)
         )
         out_dict = {
             "attn": attn_output,
@@ -443,33 +425,20 @@ class CLIP(torch.nn.Module):
 clip = CLIP()

 def image_clip_feature(
-    images,
+    images, node_type="block", layer=-1
 ):
-    if isinstance(images, list):
-        assert isinstance(images[0], Image.Image), "Input must be a list of PIL images."
-    else:
-        assert isinstance(images, Image.Image), "Input must be a PIL image."
-        images = [images]
-
-    transform = transforms.Compose(
-        [
-            transforms.Resize(resolution),
-            transforms.ToTensor(),
-            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
-        ]
-    )
+    if USE_CUDA:
+        images = images.cuda()

     feat_extractor = clip
     if USE_CUDA:
         feat_extractor = feat_extractor.cuda()

+    # attn_outputs, mlp_outputs, block_outputs = [], [], []
     outputs = []
-    for i, image in enumerate(images):
-        torch_image = transform(image)
-        if USE_CUDA:
-            torch_image = torch_image.cuda()
+    for i in range(images.shape[0]):
         attn_output, mlp_output, block_output = feat_extractor(
-            torch_image.unsqueeze(0)
+            images[i].unsqueeze(0)
         )
         out_dict = {
             "attn": attn_output,
@@ -527,27 +496,35 @@ def compute_hash(*args, **kwargs):


 @spaces.GPU(duration=30)
-def run_model_on_image(
+def run_model_on_image(images, model_name="sam", node_type="block", layer=-1):
     global USE_CUDA
     USE_CUDA = True

     if model_name == "SAM(sam_vit_b)":
         if not USE_CUDA:
             gr.warning("GPU not detected. Running SAM on CPU, ~30s/image.")
-        result = image_sam_feature(
+        result = image_sam_feature(images, node_type=node_type, layer=layer)
     elif model_name == 'MobileSAM':
-        result = image_mobilesam_feature(
+        result = image_mobilesam_feature(images, node_type=node_type, layer=layer)
     elif model_name == "DiNO(dinov2_vitb14_reg)":
-        result = image_dino_feature(
+        result = image_dino_feature(images, node_type=node_type, layer=layer)
     elif model_name == "CLIP(openai/clip-vit-base-patch16)":
-        result = image_clip_feature(
+        result = image_clip_feature(images, node_type=node_type, layer=layer)
     else:
         raise ValueError(f"Model {model_name} not supported.")

     USE_CUDA = False
     return result

-def extract_features(images, model_name="
+def extract_features(images, model_name="mobilesam", node_type="block", layer=-1):
+    resolution_dict = {
+        "mobilesam": (1024, 1024),
+        "sam(sam_vit_b)": (1024, 1024),
+        "dinov2(dinov2_vitb14_reg)": (448, 448),
+        "clip(openai/clip-vit-base-patch16)": (224, 224),
+    }
+    images = transform_images(images, resolution=resolution_dict[model_name])
+
     # Compute the cache key
     cache_key = compute_hash(images, model_name, node_type, layer)

@@ -556,7 +533,7 @@ def extract_features(images, model_name="sam", node_type="block", layer=-1):
         print("Cache hit!")
         return cache[cache_key]

-    result = run_model_on_image(images
+    result = run_model_on_image(images, model_name=model_name, node_type=node_type, layer=layer)

     # Store the result in the cache
     cache[cache_key] = result
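For context on what the new preprocessing does, here is a minimal, self-contained sketch of the added transform_images helper applied to a dummy PIL image. The dummy image, the chosen resolution, and the printed checks are illustrative only, not part of the Space; note that the helper normalizes with (x - 0.5) / 0.5, mapping [0, 1] to [-1, 1], whereas the deleted torchvision pipeline normalized with the ImageNet mean/std.

import numpy as np
import torch
from PIL import Image


def transform_images(images, resolution=(1024, 1024)):
    # Resize PIL images, convert to CHW float tensors in [0, 1], then map to [-1, 1].
    images = [image.convert("RGB").resize(resolution) for image in images]
    images = [torch.tensor(np.array(image).transpose(2, 0, 1)).float() / 255 for image in images]
    images = [(image - 0.5) / 0.5 for image in images]
    images = torch.stack(images)
    return images


# Illustrative usage with a dummy all-black image (assumption, not the Space's code).
dummy = Image.fromarray(np.zeros((64, 48, 3), dtype=np.uint8))
batch = transform_images([dummy], resolution=(224, 224))
print(batch.shape)         # torch.Size([1, 3, 224, 224])
print(batch.min().item())  # -1.0 for an all-black input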
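After this change the four feature functions share the same structure: the pre-batched tensor is moved to the GPU when available, and the extractor is applied one image at a time via images[i].unsqueeze(0). A minimal sketch of that shared loop follows, using a stand-in module in place of the real MobileSAM/SAM/DiNOv2/CLIP backbones; the stand-in, its output shapes, and the "mlp"/"block" dictionary keys are assumptions for illustration.

import torch
from torch import nn

USE_CUDA = torch.cuda.is_available()


class DummyExtractor(nn.Module):
    # Stand-in for mobilesam / sam / dinov2 / clip: returns an (attn, mlp, block) triple.
    def forward(self, x):
        feat = x.mean(dim=1, keepdim=True)
        return feat, feat, feat


def image_features(images, feat_extractor):
    # Move the batch and the extractor to the GPU when available, then loop per image.
    if USE_CUDA:
        images = images.cuda()
        feat_extractor = feat_extractor.cuda()
    outputs = []
    for i in range(images.shape[0]):
        attn_output, mlp_output, block_output = feat_extractor(images[i].unsqueeze(0))
        outputs.append({"attn": attn_output, "mlp": mlp_output, "block": block_output})
    return outputs


batch = torch.zeros(2, 3, 224, 224)         # e.g. the output of transform_images
feats = image_features(batch, DummyExtractor())
print(len(feats), feats[0]["block"].shape)  # 2 torch.Size([1, 1, 224, 224])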