Spaces: Build error

joselobenitezg committed · Commit 2e49a94 · Parent(s): 607956f

add depth

Files changed:
- app.py +13 -2
- inference/depth.py +211 -0
app.py
CHANGED
@@ -7,13 +7,13 @@ import spaces
 
 from inference.seg import process_image_or_video as process_seg
 from inference.pose import process_image_or_video as process_pose
+from inference.depth import process_image_or_video as process_depth
 from config import SAPIENS_LITE_MODELS_PATH
 
 def update_model_choices(task):
     model_choices = list(SAPIENS_LITE_MODELS_PATH[task.lower()].keys())
     return gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None)
 
-@spaces.GPU(duration=12)
 def process_image(input_image, task, version):
     if isinstance(input_image, np.ndarray):
         input_image = Image.fromarray(input_image)

@@ -22,6 +22,8 @@ def process_image(input_image, task, version):
         result = process_seg(input_image, task=task.lower(), version=version)
     elif task.lower() == 'pose':
         result = process_pose(input_image, task=task.lower(), version=version)
+    elif task.lower() == 'depth':
+        result = process_depth(input_image, task=task.lower(), version=version)
     else:
         result = None
         print(f"Tarea no soportada: {task}")

@@ -42,7 +44,16 @@ def process_video(input_video, task, version):
             break
 
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-
+        if task.lower() == 'seg':
+            processed_frame = process_seg(frame_rgb, task=task.lower(), version=version)
+        elif task.lower() == 'pose':
+            processed_frame = process_pose(frame_rgb, task=task.lower(), version=version)
+        elif task.lower() == 'depth':
+            processed_frame = process_depth(frame_rgb, task=task.lower(), version=version)
+        else:
+            processed_frame = None
+            print(f"Tarea no soportada: {task}")
+            break
 
         if processed_frame is not None:
            processed_frame_bgr = cv2.cvtColor(np.array(processed_frame), cv2.COLOR_RGB2BGR)
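For reference, a minimal sketch of how the new branch is reached through the Gradio handler above. It assumes a 'depth' entry exists in SAPIENS_LITE_MODELS_PATH with a 'sapiens_0.3b' version (the default used in inference/depth.py); the input file name is hypothetical.

    # Hedged sketch, not part of the commit: exercising process_image with the new task.
    from PIL import Image

    image = Image.open("example.jpg")  # hypothetical input file, any RGB image
    result = process_image(image, task="Depth", version="sapiens_0.3b")  # task is lowercased internally
    if result is not None:
        result.save("depth_colored.png")  # process_depth returns a colorized depth map as a PIL image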
inference/depth.py
CHANGED
@@ -0,0 +1,211 @@
# # Example usage
# import torch
# import numpy as np
# from PIL import Image
# from torchvision import transforms
# from config import LABELS_TO_IDS
# from utils.vis_utils import visualize_mask_with_overlay

# import torch
# import torch.nn.functional as F
# import numpy as np
# import cv2

# TASK = 'depth'
# VERSION = 'sapiens_0.3b'

# model_path = get_model_path(TASK, VERSION)
# print(model_path)

# model = torch.jit.load(model_path)
# model.eval()
# model.to("cuda")


# def get_depth(image, depth_model, input_shape=(3, 1024, 768), device="cuda"):
#     # Preprocess the image
#     img = preprocess_image(image, input_shape)

#     # Run the model
#     with torch.no_grad():
#         result = depth_model(img.to(device))

#     # Post-process the output
#     depth_map = post_process_depth(result, (image.shape[0], image.shape[1]))

#     # Visualize the depth map
#     depth_image = visualize_depth(depth_map)

#     return depth_image, depth_map

# def preprocess_image(image, input_shape):
#     img = cv2.resize(image, (input_shape[2], input_shape[1]), interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
#     img = torch.from_numpy(img)
#     img = img[[2, 1, 0], ...].float()
#     mean = torch.tensor([123.5, 116.5, 103.5]).view(-1, 1, 1)
#     std = torch.tensor([58.5, 57.0, 57.5]).view(-1, 1, 1)
#     img = (img - mean) / std
#     return img.unsqueeze(0)

# def post_process_depth(result, original_shape):
#     # Check the dimensionality of the result
#     if result.dim() == 3:
#         result = result.unsqueeze(0)
#     elif result.dim() == 4:
#         pass
#     else:
#         raise ValueError(f"Unexpected result dimension: {result.dim()}")

#     # Ensure we're interpolating to the correct dimensions
#     seg_logits = F.interpolate(result, size=original_shape, mode="bilinear", align_corners=False).squeeze(0)
#     depth_map = seg_logits.data.float().cpu().numpy()

#     # If depth_map has an extra dimension, squeeze it
#     if depth_map.ndim == 3 and depth_map.shape[0] == 1:
#         depth_map = depth_map.squeeze(0)

#     return depth_map

# def visualize_depth(depth_map):
#     # Normalize the depth map
#     min_val, max_val = np.nanmin(depth_map), np.nanmax(depth_map)
#     depth_normalized = 1 - ((depth_map - min_val) / (max_val - min_val))

#     # Convert to uint8
#     depth_normalized = (depth_normalized * 255).astype(np.uint8)

#     # Apply colormap
#     depth_colored = cv2.applyColorMap(depth_normalized, cv2.COLORMAP_INFERNO)

#     return depth_colored

# # You can add the surface normal calculation if needed
# def calculate_surface_normal(depth_map):
#     kernel_size = 7
#     grad_x = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 1, 0, ksize=kernel_size)
#     grad_y = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 0, 1, ksize=kernel_size)
#     z = np.full(grad_x.shape, -1)
#     normals = np.dstack((-grad_x, -grad_y, z))

#     normals_mag = np.linalg.norm(normals, axis=2, keepdims=True)
#     with np.errstate(divide="ignore", invalid="ignore"):
#         normals_normalized = normals / (normals_mag + 1e-5)

#     normals_normalized = np.nan_to_num(normals_normalized, nan=-1, posinf=-1, neginf=-1)
#     normal_from_depth = ((normals_normalized + 1) / 2 * 255).astype(np.uint8)
#     normal_from_depth = normal_from_depth[:, :, ::-1]  # RGB to BGR for cv2

#     return normal_from_depth

# from utils.vis_utils import resize_image

# pil_image = Image.open('/home/user/app/assets/image.webp')

# # Load and process an image
# image = cv2.imread('/home/user/app/assets/frame.png')
# depth_image, depth_map = get_depth(image, model)

# surface_normal = calculate_surface_normal(depth_map)
# cv2.imwrite("output_surface_normal.jpg", surface_normal)
# # Save the results
# output_im = cv2.imwrite("output_depth_image2.jpg", depth_image)

import torch
import torch.nn.functional as F
import numpy as np
import cv2
from PIL import Image
from config import SAPIENS_LITE_MODELS_PATH

def load_model(task, version):
    try:
        model_path = SAPIENS_LITE_MODELS_PATH[task][version]
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = torch.jit.load(model_path)
        model.eval()
        model.to(device)
        return model, device
    except KeyError as e:
        print(f"Error: Tarea o versión inválida. {e}")
        return None, None

def preprocess_image(image, input_shape):
    img = cv2.resize(image, (input_shape[2], input_shape[1]), interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
    img = torch.from_numpy(img)
    img = img[[2, 1, 0], ...].float()
    mean = torch.tensor([123.5, 116.5, 103.5]).view(-1, 1, 1)
    std = torch.tensor([58.5, 57.0, 57.5]).view(-1, 1, 1)
    img = (img - mean) / std
    return img.unsqueeze(0)

def post_process_depth(result, original_shape):
    if result.dim() == 3:
        result = result.unsqueeze(0)
    elif result.dim() == 4:
        pass
    else:
        raise ValueError(f"Unexpected result dimension: {result.dim()}")

    seg_logits = F.interpolate(result, size=original_shape, mode="bilinear", align_corners=False).squeeze(0)
    depth_map = seg_logits.data.float().cpu().numpy()

    if depth_map.ndim == 3 and depth_map.shape[0] == 1:
        depth_map = depth_map.squeeze(0)

    return depth_map

def visualize_depth(depth_map):
    min_val, max_val = np.nanmin(depth_map), np.nanmax(depth_map)
    depth_normalized = 1 - ((depth_map - min_val) / (max_val - min_val))
    depth_normalized = (depth_normalized * 255).astype(np.uint8)
    depth_colored = cv2.applyColorMap(depth_normalized, cv2.COLORMAP_INFERNO)
    return depth_colored

def calculate_surface_normal(depth_map):
    kernel_size = 7
    grad_x = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 1, 0, ksize=kernel_size)
    grad_y = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 0, 1, ksize=kernel_size)
    z = np.full(grad_x.shape, -1)
    normals = np.dstack((-grad_x, -grad_y, z))

    normals_mag = np.linalg.norm(normals, axis=2, keepdims=True)
    with np.errstate(divide="ignore", invalid="ignore"):
        normals_normalized = normals / (normals_mag + 1e-5)

    normals_normalized = np.nan_to_num(normals_normalized, nan=-1, posinf=-1, neginf=-1)
    normal_from_depth = ((normals_normalized + 1) / 2 * 255).astype(np.uint8)
    normal_from_depth = normal_from_depth[:, :, ::-1]  # RGB to BGR for cv2

    return normal_from_depth

def process_image_or_video(input_data, task='depth', version='sapiens_0.3b'):
    model, device = load_model(task, version)
    if model is None or device is None:
        return None

    input_shape = (3, 1024, 768)

    def process_frame(frame):
        if isinstance(frame, Image.Image):
            frame = np.array(frame)

        if frame.shape[2] == 4:  # RGBA
            frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)

        img = preprocess_image(frame, input_shape)

        with torch.no_grad():
            result = model(img.to(device))

        depth_map = post_process_depth(result, (frame.shape[0], frame.shape[1]))
        depth_image = visualize_depth(depth_map)

        return Image.fromarray(cv2.cvtColor(depth_image, cv2.COLOR_BGR2RGB))

    if isinstance(input_data, np.ndarray):  # Video frame
        return process_frame(input_data)
    elif isinstance(input_data, Image.Image):  # Imagen
        return process_frame(input_data)
    else:
        print("Tipo de entrada no soportado. Por favor, proporcione una imagen PIL o un frame de video numpy.")
        return None
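A minimal usage sketch for the new module outside the Gradio app, assuming the repo's config maps SAPIENS_LITE_MODELS_PATH['depth']['sapiens_0.3b'] to a valid TorchScript checkpoint; the input path is hypothetical.

    # Hedged sketch, not part of the commit: calling the depth pipeline directly.
    from PIL import Image
    from inference.depth import process_image_or_video as process_depth

    img = Image.open("assets/example.jpg")  # hypothetical path; any RGB image works
    depth_vis = process_depth(img, task="depth", version="sapiens_0.3b")
    if depth_vis is not None:
        depth_vis.save("depth_output.png")  # INFERNO-colormapped depth visualization

Note that process_image_or_video returns only the colorized visualization; calculate_surface_normal is defined but never called on the live path, so the raw depth map is only available by calling the lower-level helpers (preprocess_image, post_process_depth) directly, as the commented-out get_depth example at the top of the file sketches.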