joselobenitezg committed
Commit 2e49a94 · 1 Parent(s): 607956f
Files changed (2):
  1. app.py +13 -2
  2. inference/depth.py +211 -0
app.py CHANGED
@@ -7,13 +7,13 @@ import spaces
 
 from inference.seg import process_image_or_video as process_seg
 from inference.pose import process_image_or_video as process_pose
+from inference.depth import process_image_or_video as process_depth
 from config import SAPIENS_LITE_MODELS_PATH
 
 def update_model_choices(task):
     model_choices = list(SAPIENS_LITE_MODELS_PATH[task.lower()].keys())
     return gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None)
 
-@spaces.GPU(duration=12)
 def process_image(input_image, task, version):
     if isinstance(input_image, np.ndarray):
         input_image = Image.fromarray(input_image)
@@ -22,6 +22,8 @@ def process_image(input_image, task, version):
         result = process_seg(input_image, task=task.lower(), version=version)
     elif task.lower() == 'pose':
         result = process_pose(input_image, task=task.lower(), version=version)
+    elif task.lower() == 'depth':
+        result = process_depth(input_image, task=task.lower(), version=version)
     else:
         result = None
         print(f"Unsupported task: {task}")
@@ -42,7 +44,16 @@ def process_video(input_video, task, version):
             break
 
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        processed_frame = process_image_or_video(frame_rgb, task=task.lower(), version=version)
+        if task.lower() == 'seg':
+            processed_frame = process_seg(frame_rgb, task=task.lower(), version=version)
+        elif task.lower() == 'pose':
+            processed_frame = process_pose(frame_rgb, task=task.lower(), version=version)
+        elif task.lower() == 'depth':
+            processed_frame = process_depth(frame_rgb, task=task.lower(), version=version)
+        else:
+            processed_frame = None
+            print(f"Unsupported task: {task}")
+            break
 
         if processed_frame is not None:
             processed_frame_bgr = cv2.cvtColor(np.array(processed_frame), cv2.COLOR_RGB2BGR)
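The seg/pose/depth if/elif chain now appears in both process_image and process_video. A shared dispatch table is one way to keep the two paths in sync; the sketch below is hypothetical (not part of this commit) and reuses the same three imports app.py already has.

# Hypothetical refactor (not in this commit): one dispatch table for both paths.
from inference.seg import process_image_or_video as process_seg
from inference.pose import process_image_or_video as process_pose
from inference.depth import process_image_or_video as process_depth

TASK_PROCESSORS = {
    'seg': process_seg,
    'pose': process_pose,
    'depth': process_depth,
}

def run_task(frame, task, version):
    # Returns None for unknown tasks, matching the current else branches.
    processor = TASK_PROCESSORS.get(task.lower())
    if processor is None:
        print(f"Unsupported task: {task}")
        return None
    return processor(frame, task=task.lower(), version=version)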
inference/depth.py CHANGED
@@ -0,0 +1,211 @@
+# # Example usage
+# import torch
+# import numpy as np
+# from PIL import Image
+# from torchvision import transforms
+# from config import LABELS_TO_IDS
+# from utils.vis_utils import visualize_mask_with_overlay
+
+# import torch
+# import torch.nn.functional as F
+# import numpy as np
+# import cv2
+
+# TASK = 'depth'
+# VERSION = 'sapiens_0.3b'
+
+# model_path = get_model_path(TASK, VERSION)
+# print(model_path)
+
+# model = torch.jit.load(model_path)
+# model.eval()
+# model.to("cuda")
+
+
+# def get_depth(image, depth_model, input_shape=(3, 1024, 768), device="cuda"):
+#     # Preprocess the image
+#     img = preprocess_image(image, input_shape)
+
+#     # Run the model
+#     with torch.no_grad():
+#         result = depth_model(img.to(device))
+
+#     # Post-process the output
+#     depth_map = post_process_depth(result, (image.shape[0], image.shape[1]))
+
+#     # Visualize the depth map
+#     depth_image = visualize_depth(depth_map)
+
+#     return depth_image, depth_map
+
+# def preprocess_image(image, input_shape):
+#     img = cv2.resize(image, (input_shape[2], input_shape[1]), interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
+#     img = torch.from_numpy(img)
+#     img = img[[2, 1, 0], ...].float()
+#     mean = torch.tensor([123.5, 116.5, 103.5]).view(-1, 1, 1)
+#     std = torch.tensor([58.5, 57.0, 57.5]).view(-1, 1, 1)
+#     img = (img - mean) / std
+#     return img.unsqueeze(0)
+
+# def post_process_depth(result, original_shape):
+#     # Check the dimensionality of the result
+#     if result.dim() == 3:
+#         result = result.unsqueeze(0)
+#     elif result.dim() == 4:
+#         pass
+#     else:
+#         raise ValueError(f"Unexpected result dimension: {result.dim()}")
+
+#     # Ensure we're interpolating to the correct dimensions
+#     seg_logits = F.interpolate(result, size=original_shape, mode="bilinear", align_corners=False).squeeze(0)
+#     depth_map = seg_logits.data.float().cpu().numpy()
+
+#     # If depth_map has an extra dimension, squeeze it
+#     if depth_map.ndim == 3 and depth_map.shape[0] == 1:
+#         depth_map = depth_map.squeeze(0)
+
+#     return depth_map
+
+# def visualize_depth(depth_map):
+#     # Normalize the depth map
+#     min_val, max_val = np.nanmin(depth_map), np.nanmax(depth_map)
+#     depth_normalized = 1 - ((depth_map - min_val) / (max_val - min_val))
+
+#     # Convert to uint8
+#     depth_normalized = (depth_normalized * 255).astype(np.uint8)
+
+#     # Apply colormap
+#     depth_colored = cv2.applyColorMap(depth_normalized, cv2.COLORMAP_INFERNO)
+
+#     return depth_colored
+
+# # You can add the surface normal calculation if needed
+# def calculate_surface_normal(depth_map):
+#     kernel_size = 7
+#     grad_x = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 1, 0, ksize=kernel_size)
+#     grad_y = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 0, 1, ksize=kernel_size)
+#     z = np.full(grad_x.shape, -1)
+#     normals = np.dstack((-grad_x, -grad_y, z))
+
+#     normals_mag = np.linalg.norm(normals, axis=2, keepdims=True)
+#     with np.errstate(divide="ignore", invalid="ignore"):
+#         normals_normalized = normals / (normals_mag + 1e-5)
+
+#     normals_normalized = np.nan_to_num(normals_normalized, nan=-1, posinf=-1, neginf=-1)
+#     normal_from_depth = ((normals_normalized + 1) / 2 * 255).astype(np.uint8)
+#     normal_from_depth = normal_from_depth[:, :, ::-1]  # RGB to BGR for cv2
+
+#     return normal_from_depth
+
+# from utils.vis_utils import resize_image
+
+# pil_image = Image.open('/home/user/app/assets/image.webp')
+
+# # Load and process an image
+# image = cv2.imread('/home/user/app/assets/frame.png')
+# depth_image, depth_map = get_depth(image, model)
+
+# surface_normal = calculate_surface_normal(depth_map)
+# cv2.imwrite("output_surface_normal.jpg", surface_normal)
+# # Save the results
+# output_im = cv2.imwrite("output_depth_image2.jpg", depth_image)
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+import cv2
+from PIL import Image
+from config import SAPIENS_LITE_MODELS_PATH
+
+def load_model(task, version):
+    try:
+        model_path = SAPIENS_LITE_MODELS_PATH[task][version]
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = torch.jit.load(model_path)
+        model.eval()
+        model.to(device)
+        return model, device
+    except KeyError as e:
+        print(f"Error: invalid task or version. {e}")
+        return None, None
+
+def preprocess_image(image, input_shape):
+    img = cv2.resize(image, (input_shape[2], input_shape[1]), interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1)
+    img = torch.from_numpy(img)
+    img = img[[2, 1, 0], ...].float()
+    mean = torch.tensor([123.5, 116.5, 103.5]).view(-1, 1, 1)
+    std = torch.tensor([58.5, 57.0, 57.5]).view(-1, 1, 1)
+    img = (img - mean) / std
+    return img.unsqueeze(0)
+
+def post_process_depth(result, original_shape):
+    if result.dim() == 3:
+        result = result.unsqueeze(0)
+    elif result.dim() == 4:
+        pass
+    else:
+        raise ValueError(f"Unexpected result dimension: {result.dim()}")
+
+    seg_logits = F.interpolate(result, size=original_shape, mode="bilinear", align_corners=False).squeeze(0)
+    depth_map = seg_logits.data.float().cpu().numpy()
+
+    if depth_map.ndim == 3 and depth_map.shape[0] == 1:
+        depth_map = depth_map.squeeze(0)
+
+    return depth_map
+
+def visualize_depth(depth_map):
+    min_val, max_val = np.nanmin(depth_map), np.nanmax(depth_map)
+    depth_normalized = 1 - ((depth_map - min_val) / (max_val - min_val))
+    depth_normalized = (depth_normalized * 255).astype(np.uint8)
+    depth_colored = cv2.applyColorMap(depth_normalized, cv2.COLORMAP_INFERNO)
+    return depth_colored
+
+def calculate_surface_normal(depth_map):
+    kernel_size = 7
+    grad_x = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 1, 0, ksize=kernel_size)
+    grad_y = cv2.Sobel(depth_map.astype(np.float32), cv2.CV_32F, 0, 1, ksize=kernel_size)
+    z = np.full(grad_x.shape, -1)
+    normals = np.dstack((-grad_x, -grad_y, z))
+
+    normals_mag = np.linalg.norm(normals, axis=2, keepdims=True)
+    with np.errstate(divide="ignore", invalid="ignore"):
+        normals_normalized = normals / (normals_mag + 1e-5)
+
+    normals_normalized = np.nan_to_num(normals_normalized, nan=-1, posinf=-1, neginf=-1)
+    normal_from_depth = ((normals_normalized + 1) / 2 * 255).astype(np.uint8)
+    normal_from_depth = normal_from_depth[:, :, ::-1]  # RGB to BGR for cv2
+
+    return normal_from_depth
+
+def process_image_or_video(input_data, task='depth', version='sapiens_0.3b'):
+    model, device = load_model(task, version)
+    if model is None or device is None:
+        return None
+
+    input_shape = (3, 1024, 768)
+
+    def process_frame(frame):
+        if isinstance(frame, Image.Image):
+            frame = np.array(frame)
+
+        if frame.shape[2] == 4:  # RGBA
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
+
+        img = preprocess_image(frame, input_shape)
+
+        with torch.no_grad():
+            result = model(img.to(device))
+
+        depth_map = post_process_depth(result, (frame.shape[0], frame.shape[1]))
+        depth_image = visualize_depth(depth_map)
+
+        return Image.fromarray(cv2.cvtColor(depth_image, cv2.COLOR_BGR2RGB))
+
+    if isinstance(input_data, np.ndarray):  # video frame
+        return process_frame(input_data)
+    elif isinstance(input_data, Image.Image):  # PIL image
+        return process_frame(input_data)
+    else:
+        print("Unsupported input type. Please provide a PIL image or a numpy video frame.")
+        return None
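For reference, a minimal usage sketch of the new module. Paths and output file names are illustrative, and it assumes the checkpoint referenced by SAPIENS_LITE_MODELS_PATH['depth']['sapiens_0.3b'] is available locally.

# Minimal usage sketch (illustrative paths).
import cv2
from PIL import Image

from inference.depth import process_image_or_video

# PIL image in -> PIL image out (inferno-colormapped, inverted normalized depth).
image = Image.open('assets/image.webp').convert('RGB')
depth_vis = process_image_or_video(image, task='depth', version='sapiens_0.3b')
if depth_vis is not None:
    depth_vis.save('output_depth.png')

# A numpy frame also works; convert BGR -> RGB first, as app.py's video loop does.
frame_rgb = cv2.cvtColor(cv2.imread('assets/frame.png'), cv2.COLOR_BGR2RGB)
depth_vis = process_image_or_video(frame_rgb, task='depth', version='sapiens_0.3b')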
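calculate_surface_normal is defined but never called by process_image_or_video, which returns only the colorized depth image. To reach the raw depth map and normals, the module's helpers can be chained directly, mirroring the commented-out example at the top of the file (a sketch; paths are illustrative):

# Sketch: raw depth map + surface normals via the module's own helpers.
import cv2
import torch

from inference.depth import (
    load_model,
    preprocess_image,
    post_process_depth,
    calculate_surface_normal,
)

model, device = load_model('depth', 'sapiens_0.3b')
frame = cv2.imread('assets/frame.png')  # BGR, as in the commented example

with torch.no_grad():
    result = model(preprocess_image(frame, (3, 1024, 768)).to(device))

depth_map = post_process_depth(result, (frame.shape[0], frame.shape[1]))
cv2.imwrite('output_surface_normal.jpg', calculate_surface_normal(depth_map))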