Zhenyu Li committed on
Commit
51946d3
·
1 Parent(s): d2c69d3
Files changed (4) hide show
  1. app.py +42 -5
  2. examples/1_depth.png +2 -2
  3. examples/2_depth.png +2 -2
  4. examples/4_depth.png +2 -2
app.py CHANGED
@@ -23,6 +23,7 @@
23
  # File author: Zhenyu Li
24
 
25
  import gc
 
26
  from ControlNet.share import *
27
  import einops
28
  import torch
@@ -48,6 +49,9 @@ import torch.nn.functional as F
48
  from huggingface_hub import hf_hub_download
49
  import matplotlib
50
 
 
 
 
51
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
52
 
53
  def depth_load_state_dict(model, state_dict):
@@ -107,6 +111,29 @@ model.load_state_dict(load_state_dict(controlnet_ckp, location=DEVICE), strict=F
107
  model = model.to(DEVICE)
108
  ddim_sampler = DDIMSampler(model)
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def colorize_depth_maps(depth_map, min_depth=0, max_depth=0, cmap='Spectral_r', valid_mask=None):
111
  """
112
  Colorize depth maps.
@@ -176,13 +203,18 @@ def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resoluti
176
 
177
  depth_model.to(DEVICE)
178
  detected_map = predict_depth(depth_model, input_image, mode, patch_number, resolution, patch_size, device=DEVICE)
 
 
 
 
179
 
180
  depth_model.cpu() # free some mem
181
  gc.collect()
182
  torch.cuda.empty_cache()
183
 
 
 
184
  detected_map = F.interpolate(torch.from_numpy(detected_map).unsqueeze(dim=0).unsqueeze(dim=0), (image_resolution, image_resolution), mode='bicubic', align_corners=True).squeeze().numpy()
185
- colored_depth = colorize_depth_maps(detected_map) * 255
186
 
187
  H, W = detected_map.shape
188
  detected_map_temp = ((1 - detected_map / (np.max(detected_map + 1e-3))) * 255)
@@ -235,6 +267,7 @@ def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resoluti
235
  # t_r = F.interpolate(t_r, (h, w), mode='bicubic', align_corners=True).squeeze().permute(1, 2, 0).numpy().astype(np.uint8)
236
  t_r = t_r.squeeze().permute(1, 2, 0).numpy().astype(np.uint8)
237
  update_return_list.append(t_r)
 
238
 
239
  return update_return_list
240
 
@@ -249,9 +282,9 @@ Please refer to our [project webpage](https://zhyever.github.io/patchfusion), [p
249
 
250
  The overall pipeline: image --> (PatchFusion) --> depth --> (controlnet) --> generated image.
251
 
252
- As for the PatchFusion, it works on default 4k (2160x3840) resolution. All input images will be resized to 4k before passing through PatchFusion as default. It means if you have a higher resolution image, you might want to increase the processing resolution in the advanced option (You would also change the patch size to 1/4 image resolution). Because of the tiling strategy, our PatchFusion would not use more memory or time for even higher resolution inputs if properly setting parameters.
253
 
254
- For ControlNet, it works on default 896x896 resolution. Again, all input images will be resized to 896x896 before passing through ControlNet as default. You might be not happy because the 4K->896x896 downsampling, but limited by the GPU resource, this demo could only achieve this. This is the memory bottleneck. The output is not resized back to the image resolution for fast inference (Well... It's still so slow now... :D)
255
 
256
  We provide some tips might be helpful: (1) Try our experimental demo (see our project website) running on a local 80G gpu (you could try high-resolution generation there, like the one in our paper). But of course, it would be expired soon (in two days maybe); (2) Clone our code repo, and look for a gpu with more than 24G memory; (3) Clone our code repo, run the depth estimation (there are another demos for depth estimation and image-to-3D), and search for another guided high-resolution image generation strategy; (4) Some kind people give this space a stronger gpu support.
257
  """
@@ -269,9 +302,13 @@ with gr.Blocks() as demo:
269
  prompt = gr.Textbox(label="Prompt (input your description)", value='A cozy cottage in an oil painting, with rich textures and vibrant green foliage')
270
  run_button = gr.Button("Run")
271
 
272
- depth_image = gr.Image(label="Depth Map", elem_id='img-display-output')
273
  generated_image = gr.Image(label="Generated Map", elem_id='img-display-output')
274
 
 
 
 
 
 
275
  with gr.Row():
276
  with gr.Accordion("Advanced options", open=False):
277
  # mode = gr.Radio(["P49", "R"], label="Tiling mode", info="We recommand using P49 for fast evaluation and R with 1024 patches for best visualization results, respectively", elem_id='mode', value='R'),
@@ -293,7 +330,7 @@ with gr.Blocks() as demo:
293
  n_prompt = gr.Textbox(label="Negative prompt", value='worst quality, low quality, lose details')
294
 
295
  ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, mode[0], patch_number, resolution, patch_size]
296
- run_button.click(fn=process, inputs=ips, outputs=[depth_image, generated_image])
297
  examples = gr.Examples(
298
  inputs=[input_image, depth_image, generated_image],
299
  outputs=[input_image, depth_image, generated_image],
 
23
  # File author: Zhenyu Li
24
 
25
  import gc
26
+ import copy
27
  from ControlNet.share import *
28
  import einops
29
  import torch
 
49
  from huggingface_hub import hf_hub_download
50
  import matplotlib
51
 
52
+ from PIL import Image
53
+ import tempfile
54
+
55
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
56
 
57
  def depth_load_state_dict(model, state_dict):
 
111
  model = model.to(DEVICE)
112
  ddim_sampler = DDIMSampler(model)
113
 
114
+ def colorize(value, cmap='magma_r', vmin=None, vmax=None):
115
+
116
+ percentile = 0.03
117
+ vmin = np.percentile(value, percentile)
118
+ vmax = np.percentile(value, 100 - percentile)
119
+
120
+ if vmin != vmax:
121
+ value = (value - vmin) / (vmax - vmin) # vmin..vmax
122
+ else:
123
+ value = value * 0.
124
+
125
+ cmapper = matplotlib.cm.get_cmap(cmap)
126
+ value = cmapper(value, bytes=True) # ((1)xhxwx4)
127
+
128
+ value = value[:, :, :3] # drop the alpha channel (RGBA -> RGB)
129
+ # rgb_value = value[..., ::-1]
130
+ rgb_value = value
131
+
132
+ rgb_value = np.transpose(rgb_value, (2, 0, 1))
133
+ rgb_value = rgb_value[np.newaxis, ...]
134
+
135
+ return rgb_value
136
+
137
  def colorize_depth_maps(depth_map, min_depth=0, max_depth=0, cmap='Spectral_r', valid_mask=None):
138
  """
139
  Colorize depth maps.
 
203
 
204
  depth_model.to(DEVICE)
205
  detected_map = predict_depth(depth_model, input_image, mode, patch_number, resolution, patch_size, device=DEVICE)
206
+ detected_map_save = copy.deepcopy(detected_map)
207
+ tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
208
+ detected_map_save = Image.fromarray((detected_map_save*256).astype('uint16'))
209
+ detected_map_save.save(tmp.name)
210
 
211
  depth_model.cpu() # free some mem
212
  gc.collect()
213
  torch.cuda.empty_cache()
214
 
215
+ # colored_depth = colorize_depth_maps(detected_map) * 255
216
+ colored_depth = colorize(detected_map)
217
  detected_map = F.interpolate(torch.from_numpy(detected_map).unsqueeze(dim=0).unsqueeze(dim=0), (image_resolution, image_resolution), mode='bicubic', align_corners=True).squeeze().numpy()
 
218
 
219
  H, W = detected_map.shape
220
  detected_map_temp = ((1 - detected_map / (np.max(detected_map + 1e-3))) * 255)
 
267
  # t_r = F.interpolate(t_r, (h, w), mode='bicubic', align_corners=True).squeeze().permute(1, 2, 0).numpy().astype(np.uint8)
268
  t_r = t_r.squeeze().permute(1, 2, 0).numpy().astype(np.uint8)
269
  update_return_list.append(t_r)
270
+ update_return_list.append(tmp.name)
271
 
272
  return update_return_list
273
 
 
282
 
283
  The overall pipeline: image --> (PatchFusion) --> depth --> (controlnet) --> generated image.
284
 
285
+ As for the PatchFusion, it works on default 4k (2160x3840) resolution. All input images will be resized to 4k before passing through PatchFusion as default. It means if you have a higher resolution image, you might want to increase the processing resolution in the advanced option (You would also change the patch size to 1/4 image resolution). Because of the tiling strategy, our PatchFusion would not use more memory or time for even higher resolution inputs if properly setting parameters. The output depth map is resized to the original image resolution. Download for better visualization quality. 16-Bit Raw Depth = (pred_depth * 256).to(uint16).
286
 
287
+ For ControlNet, it works on default 896x896 resolution. Again, all input images will be resized to 896x896 before passing through ControlNet as default. You might be not happy because the 4K->896x896 downsampling, but limited by the GPU resource, this demo could only achieve this. This is the memory bottleneck. The output is not resized back to the image resolution for fast inference (Well... It's still so slow now... :D).
288
 
289
  We provide some tips might be helpful: (1) Try our experimental demo (see our project website) running on a local 80G gpu (you could try high-resolution generation there, like the one in our paper). But of course, it would be expired soon (in two days maybe); (2) Clone our code repo, and look for a gpu with more than 24G memory; (3) Clone our code repo, run the depth estimation (there are another demos for depth estimation and image-to-3D), and search for another guided high-resolution image generation strategy; (4) Some kind people give this space a stronger gpu support.
290
  """
 
302
  prompt = gr.Textbox(label="Prompt (input your description)", value='A cozy cottage in an oil painting, with rich textures and vibrant green foliage')
303
  run_button = gr.Button("Run")
304
 
 
305
  generated_image = gr.Image(label="Generated Map", elem_id='img-display-output')
306
 
307
+ with gr.Row():
308
+ depth_image = gr.Image(label="Depth Map", elem_id='img-display-output')
309
+ with gr.Row():
310
+ raw_file = gr.File(label="16-Bit Raw Depth, Multiplier:256")
311
+
312
  with gr.Row():
313
  with gr.Accordion("Advanced options", open=False):
314
  # mode = gr.Radio(["P49", "R"], label="Tiling mode", info="We recommand using P49 for fast evaluation and R with 1024 patches for best visualization results, respectively", elem_id='mode', value='R'),
 
330
  n_prompt = gr.Textbox(label="Negative prompt", value='worst quality, low quality, lose details')
331
 
332
  ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, mode[0], patch_number, resolution, patch_size]
333
+ run_button.click(fn=process, inputs=ips, outputs=[depth_image, generated_image, raw_file])
334
  examples = gr.Examples(
335
  inputs=[input_image, depth_image, generated_image],
336
  outputs=[input_image, depth_image, generated_image],
examples/1_depth.png CHANGED

Git LFS Details

  • SHA256: 7c794f57437e00843fa1ed2a6bfeed3de3912acc0d8cf83c87a51c064634dc8e
  • Pointer size: 131 Bytes
  • Size of remote file: 645 kB

Git LFS Details

  • SHA256: 754674d1092256c97e46cb031176885f3c18004231775727e968c063fe3dfd06
  • Pointer size: 132 Bytes
  • Size of remote file: 8.96 MB
examples/2_depth.png CHANGED

Git LFS Details

  • SHA256: cd9f7082899b33802313c7f3cffea00759381019d4962c6ba510444ced29d774
  • Pointer size: 131 Bytes
  • Size of remote file: 402 kB

Git LFS Details

  • SHA256: 5ce9252ecd51fda75cad472c5a5929ae8aacfa177545d045157fcdec0ee07c14
  • Pointer size: 132 Bytes
  • Size of remote file: 1.11 MB
examples/4_depth.png CHANGED

Git LFS Details

  • SHA256: 9e730e0d3efc53cce2e512e493855486f6b7e1b0b5bdf214cedb868b6ac9b68f
  • Pointer size: 131 Bytes
  • Size of remote file: 276 kB

Git LFS Details

  • SHA256: 0242a8ed7444750753c9b674ec9e5e2f33fd1af0c37e0378083b92186e00d464
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB