marigold-lcm

Running

toshas committed on Mar 30, 2024

Commit

4e4c4c8

1 Parent(s): 7d6afa0

fix ui not updating outputs when changing parameters

increase video processing pipeline duration
fix markdown center alignment
add reference to the original marigold demo
simplify the bas-relief updates via ux
point to the prs-eth org model checkpoints
fix reproducibility with seeding
add license headers

Files changed (5) hide show

README.md +7 -10
app.py +47 -44
extrude.py +20 -0
marigold_depth_estimation_lcm.py +10 -4
requirements.txt +5 -4

README.md CHANGED Viewed

@@ -4,26 +4,23 @@ emoji: 🏵️
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 4.23.0
 app_file: app.py
 pinned: true
 license: cc-by-sa-4.0
 models:
-- prs-eth/marigold-v1-0
 - prs-eth/marigold-lcm-v1-0
 ---
 This is a demo of Marigold-LCM, the state-of-the-art depth estimator for images in the wild.
 It combines the power of the original Marigold 10-step estimator and the Latent Consistency Models, delivering high-quality results in as little as one step.
-Find out more in our paper titled ["Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation"](https://arxiv.org/abs/2312.02145)
 ```
-@misc{ke2023repurposing,
-      title={Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation},
-      author={Bingxin Ke and Anton Obukhov and Shengyu Huang and Nando Metzger and Rodrigo Caye Daudt and Konrad Schindler},
-      year={2023},
-      eprint={2312.02145},
-      archivePrefix={arXiv},
-      primaryClass={cs.CV}
 }
 ```

 colorFrom: blue
 colorTo: red
 sdk: gradio
+sdk_version: 4.21.0
 app_file: app.py
 pinned: true
 license: cc-by-sa-4.0
 models:
 - prs-eth/marigold-lcm-v1-0
 ---
 This is a demo of Marigold-LCM, the state-of-the-art depth estimator for images in the wild.
 It combines the power of the original Marigold 10-step estimator and the Latent Consistency Models, delivering high-quality results in as little as one step.
+Find out more in our CVPR 2024 paper titled ["Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation"](https://arxiv.org/abs/2312.02145).
 ```
+@InProceedings{ke2023repurposing,
+  title={Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation},
+  author={Bingxin Ke and Anton Obukhov and Shengyu Huang and Nando Metzger and Rodrigo Caye Daudt and Konrad Schindler},
+  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year={2024}
 }
 ```

app.py CHANGED Viewed

@@ -1,6 +1,26 @@
 import functools
 import os
-import shutil
 import zipfile
 from io import BytesIO
@@ -10,7 +30,6 @@ import imageio as imageio
 import numpy as np
 import torch as torch
 from PIL import Image
-from diffusers import UNet2DConditionModel, LCMScheduler
 from gradio_imageslider import ImageSlider
 from huggingface_hub import login
 from tqdm import tqdm
@@ -52,7 +71,6 @@ def process_image(
     denoise_steps=default_image_denoise_steps,
     ensemble_size=default_image_ensemble_size,
     processing_res=default_image_processing_res,
-    reproducible=default_image_reproducuble,
 ):
     input_image = Image.open(path_input)
@@ -62,7 +80,7 @@ def process_image(
         ensemble_size=ensemble_size,
         processing_res=processing_res,
         batch_size=1 if processing_res == 0 else 0,
-        seed=default_seed if reproducible else None,
         show_progress_bar=False,
     )
@@ -70,8 +88,7 @@ def process_image(
     depth_colored = pipe_out.depth_colored
     depth_16bit = (depth_pred * 65535.0).astype(np.uint16)
-    path_output_dir = os.path.splitext(path_input)[0] + "_output"
-    os.makedirs(path_output_dir, exist_ok=True)
     name_base = os.path.splitext(os.path.basename(path_input))[0]
     path_out_fp32 = os.path.join(path_output_dir, f"{name_base}_depth_fp32.npy")
@@ -99,8 +116,7 @@ def process_video(
     out_max_frames=default_video_out_max_frames,
     progress=gr.Progress(),
 ):
-    path_output_dir = os.path.splitext(path_input)[0] + "_output"
-    os.makedirs(path_output_dir, exist_ok=True)
     name_base = os.path.splitext(os.path.basename(path_input))[0]
     path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.mp4")
@@ -152,6 +168,7 @@ def process_video(
             batch_size=0,
             depth_latent_init=prev_depth_latent,
             depth_latent_init_strength=depth_latent_init_strength,
             seed=default_seed,
             show_progress_bar=False,
         )
@@ -204,8 +221,7 @@ def process_bas(
     if plane_near >= plane_far:
         raise gr.Error("NEAR plane must have a value smaller than the FAR plane")
-    path_output_dir = os.path.splitext(path_input)[0] + "_output"
-    os.makedirs(path_output_dir, exist_ok=True)
     name_base, name_ext = os.path.splitext(os.path.basename(path_input))
@@ -280,7 +296,7 @@ def process_bas(
 def run_demo_server(pipe):
     process_pipe_image = spaces.GPU(functools.partial(process_image, pipe))
-    process_pipe_video = spaces.GPU(functools.partial(process_video, pipe))
     process_pipe_bas = spaces.GPU(functools.partial(process_bas, pipe))
     os.environ["GRADIO_ALLOW_FLAGGING"] = "never"
@@ -304,6 +320,18 @@ def run_demo_server(pipe):
                 font-size: 20px !important;
                 color: crimson !important;
             }
         """,
         head="""
             <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
@@ -317,7 +345,7 @@ def run_demo_server(pipe):
     ) as demo:
         gr.Markdown(
             """
-            <h1 align="center">Marigold-LCM Depth Estimation</h1>
             <p align="center">
             <a title="Website" href="https://marigoldmonodepth.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
                 <img src="https://www.obukhov.ai/img/badges/badge-website.svg">
@@ -336,9 +364,10 @@ def run_demo_server(pipe):
                 Marigold-LCM is the fast version of Marigold, the state-of-the-art depth estimator for images in the wild.
                 It combines the power of the original Marigold 10-step estimator and the Latent Consistency Models, delivering high-quality results in as little as <b>one step</b>.
                 We provide three functions in this demo: Image, Video, and Bas-relief 3D processing — <b>see the tabs below</b>.
-                Upload your content into the <b>left</b> side, or click any of the <b>examples</b> below.
-                Wait a second (for images and 3D) or a minute (for videos), and interact with the result in the <b>right</b> side.
                 To avoid queuing, fork the demo into your profile.
             </p>
         """
         )
@@ -474,8 +503,6 @@ def run_demo_server(pipe):
                     <p align="justify">
                         This part of the demo uses Marigold-LCM to create a bas-relief model.
                         The models are watertight, with correct normals, and exported in the STL format, which makes them <b>3D-printable</b>.
-                        Start by uploading the image and click "Create" with the default parameters.
-                        To improve the result, click "Clear", adjust the geometry sliders below, and click "Create" again.
                     </p>
                     """,
                 )
@@ -487,7 +514,6 @@ def run_demo_server(pipe):
                         )
                         with gr.Row():
                             bas_submit_btn = gr.Button(value="Create 3D", variant="primary")
-                            bas_clear_btn = gr.Button(value="Clear")
                             bas_reset_btn = gr.Button(value="Reset")
                         with gr.Accordion("3D printing demo: Main options", open=True):
                             bas_plane_near = gr.Slider(
@@ -703,13 +729,8 @@ def run_demo_server(pipe):
             concurrency_limit=1,
         )
-        def wrapper_process_pipe_bas(*args, **kwargs):
-            out = list(process_pipe_bas(*args, **kwargs))
-            out = [gr.Button(interactive=False), gr.Image(interactive=False)] + out
-            return out
         bas_submit_btn.click(
-            fn=wrapper_process_pipe_bas,
             inputs=[
                 bas_input,
                 bas_plane_near,
@@ -725,18 +746,7 @@ def run_demo_server(pipe):
                 bas_frame_near,
                 bas_frame_far,
             ],
-            outputs=[bas_submit_btn, bas_input, bas_output_viewer, bas_output_files],
-            concurrency_limit=1,
-        )
-        bas_clear_btn.click(
-            fn=lambda: (gr.Button(interactive=True), None, None),
-            inputs=[],
-            outputs=[
-                bas_submit_btn,
-                bas_output_viewer,
-                bas_output_files,
-            ],
             concurrency_limit=1,
         )
@@ -790,21 +800,14 @@ def run_demo_server(pipe):
 def main():
-    CHECKPOINT = "prs-eth/marigold-v1-0"
-    CHECKPOINT_UNET_LCM = "prs-eth/marigold-lcm-v1-0"
     if "HF_TOKEN_LOGIN" in os.environ:
         login(token=os.environ["HF_TOKEN_LOGIN"])
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    pipe = MarigoldDepthConsistencyPipeline.from_pretrained(
-        CHECKPOINT,
-        unet=UNet2DConditionModel.from_pretrained(
-            CHECKPOINT_UNET_LCM, subfolder="unet", use_auth_token=True
-        ),
-    )
-    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
     try:
         import xformers

+# Copyright 2024 Anton Obukhov, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+# More information about the method can be found at https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
 import functools
 import os
+import tempfile
 import zipfile
 from io import BytesIO
 import numpy as np
 import torch as torch
 from PIL import Image
 from gradio_imageslider import ImageSlider
 from huggingface_hub import login
 from tqdm import tqdm
     denoise_steps=default_image_denoise_steps,
     ensemble_size=default_image_ensemble_size,
     processing_res=default_image_processing_res,
 ):
     input_image = Image.open(path_input)
         ensemble_size=ensemble_size,
         processing_res=processing_res,
         batch_size=1 if processing_res == 0 else 0,
+        seed=default_seed,
         show_progress_bar=False,
     )
     depth_colored = pipe_out.depth_colored
     depth_16bit = (depth_pred * 65535.0).astype(np.uint16)
+    path_output_dir = tempfile.mkdtemp()
     name_base = os.path.splitext(os.path.basename(path_input))[0]
     path_out_fp32 = os.path.join(path_output_dir, f"{name_base}_depth_fp32.npy")
     out_max_frames=default_video_out_max_frames,
     progress=gr.Progress(),
 ):
+    path_output_dir = tempfile.mkdtemp()
     name_base = os.path.splitext(os.path.basename(path_input))[0]
     path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.mp4")
             batch_size=0,
             depth_latent_init=prev_depth_latent,
             depth_latent_init_strength=depth_latent_init_strength,
+            return_depth_latent=True,
             seed=default_seed,
             show_progress_bar=False,
         )
     if plane_near >= plane_far:
         raise gr.Error("NEAR plane must have a value smaller than the FAR plane")
+    path_output_dir = tempfile.mkdtemp()
     name_base, name_ext = os.path.splitext(os.path.basename(path_input))
 def run_demo_server(pipe):
     process_pipe_image = spaces.GPU(functools.partial(process_image, pipe))
+    process_pipe_video = spaces.GPU(functools.partial(process_video, pipe), duration=120)
     process_pipe_bas = spaces.GPU(functools.partial(process_bas, pipe))
     os.environ["GRADIO_ALLOW_FLAGGING"] = "never"
                 font-size: 20px !important;
                 color: crimson !important;
             }
+            h1 {
+                text-align: center;
+                display: block;
+            }
+            h2 {
+                text-align: center;
+                display: block;
+            }
+            h3 {
+                text-align: center;
+                display: block;
+            }
         """,
         head="""
             <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
     ) as demo:
         gr.Markdown(
             """
+            # Marigold-LCM Depth Estimation
             <p align="center">
             <a title="Website" href="https://marigoldmonodepth.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
                 <img src="https://www.obukhov.ai/img/badges/badge-website.svg">
                 Marigold-LCM is the fast version of Marigold, the state-of-the-art depth estimator for images in the wild.
                 It combines the power of the original Marigold 10-step estimator and the Latent Consistency Models, delivering high-quality results in as little as <b>one step</b>.
                 We provide three functions in this demo: Image, Video, and Bas-relief 3D processing — <b>see the tabs below</b>.
+                Upload your content into the <b>first</b> pane, or click any of the <b>examples</b> below.
+                Wait a second (for images and 3D) or a minute (for videos), and interact with the result in the <b>second</b> pane.
                 To avoid queuing, fork the demo into your profile.
+                <a href="https://huggingface.co/spaces/prs-eth/marigold">The original Marigold demo is also available</a>.
             </p>
         """
         )
                     <p align="justify">
                         This part of the demo uses Marigold-LCM to create a bas-relief model.
                         The models are watertight, with correct normals, and exported in the STL format, which makes them <b>3D-printable</b>.
                     </p>
                     """,
                 )
                         )
                         with gr.Row():
                             bas_submit_btn = gr.Button(value="Create 3D", variant="primary")
                             bas_reset_btn = gr.Button(value="Reset")
                         with gr.Accordion("3D printing demo: Main options", open=True):
                             bas_plane_near = gr.Slider(
             concurrency_limit=1,
         )
         bas_submit_btn.click(
+            fn=process_pipe_bas,
             inputs=[
                 bas_input,
                 bas_plane_near,
                 bas_frame_near,
                 bas_frame_far,
             ],
+            outputs=[bas_output_viewer, bas_output_files],
             concurrency_limit=1,
         )
 def main():
+    CHECKPOINT = "prs-eth/marigold-lcm-v1-0"
     if "HF_TOKEN_LOGIN" in os.environ:
         login(token=os.environ["HF_TOKEN_LOGIN"])
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    pipe = MarigoldDepthConsistencyPipeline.from_pretrained(CHECKPOINT)
     try:
         import xformers

extrude.py CHANGED Viewed

@@ -1,3 +1,23 @@
 import math
 import os

+# Copyright 2024 Anton Obukhov, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+# More information about the method can be found at https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
 import math
 import os

marigold_depth_estimation_lcm.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2024 Anton Obukhov, Bingxin Ke, ETH Zurich and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -119,6 +119,7 @@ class MarigoldDepthConsistencyPipeline(DiffusionPipeline):
         batch_size: int = 0,
         depth_latent_init: torch.Tensor = None,
         depth_latent_init_strength: float = 0.1,
         seed: int = None,
         color_map: str = "Spectral",
         show_progress_bar: bool = True,
@@ -147,6 +148,8 @@ class MarigoldDepthConsistencyPipeline(DiffusionPipeline):
                 Initial depth map latent for better temporal consistency.
             depth_latent_init_strength (`float`, *optional*, defaults to `0.1`)
                 Degree of initial depth latent influence, must be between 0 and 1.
             seed (`int`, *optional*, defaults to `None`)
                 Reproducibility seed.
             show_progress_bar (`bool`, *optional*, defaults to `True`):
@@ -247,8 +250,11 @@ class MarigoldDepthConsistencyPipeline(DiffusionPipeline):
         min_d = torch.min(depth_pred)
         max_d = torch.max(depth_pred)
         depth_pred = (depth_pred - min_d) / (max_d - min_d)
-        if ensemble_size > 1:
-            depth_latent = self._encode_depth(2 * depth_pred - 1)
         # Convert to numpy
         depth_pred = depth_pred.cpu().numpy().astype(np.float32)
@@ -385,7 +391,7 @@ class MarigoldDepthConsistencyPipeline(DiffusionPipeline):
             ).sample  # [B, 4, h, w]
             # compute the previous noisy sample x_t -> x_t-1
-            depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
         depth = self._decode_depth(depth_latent)

+# Copyright 2024 Bingxin Ke, Anton Obukhov, ETH Zurich and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
         batch_size: int = 0,
         depth_latent_init: torch.Tensor = None,
         depth_latent_init_strength: float = 0.1,
+        return_depth_latent: bool = False,
         seed: int = None,
         color_map: str = "Spectral",
         show_progress_bar: bool = True,
                 Initial depth map latent for better temporal consistency.
             depth_latent_init_strength (`float`, *optional*, defaults to `0.1`)
                 Degree of initial depth latent influence, must be between 0 and 1.
+            return_depth_latent (`bool`, *optional*, defaults to `False`)
+                Whether to return the depth latent.
             seed (`int`, *optional*, defaults to `None`)
                 Reproducibility seed.
             show_progress_bar (`bool`, *optional*, defaults to `True`):
         min_d = torch.min(depth_pred)
         max_d = torch.max(depth_pred)
         depth_pred = (depth_pred - min_d) / (max_d - min_d)
+        if return_depth_latent:
+            if ensemble_size > 1:
+                depth_latent = self._encode_depth(2 * depth_pred - 1)
+        else:
+            depth_latent = None
         # Convert to numpy
         depth_pred = depth_pred.cpu().numpy().astype(np.float32)
             ).sample  # [B, 4, h, w]
             # compute the previous noisy sample x_t -> x_t-1
+            depth_latent = self.scheduler.step(noise_pred, t, depth_latent, generator=rng).prev_sample
         depth = self._decode_depth(depth_latent)

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-gradio==4.23.0
 gradio-imageslider==0.0.16
 pygltflib==1.16.1
 trimesh==4.0.5
@@ -6,10 +6,11 @@ imageio
 imageio-ffmpeg
 Pillow
-accelerate==0.28.0
 diffusers==0.27.2
 matplotlib==3.8.2
 scipy==1.11.4
 torch==2.0.1
-transformers==4.39.1
-xformers==0.0.21

+gradio==4.21.0
 gradio-imageslider==0.0.16
 pygltflib==1.16.1
 trimesh==4.0.5
 imageio-ffmpeg
 Pillow
+spaces>=0.25.0
+accelerate>=0.22.0
 diffusers==0.27.2
 matplotlib==3.8.2
 scipy==1.11.4
 torch==2.0.1
+transformers>=4.32.1
+xformers>=0.0.21