Update hy3dgen/shapegen/pipelines.py
hy3dgen/shapegen/pipelines.py  CHANGED  (+169 -34)
@@ -34,11 +34,12 @@ import trimesh
 import yaml
 from PIL import Image
 from diffusers.utils.torch_utils import randn_tensor
+from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
 from tqdm import tqdm
 
 from .models.autoencoders import ShapeVAE
 from .models.autoencoders import SurfaceExtractors
-from .utils import logger, synchronize_timer
+from .utils import logger, synchronize_timer, smart_load_model
 
 
 def retrieve_timesteps(
@@ -137,6 +138,9 @@ def instantiate_from_config(config, **kwargs):
 
 
 class Hunyuan3DDiTPipeline:
+    model_cpu_offload_seq = "conditioner->model->vae"
+    _exclude_from_cpu_offload = []
+
     @classmethod
     @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
     def from_single_file(
@@ -217,34 +221,12 @@ class Hunyuan3DDiTPipeline:
             dtype=dtype,
             device=device,
         )
-
-
-
-
-
-
-        logger.info('Model path not exists, try to download from huggingface')
-        try:
-            import huggingface_hub
-            # download from huggingface
-            path = huggingface_hub.snapshot_download(repo_id=original_model_path)
-            model_path = os.path.join(path, subfolder)
-        except ImportError:
-            logger.warning(
-                "You need to install HuggingFace Hub to load models from the hub."
-            )
-            raise RuntimeError(f"Model path {model_path} not found")
-        except Exception as e:
-            raise e
-
-        if not os.path.exists(model_path):
-            raise FileNotFoundError(f"Model path {original_model_path} not found")
-
-        extension = 'ckpt' if not use_safetensors else 'safetensors'
-        variant = '' if variant is None else f'.{variant}'
-        ckpt_name = f'model{variant}.{extension}'
-        config_path = os.path.join(model_path, 'config.yaml')
-        ckpt_path = os.path.join(model_path, ckpt_name)
+        config_path, ckpt_path = smart_load_model(
+            model_path,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+            variant=variant
+        )
         return cls.from_single_file(
             ckpt_path,
             config_path,
@@ -278,17 +260,170 @@ class Hunyuan3DDiTPipeline:
         self.model = torch.compile(self.model)
         self.conditioner = torch.compile(self.conditioner)
 
+    def enable_flashvdm(
+        self,
+        enabled: bool = True,
+        adaptive_kv_selection=True,
+        topk_mode='mean',
+        mc_algo='dmc',
+        replace_vae=True,
+    ):
+        if enabled:
+            model_path = self.kwargs['from_pretrained_kwargs']['model_path']
+            turbo_vae_mapping = {
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
+            }
+            model_name = model_path.split('/')[-1]
+            if replace_vae and model_name in turbo_vae_mapping:
+                model_path, subfolder = turbo_vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(
+                    model_path, subfolder=subfolder,
+                    use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
+                    device=self.device,
+                )
+            self.vae.enable_flashvdm_decoder(
+                enabled=enabled,
+                adaptive_kv_selection=adaptive_kv_selection,
+                topk_mode=topk_mode,
+                mc_algo=mc_algo
+            )
+        else:
+            model_path = self.kwargs['from_pretrained_kwargs']['model_path']
+            vae_mapping = {
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
+            }
+            model_name = model_path.split('/')[-1]
+            if model_name in vae_mapping:
+                model_path, subfolder = vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
+            self.vae.enable_flashvdm_decoder(enabled=False)
+
     def to(self, device=None, dtype=None):
-        if device is not None:
-            self.device = torch.device(device)
-            self.vae.to(device)
-            self.model.to(device)
-            self.conditioner.to(device)
         if dtype is not None:
             self.dtype = dtype
             self.vae.to(dtype=dtype)
             self.model.to(dtype=dtype)
             self.conditioner.to(dtype=dtype)
+        if device is not None:
+            self.device = torch.device(device)
+            self.vae.to(device)
+            self.model.to(device)
+            self.conditioner.to(device)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
+        Accelerate's module hooks.
+        """
+        for name, model in self.components.items():
+            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
+                continue
+
+            if not hasattr(model, "_hf_hook"):
+                return self.device
+            for module in model.modules():
+                if (
+                    hasattr(module, "_hf_hook")
+                    and hasattr(module._hf_hook, "execution_device")
+                    and module._hf_hook.execution_device is not None
+                ):
+                    return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
+        """
+        if self.model_cpu_offload_seq is None:
+            raise ValueError(
+                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
+            )
+
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu")
+            device_mod = getattr(torch, self.device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
+
+        self._all_hooks = []
+        hook = None
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str, None)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded
+        # these models will stay on CPU until maybe_free_model_hooks is called
+        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Function that offloads all components, removes all model hooks that were added when using
+        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
+        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
+        functions correctly when applying enable_model_cpu_offload.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not be called, so silently do nothing
+            return
+
+        for hook in self._all_hooks:
+            # offload model and remove hook from model
+            hook.offload()
+            hook.remove()
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload()
 
     @synchronize_timer('Encode cond')
     def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance):
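
For context, a short usage sketch of the API added by this commit (not part of the diff). It assumes the pipeline is built via a `from_pretrained` classmethod taking a repo id such as 'tencent/Hunyuan3D-2' (the diff only shows that loader indirectly, through `smart_load_model` and `self.kwargs['from_pretrained_kwargs']`) and that `accelerate >= 0.17.0` is installed:

# Hypothetical usage sketch; names follow the diff above, not a documented API.
from hy3dgen.shapegen.pipelines import Hunyuan3DDiTPipeline

pipe = Hunyuan3DDiTPipeline.from_pretrained('tencent/Hunyuan3D-2')  # repo id as in the vae_mapping dicts

# enable_model_cpu_offload() walks model_cpu_offload_seq = "conditioner->model->vae" and wraps each
# sub-model with accelerate's cpu_offload_with_hook, so only the sub-model currently running sits on the GPU.
pipe.enable_model_cpu_offload()

# Optionally switch to the turbo VAE and FlashVDM decoding path added by enable_flashvdm().
pipe.enable_flashvdm(enabled=True)

# After a generation call, maybe_free_model_hooks() offloads all components and re-installs the hooks;
# per its docstring it is meant to run at the end of the pipeline's __call__ (no-op if offload is not enabled).
pipe.maybe_free_model_hooks()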