Update hy3dgen/shapegen/pipelines.py
hy3dgen/shapegen/pipelines.py  CHANGED  (+169 -34)
@@ -34,11 +34,12 @@ import trimesh
 import yaml
 from PIL import Image
 from diffusers.utils.torch_utils import randn_tensor
+from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
 from tqdm import tqdm
 
 from .models.autoencoders import ShapeVAE
 from .models.autoencoders import SurfaceExtractors
-from .utils import logger, synchronize_timer
+from .utils import logger, synchronize_timer, smart_load_model
 
 
 def retrieve_timesteps(
@@ -137,6 +138,9 @@ def instantiate_from_config(config, **kwargs):
 
 
 class Hunyuan3DDiTPipeline:
+    model_cpu_offload_seq = "conditioner->model->vae"
+    _exclude_from_cpu_offload = []
+
     @classmethod
     @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
     def from_single_file(
@@ -217,34 +221,12 @@ class Hunyuan3DDiTPipeline:
             dtype=dtype,
             device=device,
         )
-
-
-
-
-
-
-        logger.info('Model path not exists, try to download from huggingface')
-        try:
-            import huggingface_hub
-            # download from huggingface
-            path = huggingface_hub.snapshot_download(repo_id=original_model_path)
-            model_path = os.path.join(path, subfolder)
-        except ImportError:
-            logger.warning(
-                "You need to install HuggingFace Hub to load models from the hub."
-            )
-            raise RuntimeError(f"Model path {model_path} not found")
-        except Exception as e:
-            raise e
-
-        if not os.path.exists(model_path):
-            raise FileNotFoundError(f"Model path {original_model_path} not found")
-
-        extension = 'ckpt' if not use_safetensors else 'safetensors'
-        variant = '' if variant is None else f'.{variant}'
-        ckpt_name = f'model{variant}.{extension}'
-        config_path = os.path.join(model_path, 'config.yaml')
-        ckpt_path = os.path.join(model_path, ckpt_name)
+        config_path, ckpt_path = smart_load_model(
+            model_path,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+            variant=variant
+        )
         return cls.from_single_file(
             ckpt_path,
             config_path,
@@ -278,17 +260,170 @@ class Hunyuan3DDiTPipeline:
         self.model = torch.compile(self.model)
         self.conditioner = torch.compile(self.conditioner)
 
+    def enable_flashvdm(
+        self,
+        enabled: bool = True,
+        adaptive_kv_selection=True,
+        topk_mode='mean',
+        mc_algo='dmc',
+        replace_vae=True,
+    ):
+        if enabled:
+            model_path = self.kwargs['from_pretrained_kwargs']['model_path']
+            turbo_vae_mapping = {
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
+            }
+            model_name = model_path.split('/')[-1]
+            if replace_vae and model_name in turbo_vae_mapping:
+                model_path, subfolder = turbo_vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(
+                    model_path, subfolder=subfolder,
+                    use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
+                    device=self.device,
+                )
+            self.vae.enable_flashvdm_decoder(
+                enabled=enabled,
+                adaptive_kv_selection=adaptive_kv_selection,
+                topk_mode=topk_mode,
+                mc_algo=mc_algo
+            )
+        else:
+            model_path = self.kwargs['from_pretrained_kwargs']['model_path']
+            vae_mapping = {
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
+            }
+            model_name = model_path.split('/')[-1]
+            if model_name in vae_mapping:
+                model_path, subfolder = vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
+            self.vae.enable_flashvdm_decoder(enabled=False)
+
     def to(self, device=None, dtype=None):
-        if device is not None:
-            self.device = torch.device(device)
-            self.vae.to(device)
-            self.model.to(device)
-            self.conditioner.to(device)
         if dtype is not None:
             self.dtype = dtype
             self.vae.to(dtype=dtype)
             self.model.to(dtype=dtype)
             self.conditioner.to(dtype=dtype)
+        if device is not None:
+            self.device = torch.device(device)
+            self.vae.to(device)
+            self.model.to(device)
+            self.conditioner.to(device)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
+        Accelerate's module hooks.
+        """
+        for name, model in self.components.items():
+            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
+                continue
+
+            if not hasattr(model, "_hf_hook"):
+                return self.device
+            for module in model.modules():
+                if (
+                    hasattr(module, "_hf_hook")
+                    and hasattr(module._hf_hook, "execution_device")
+                    and module._hf_hook.execution_device is not None
+                ):
+                    return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
+        """
+        if self.model_cpu_offload_seq is None:
+            raise ValueError(
+                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
+            )
+
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu")
+            device_mod = getattr(torch, self.device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
+
+        self._all_hooks = []
+        hook = None
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str, None)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded
+        # these models will stay on CPU until maybe_free_model_hooks is called
+        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Function that offloads all components, removes all model hooks that were added when using
+        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
+        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
+        functions correctly when applying enable_model_cpu_offload.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not be called, so silently do nothing
+            return
+
+        for hook in self._all_hooks:
+            # offload model and remove hook from model
+            hook.offload()
+            hook.remove()
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload()
 
     @synchronize_timer('Encode cond')
     def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance):
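
For context, a short usage sketch of the API added by this commit (not part of the diff). It assumes the pipeline is built via a `from_pretrained` classmethod taking a repo id such as 'tencent/Hunyuan3D-2' (the diff only shows that loader indirectly, through `smart_load_model` and `self.kwargs['from_pretrained_kwargs']`) and that `accelerate >= 0.17.0` is installed:

# Hypothetical usage sketch; names follow the diff above, not a documented API.
from hy3dgen.shapegen.pipelines import Hunyuan3DDiTPipeline

pipe = Hunyuan3DDiTPipeline.from_pretrained('tencent/Hunyuan3D-2')  # repo id as in the vae_mapping dicts

# enable_model_cpu_offload() walks model_cpu_offload_seq = "conditioner->model->vae" and wraps each
# sub-model with accelerate's cpu_offload_with_hook, so only the sub-model currently running sits on the GPU.
pipe.enable_model_cpu_offload()

# Optionally switch to the turbo VAE and FlashVDM decoding path added by enable_flashvdm().
pipe.enable_flashvdm(enabled=True)

# After a generation call, maybe_free_model_hooks() offloads all components and re-installs the hooks;
# per its docstring it is meant to run at the end of the pipeline's __call__ (no-op if offload is not enabled).
pipe.maybe_free_model_hooks()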