ZeqiangLai committed
Commit 79cc00b · 1 Parent(s): b530233
hy3dgen/shapegen/__init__.py CHANGED
@@ -13,5 +13,5 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

 from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
+from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier
 from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
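Editor's note: the only change here is that `MeshSimplifier` is now exported alongside the existing post-processors. As a hedged usage sketch, the exported post-processors are typically applied as callables on a `trimesh.Trimesh`, following the project's documented pattern; the file path below is a placeholder, and `MeshSimplifier` is omitted because its constructor arguments are not shown in this commit.

import trimesh
from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover

mesh = trimesh.load('mesh.glb')          # placeholder path
mesh = FloaterRemover()(mesh)            # drop small floating components
mesh = DegenerateFaceRemover()(mesh)     # remove degenerate (zero-area) faces
mesh = FaceReducer()(mesh)               # decimate the face count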
hy3dgen/shapegen/models/__init__.py CHANGED
@@ -25,4 +25,4 @@

 from .autoencoders import ShapeVAE
 from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .denoisers import HunYuanDiTPlain, Hunyuan3DDiT
+from .denoisers import Hunyuan3DDiT
hy3dgen/shapegen/models/conditioner.py CHANGED
@@ -22,6 +22,7 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

+import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
@@ -33,6 +34,26 @@ from transformers import (
 )


+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    return np.concatenate([emb_sin, emb_cos], axis=1)
+
+
 class ImageEncoder(nn.Module):
     def __init__(
         self,
@@ -67,7 +88,7 @@ class ImageEncoder(nn.Module):
            ]
        )

-    def forward(self, image, mask=None, value_range=(-1, 1)):
+    def forward(self, image, mask=None, value_range=(-1, 1), **kwargs):
        if value_range is not None:
            low, high = value_range
            image = (image - low) / (high - low)
@@ -82,7 +103,7 @@ class ImageEncoder(nn.Module):

        return last_hidden_state

-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
        device = next(self.model.parameters()).device
        dtype = next(self.model.parameters()).dtype
        zero = torch.zeros(
@@ -110,11 +131,82 @@ class DinoImageEncoder(ImageEncoder):
    std = [0.229, 0.224, 0.225]


+class DinoImageEncoderMV(DinoImageEncoder):
+    def __init__(
+        self,
+        version=None,
+        config=None,
+        use_cls_token=True,
+        image_size=224,
+        view_num=4,
+        **kwargs,
+    ):
+        super().__init__(version, config, use_cls_token, image_size, **kwargs)
+        self.view_num = view_num
+        self.num_patches = self.num_patches
+        pos = np.arange(self.view_num, dtype=np.float32)
+        view_embedding = torch.from_numpy(
+            get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
+
+        view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
+        self.view_embed = view_embedding.unsqueeze(0)
+
+    def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
+        if value_range is not None:
+            low, high = value_range
+            image = (image - low) / (high - low)
+
+        image = image.to(self.model.device, dtype=self.model.dtype)
+
+        bs, num_views, c, h, w = image.shape
+        image = image.view(bs * num_views, c, h, w)
+
+        inputs = self.transform(image)
+        outputs = self.model(inputs)
+
+        last_hidden_state = outputs.last_hidden_state
+        last_hidden_state = last_hidden_state.view(
+            bs, num_views, last_hidden_state.shape[-2],
+            last_hidden_state.shape[-1]
+        )
+
+        view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
+        if view_idxs is not None:
+            assert len(view_idxs) == bs
+            view_embeddings = []
+            for i in range(bs):
+                view_idx = view_idxs[i]
+                assert num_views == len(view_idx)
+                view_embeddings.append(self.view_embed[:, view_idx, ...])
+            view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
+
+        if num_views != self.view_num:
+            view_embedding = view_embedding[:, :num_views, ...]
+        last_hidden_state = last_hidden_state + view_embedding
+        last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
+                                                   last_hidden_state.shape[-1])
+        return last_hidden_state
+
+    def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
+        device = next(self.model.parameters()).device
+        dtype = next(self.model.parameters()).dtype
+        zero = torch.zeros(
+            batch_size,
+            self.num_patches * len(view_idxs[0]),
+            self.model.config.hidden_size,
+            device=device,
+            dtype=dtype,
+        )
+        return zero
+
+
 def build_image_encoder(config):
    if config['type'] == 'CLIPImageEncoder':
        return CLIPImageEncoder(**config['kwargs'])
    elif config['type'] == 'DinoImageEncoder':
        return DinoImageEncoder(**config['kwargs'])
+    elif config['type'] == 'DinoImageEncoderMV':
+        return DinoImageEncoderMV(**config['kwargs'])
    else:
        raise ValueError(f'Unknown image encoder type: {config["type"]}')

@@ -129,17 +221,17 @@ class DualImageEncoder(nn.Module):
        self.main_image_encoder = build_image_encoder(main_image_encoder)
        self.additional_image_encoder = build_image_encoder(additional_image_encoder)

-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
        outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-            'additional': self.additional_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
+            'additional': self.additional_image_encoder(image, mask=mask, **kwargs),
        }
        return outputs

-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
+            'additional': self.additional_image_encoder.unconditional_embedding(batch_size, **kwargs),
        }
        return outputs

@@ -152,14 +244,14 @@ class SingleImageEncoder(nn.Module):
        super().__init__()
        self.main_image_encoder = build_image_encoder(main_image_encoder)

-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
        outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
        }
        return outputs

-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
        }
        return outputs
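Editor's note on the conditioner change: `DinoImageEncoderMV` flattens a batch of multi-view images into `bs * num_views` single images for the DINO backbone, then adds a fixed 1D sin/cos embedding per view so the denoiser can tell the views apart. The sketch below is a self-contained illustration (not code from the commit) of how that per-view table is built and broadcast over patch tokens; the hidden size and patch count are illustrative values, not taken from the diff.

import numpy as np
import torch

def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    # same construction as in the commit: half sin, half cos over log-spaced frequencies
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega
    out = np.einsum('m,d->md', pos.reshape(-1), omega)
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)

hidden_size, view_num, num_patches = 1024, 4, 1370   # illustrative sizes only
pos = np.arange(view_num, dtype=np.float32)
view_embedding = torch.from_numpy(get_1d_sincos_pos_embed_from_grid(hidden_size, pos)).float()
# one row per view, repeated over that view's patch tokens, then batched:
view_embed = view_embedding.unsqueeze(1).repeat(1, num_patches, 1).unsqueeze(0)
print(view_embed.shape)   # torch.Size([1, 4, 1370, 1024])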
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py CHANGED
@@ -60,6 +60,15 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
    return embedding


+class GELU(nn.Module):
+    def __init__(self, approximate='tanh'):
+        super().__init__()
+        self.approximate = approximate
+
+    def forward(self, x: Tensor) -> Tensor:
+        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
+
+
 class MLPEmbedder(nn.Module):
    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
@@ -162,7 +171,7 @@ class DoubleStreamBlock(nn.Module):
        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            nn.GELU(approximate="tanh"),
+            GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

@@ -173,7 +182,7 @@
        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            nn.GELU(approximate="tanh"),
+            GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

@@ -239,7 +248,7 @@ class SingleStreamBlock(nn.Module):
        self.hidden_size = hidden_size
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

-        self.mlp_act = nn.GELU(approximate="tanh")
+        self.mlp_act = GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False)

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
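Editor's note: this file replaces `nn.GELU(approximate="tanh")` with a thin wrapper that calls `.contiguous()` on the input before `nn.functional.gelu`. The rationale is not stated in the diff; a plausible reading is that it guards against non-contiguous activations coming from upstream reshapes. The wrapper should be numerically identical to the module it replaces, which the hedged check below exercises on a deliberately non-contiguous tensor.

import torch
import torch.nn as nn

class GELU(nn.Module):
    # mirrors the class added in this commit
    def __init__(self, approximate='tanh'):
        super().__init__()
        self.approximate = approximate

    def forward(self, x):
        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)

x = torch.randn(2, 8, 16).transpose(1, 2)   # non-contiguous input
assert torch.allclose(GELU()(x), nn.GELU(approximate="tanh")(x))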
hy3dgen/shapegen/pipelines.py CHANGED
@@ -24,11 +24,12 @@ import trimesh
 import yaml
 from PIL import Image
 from diffusers.utils.torch_utils import randn_tensor
+from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
 from tqdm import tqdm

 from .models.autoencoders import ShapeVAE
 from .models.autoencoders import SurfaceExtractors
-from .utils import logger, synchronize_timer
+from .utils import logger, synchronize_timer, smart_load_model


 def retrieve_timesteps(
@@ -127,6 +128,9 @@ def instantiate_from_config(config, **kwargs):


 class Hunyuan3DDiTPipeline:
+    model_cpu_offload_seq = "conditioner->model->vae"
+    _exclude_from_cpu_offload = []
+
    @classmethod
    @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
    def from_single_file(
@@ -207,34 +211,12 @@
            dtype=dtype,
            device=device,
        )
-        original_model_path = model_path
-        # try local path
-        base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
-        model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
-        logger.info(f'Try to load model from local path: {model_path}')
-        if not os.path.exists(model_path):
-            logger.info('Model path not exists, try to download from huggingface')
-            try:
-                import huggingface_hub
-                # download from huggingface
-                path = huggingface_hub.snapshot_download(repo_id=original_model_path)
-                model_path = os.path.join(path, subfolder)
-            except ImportError:
-                logger.warning(
-                    "You need to install HuggingFace Hub to load models from the hub."
-                )
-                raise RuntimeError(f"Model path {model_path} not found")
-            except Exception as e:
-                raise e
-
-        if not os.path.exists(model_path):
-            raise FileNotFoundError(f"Model path {original_model_path} not found")
-
-        extension = 'ckpt' if not use_safetensors else 'safetensors'
-        variant = '' if variant is None else f'.{variant}'
-        ckpt_name = f'model{variant}.{extension}'
-        config_path = os.path.join(model_path, 'config.yaml')
-        ckpt_path = os.path.join(model_path, ckpt_name)
+        config_path, ckpt_path = smart_load_model(
+            model_path,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+            variant=variant
+        )
        return cls.from_single_file(
            ckpt_path,
            config_path,
@@ -279,12 +261,18 @@
        if enabled:
            model_path = self.kwargs['from_pretrained_kwargs']['model_path']
            turbo_vae_mapping = {
-                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
-                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s-turbo'
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
            }
            model_name = model_path.split('/')[-1]
            if replace_vae and model_name in turbo_vae_mapping:
-                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=turbo_vae_mapping[model_name])
+                model_path, subfolder = turbo_vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(
+                    model_path, subfolder=subfolder,
+                    use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
+                    device=self.device,
+                )
            self.vae.enable_flashvdm_decoder(
                enabled=enabled,
                adaptive_kv_selection=adaptive_kv_selection,
@@ -294,33 +282,146 @@
        else:
            model_path = self.kwargs['from_pretrained_kwargs']['model_path']
            vae_mapping = {
-                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
-                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s'
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
            }
            model_name = model_path.split('/')[-1]
            if model_name in vae_mapping:
-                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=vae_mapping[model_name])
+                model_path, subfolder = vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
            self.vae.enable_flashvdm_decoder(enabled=False)

    def to(self, device=None, dtype=None):
-        if device is not None:
-            self.device = torch.device(device)
-            self.vae.to(device)
-            self.model.to(device)
-            self.conditioner.to(device)
        if dtype is not None:
            self.dtype = dtype
            self.vae.to(dtype=dtype)
            self.model.to(dtype=dtype)
            self.conditioner.to(dtype=dtype)
+        if device is not None:
+            self.device = torch.device(device)
+            self.vae.to(device)
+            self.model.to(device)
+            self.conditioner.to(device)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
+        Accelerate's module hooks.
+        """
+        for name, model in self.components.items():
+            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
+                continue
+
+            if not hasattr(model, "_hf_hook"):
+                return self.device
+            for module in model.modules():
+                if (
+                    hasattr(module, "_hf_hook")
+                    and hasattr(module._hf_hook, "execution_device")
+                    and module._hf_hook.execution_device is not None
+                ):
+                    return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
+        """
+        if self.model_cpu_offload_seq is None:
+            raise ValueError(
+                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
+            )
+
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu")
+            device_mod = getattr(torch, self.device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
+
+        self._all_hooks = []
+        hook = None
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str, None)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded
+        # these models will stay on CPU until maybe_free_model_hooks is called
+        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Function that offloads all components, removes all model hooks that were added when using
+        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
+        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
+        functions correctly when applying enable_model_cpu_offload.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not be called, so silently do nothing
+            return
+
+        for hook in self._all_hooks:
+            # offload model and remove hook from model
+            hook.offload()
+            hook.remove()
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload()

    @synchronize_timer('Encode cond')
-    def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
+    def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance):
        bsz = image.shape[0]
-        cond = self.conditioner(image=image, mask=mask)
+        cond = self.conditioner(image=image, **additional_cond_inputs)

        if do_classifier_free_guidance:
-            un_cond = self.conditioner.unconditional_embedding(bsz)
+            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)

            if dual_guidance:
                un_cond_drop_main = copy.deepcopy(un_cond)
@@ -336,8 +437,6 @@

                cond = cat_recursive(cond, un_cond_drop_main, un_cond)
            else:
-                un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
-
                def cat_recursive(a, b):
                    if isinstance(a, torch.Tensor):
                        return torch.cat([a, b], dim=0).to(self.dtype)
@@ -383,25 +482,27 @@
        latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
        return latents

-    def prepare_image(self, image):
+    def prepare_image(self, image) -> dict:
        if isinstance(image, str) and not os.path.exists(image):
            raise FileNotFoundError(f"Couldn't find image at path {image}")

        if not isinstance(image, list):
            image = [image]
-        image_pts = []
-        mask_pts = []
+
+        outputs = []
        for img in image:
-            image_pt, mask_pt = self.image_processor(img, return_mask=True)
-            image_pts.append(image_pt)
-            mask_pts.append(mask_pt)
+            output = self.image_processor(img)
+            outputs.append(output)

-        image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
-        if mask_pts[0] is not None:
-            mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
-        else:
-            mask_pts = None
-        return image_pts, mask_pts
+        cond_input = {k: [] for k in outputs[0].keys()}
+        for output in outputs:
+            for key, value in output.items():
+                cond_input[key].append(value)
+        for key, value in cond_input.items():
+            if isinstance(value[0], torch.Tensor):
+                cond_input[key] = torch.cat(value, dim=0)
+
+        return cond_input

    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
@@ -474,10 +575,14 @@
            getattr(self.model, 'guidance_cond_proj_dim', None) is None
        dual_guidance = dual_guidance_scale >= 0 and dual_guidance

-        image, mask = self.prepare_image(image)
-        cond = self.encode_cond(image=image,
-                                do_classifier_free_guidance=do_classifier_free_guidance,
-                                dual_guidance=dual_guidance)
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
+        cond = self.encode_cond(
+            image=image,
+            additional_cond_inputs=cond_inputs,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            dual_guidance=False,
+        )
        batch_size = image.shape[0]

        t_dtype = torch.long
@@ -535,7 +640,17 @@
            box_v, mc_level, num_chunks, octree_resolution, mc_algo,
        )

-    def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo, enable_pbar=True):
+    def _export(
+        self,
+        latents,
+        output_type='trimesh',
+        box_v=1.01,
+        mc_level=0.0,
+        num_chunks=20000,
+        octree_resolution=256,
+        mc_algo='mc',
+        enable_pbar=True
+    ):
        if not output_type == "latent":
            latents = 1. / self.vae.scale_factor * latents
            latents = self.vae(latents)
@@ -562,7 +677,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
    @torch.inference_mode()
    def __call__(
        self,
-        image: Union[str, List[str], Image.Image] = None,
+        image: Union[str, List[str], Image.Image, dict, List[dict]] = None,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
@@ -590,10 +705,11 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
            self.model.guidance_embed is True
        )

-        image, mask = self.prepare_image(image)
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
        cond = self.encode_cond(
            image=image,
-            mask=mask,
+            additional_cond_inputs=cond_inputs,
            do_classifier_free_guidance=do_classifier_free_guidance,
            dual_guidance=False,
        )
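Editor's note: taken together, the pipeline changes (a) route checkpoint resolution through `smart_load_model`, (b) add diffusers-style model CPU offload, and (c) replace the `(image, mask)` pair with the dict produced by the image processor, so multi-view conditioning flows into `encode_cond` as `additional_cond_inputs`. The sketch below is a hypothetical usage example, not code from the commit: the repo id, asset paths, the returned-list indexing, and the assumption that the 'Hunyuan3D-2mv' checkpoint config selects the `mv_v2` image processor are all unverified here.

from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2mv')
pipeline.enable_model_cpu_offload()   # new in this commit; needs accelerate >= 0.17.0

views = {                             # keys must match MVImageProcessorV2.view2idx
    'front': 'assets/front.png',
    'left': 'assets/left.png',
    'back': 'assets/back.png',
    'right': 'assets/right.png',
}
mesh = pipeline(image=views, num_inference_steps=50)[0]   # assumes a list of meshes is returned
mesh.export('demo.glb')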
hy3dgen/shapegen/postprocessors.py CHANGED
@@ -12,13 +12,16 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

+import os
 import tempfile
 from typing import Union

+import numpy as np
 import pymeshlab
+import torch
 import trimesh

-from .models.vae import Latent2MeshOutput
+from .models.autoencoders import Latent2MeshOutput
 from .utils import synchronize_timer

hy3dgen/shapegen/preprocessors.py CHANGED
@@ -87,9 +87,7 @@ class ImageProcessorV2:
        mask = mask.clip(0, 255).astype(np.uint8)
        return result, mask

-    def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
+    def load_image(self, image, border_ratio=0.15, to_tensor=True):
        if isinstance(image, str):
            image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
        image, mask = self.recenter(image, border_ratio=border_ratio)
@@ -106,13 +104,64 @@ class ImageProcessorV2:
        if to_tensor:
            image = array_to_tensor(image)
            mask = array_to_tensor(mask)
-        if return_mask:
-            return image, mask
-        return image
+        return image, mask
+
+    def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+        image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+        outputs = {
+            'image': image,
+            'mask': mask
+        }
+        return outputs
+
+
+class MVImageProcessorV2(ImageProcessorV2):
+    """
+    view order: front, front clockwise 90, back, front clockwise 270
+    """
+    return_view_idx = True
+
+    def __init__(self, size=512, border_ratio=None):
+        super().__init__(size, border_ratio)
+        self.view2idx = {
+            'front': 0,
+            'left': 1,
+            'back': 2,
+            'right': 3
+        }
+
+    def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+
+        images = []
+        masks = []
+        view_idxs = []
+        for idx, (view_tag, image) in enumerate(image_dict.items()):
+            view_idxs.append(self.view2idx[view_tag])
+            image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+            images.append(image)
+            masks.append(mask)
+
+        zipped_lists = zip(view_idxs, images, masks)
+        sorted_zipped_lists = sorted(zipped_lists)
+        view_idxs, images, masks = zip(*sorted_zipped_lists)
+
+        image = torch.cat(images, 0).unsqueeze(0)
+        mask = torch.cat(masks, 0).unsqueeze(0)
+        outputs = {
+            'image': image,
+            'mask': mask,
+            'view_idxs': view_idxs
+        }
+        return outputs


 IMAGE_PROCESSORS = {
    "v2": ImageProcessorV2,
+    'mv_v2': MVImageProcessorV2,
 }

 DEFAULT_IMAGEPROCESSOR = 'v2'
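Editor's note: these processor changes drive the pipeline changes above. `ImageProcessorV2.__call__` now always returns a dict, and the new `MVImageProcessorV2` accepts a `{view_tag: image}` dict, sorts the views into the canonical `view2idx` order, and additionally returns `view_idxs`. A hedged sketch of the contract, with placeholder file paths:

from hy3dgen.shapegen.preprocessors import ImageProcessorV2, MVImageProcessorV2

single = ImageProcessorV2(size=512)
out = single('front.png')                                # {'image': tensor, 'mask': tensor}

mv = MVImageProcessorV2(size=512)
out_mv = mv({'front': 'front.png', 'back': 'back.png'})
# out_mv['image'] stacks the per-view tensors along a new view dimension (batch first),
# and out_mv['view_idxs'] records the canonical order, here (0, 2) for front/back.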