Spaces: Running on Zero

Upload 16 files

- hy3dgen/shapegen/__init__.py +1 -1
- hy3dgen/shapegen/models/__init__.py +1 -1
- hy3dgen/shapegen/models/autoencoders/__init__.py +3 -3
- hy3dgen/shapegen/models/autoencoders/attention_processors.py +92 -0
- hy3dgen/shapegen/models/autoencoders/model.py +20 -1
- hy3dgen/shapegen/models/autoencoders/volume_decoders.py +353 -0
- hy3dgen/shapegen/models/conditioner.py +12 -104
- hy3dgen/shapegen/models/denoisers/__init__.py +1 -0
- hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py +3 -12
- hy3dgen/shapegen/pipelines.py +61 -174
- hy3dgen/shapegen/postprocessors.py +1 -63
- hy3dgen/shapegen/preprocessors.py +4 -65
- hy3dgen/shapegen/utils.py +0 -37
hy3dgen/shapegen/__init__.py
CHANGED
@@ -23,5 +23,5 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
 from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
+from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
 from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
hy3dgen/shapegen/models/__init__.py
CHANGED
@@ -25,4 +25,4 @@
 
 from .autoencoders import ShapeVAE
 from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .denoisers import Hunyuan3DDiT
+from .denoisers import HunYuanDiTPlain, Hunyuan3DDiT
hy3dgen/shapegen/models/autoencoders/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 from .attention_blocks import CrossAttentionDecoder
-from .attention_processors import CrossAttentionProcessor
+from .attention_processors import FlashVDMCrossAttentionProcessor, CrossAttentionProcessor, FlashVDMTopMCrossAttentionProcessor
 from .model import ShapeVAE, VectsetVAE
-from .surface_extractors import SurfaceExtractors, MCSurfaceExtractor, DMCSurfaceExtractor
-from .volume_decoders import
+from .surface_extractors import SurfaceExtractors, MCSurfaceExtractor, DMCSurfaceExtractor
+from .volume_decoders import HierarchicalVolumeDecoding, FlashVDMVolumeDecoding, VanillaVolumeDecoder
hy3dgen/shapegen/models/autoencoders/attention_processors.py
CHANGED
@@ -17,3 +17,95 @@ class CrossAttentionProcessor:
         out = scaled_dot_product_attention(q, k, v)
         return out
 
+
+class FlashVDMCrossAttentionProcessor:
+    def __init__(self, topk=None):
+        self.topk = topk
+
+    def __call__(self, attn, q, k, v):
+        if k.shape[-2] == 3072:
+            topk = 1024
+        elif k.shape[-2] == 512:
+            topk = 256
+        else:
+            topk = k.shape[-2] // 3
+
+        if self.topk is True:
+            q1 = q[:, :, ::100, :]
+            sim = q1 @ k.transpose(-1, -2)
+            sim = torch.mean(sim, -2)
+            topk_ind = torch.topk(sim, dim=-1, k=topk).indices.squeeze(-2).unsqueeze(-1)
+            topk_ind = topk_ind.expand(-1, -1, -1, v.shape[-1])
+            v0 = torch.gather(v, dim=-2, index=topk_ind)
+            k0 = torch.gather(k, dim=-2, index=topk_ind)
+            out = scaled_dot_product_attention(q, k0, v0)
+        elif self.topk is False:
+            out = scaled_dot_product_attention(q, k, v)
+        else:
+            idx, counts = self.topk
+            start = 0
+            outs = []
+            for grid_coord, count in zip(idx, counts):
+                end = start + count
+                q_chunk = q[:, :, start:end, :]
+                q1 = q_chunk[:, :, ::50, :]
+                sim = q1 @ k.transpose(-1, -2)
+                sim = torch.mean(sim, -2)
+                topk_ind = torch.topk(sim, dim=-1, k=topk).indices.squeeze(-2).unsqueeze(-1)
+                topk_ind = topk_ind.expand(-1, -1, -1, v.shape[-1])
+                v0 = torch.gather(v, dim=-2, index=topk_ind)
+                k0 = torch.gather(k, dim=-2, index=topk_ind)
+                out = scaled_dot_product_attention(q_chunk, k0, v0)
+                outs.append(out)
+                start += count
+            out = torch.cat(outs, dim=-2)
+        self.topk = False
+        return out
+
+
+class FlashVDMTopMCrossAttentionProcessor:
+    def __init__(self, topk=None):
+        self.topk = topk
+
+    def __call__(self, attn, q, k, v):
+        if k.shape[-2] == 3072:
+            topk = 1024
+        elif k.shape[-2] == 512:
+            topk = 256
+        else:
+            topk = k.shape[-2] // 3
+
+        if self.topk is True:
+            q1 = q[:, :, ::100, :]
+            sim = q1 @ k.transpose(-1, -2)
+            sim = torch.mean(sim, -2)
+            topk_ind = torch.topk(sim, dim=-1, k=topk).indices.squeeze(-2).unsqueeze(-1)
+            topk_ind = topk_ind.expand(-1, -1, -1, v.shape[-1])
+            v0 = torch.gather(v, dim=-2, index=topk_ind)
+            k0 = torch.gather(k, dim=-2, index=topk_ind)
+            out = scaled_dot_product_attention(q, k0, v0)
+        elif self.topk is False:
+            out = scaled_dot_product_attention(q, k, v)
+        else:
+            idx, counts = self.topk
+            start = 0
+            outs = []
+            for grid_coord, count in zip(idx, counts):
+                end = start + count
+                q_chunk = q[:, :, start:end, :]
+                q1 = q_chunk[:, :, ::30, :]
+                sim = q1 @ k.transpose(-1, -2)
+                # sim = sim.to(torch.float32)
+                sim = sim.softmax(-1)
+                sim = torch.mean(sim, 1)
+                activated_token = torch.where(sim > 1e-6)[2]
+                index = torch.unique(activated_token, return_counts=True)[0].unsqueeze(0).unsqueeze(0).unsqueeze(-1)
+                index = index.expand(-1, v.shape[1], -1, v.shape[-1])
+                v0 = torch.gather(v, dim=-2, index=index)
+                k0 = torch.gather(k, dim=-2, index=index)
+                out = scaled_dot_product_attention(q_chunk, k0, v0)  # bhnc
+                outs.append(out)
+                start += count
+            out = torch.cat(outs, dim=-2)
+        self.topk = False
+        return out
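Note: both processors above shrink the latent key/value set before calling scaled_dot_product_attention. The following is a standalone sketch of that top-k selection step (the `self.topk is True` branch) on random tensors; the batch, head, query and latent sizes are invented for illustration, and only PyTorch 2.x is required.

import torch
from torch.nn.functional import scaled_dot_product_attention

# q is (batch, heads, num_queries, dim); k/v are (batch, heads, num_latents, dim).
b, h, nq, nk, d = 1, 8, 8000, 3072, 64
q = torch.randn(b, h, nq, d)
k = torch.randn(b, h, nk, d)
v = torch.randn(b, h, nk, d)

topk = 1024  # the processor uses 1024 when the KV length is 3072

# Score every latent with a strided subset of the queries, then keep only the
# top-k keys/values for the full attention call.
q1 = q[:, :, ::100, :]
sim = (q1 @ k.transpose(-1, -2)).mean(-2)                 # (b, h, nk)
topk_ind = torch.topk(sim, dim=-1, k=topk).indices.unsqueeze(-1)
topk_ind = topk_ind.expand(-1, -1, -1, d)                 # (b, h, topk, d)
k0 = torch.gather(k, dim=-2, index=topk_ind)
v0 = torch.gather(v, dim=-2, index=topk_ind)

out = scaled_dot_product_attention(q, k0, v0)
print(out.shape)  # torch.Size([1, 8, 8000, 64])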
hy3dgen/shapegen/models/autoencoders/model.py
CHANGED
@@ -6,7 +6,7 @@ import yaml
 
 from .attention_blocks import FourierEmbedder, Transformer, CrossAttentionDecoder
 from .surface_extractors import MCSurfaceExtractor, SurfaceExtractors
-from .volume_decoders import VanillaVolumeDecoder
+from .volume_decoders import VanillaVolumeDecoder, FlashVDMVolumeDecoding, HierarchicalVolumeDecoding
 from ...utils import logger, synchronize_timer
 
 
@@ -117,6 +117,25 @@ class VectsetVAE(nn.Module):
         outputs = self.surface_extractor(grid_logits, **kwargs)
         return outputs
 
+    def enable_flashvdm_decoder(
+        self,
+        enabled: bool = True,
+        adaptive_kv_selection=True,
+        topk_mode='mean',
+        mc_algo='dmc',
+    ):
+        if enabled:
+            if adaptive_kv_selection:
+                self.volume_decoder = FlashVDMVolumeDecoding(topk_mode)
+            else:
+                self.volume_decoder = HierarchicalVolumeDecoding()
+            if mc_algo not in SurfaceExtractors.keys():
+                raise ValueError(f'Unsupported mc_algo {mc_algo}, available: {list(SurfaceExtractors.keys())}')
+            self.surface_extractor = SurfaceExtractors[mc_algo]()
+        else:
+            self.volume_decoder = VanillaVolumeDecoder()
+            self.surface_extractor = MCSurfaceExtractor()
+
 
 class ShapeVAE(VectsetVAE):
     def __init__(
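Note: a minimal usage sketch for the enable_flashvdm_decoder switch added above. It assumes the hy3dgen package from this commit is installed and that the ShapeVAE weights (the repo id and subfolder below mirror the vae_mapping used in pipelines.py) can be resolved locally or from the Hugging Face Hub; adjust both to your setup.

from hy3dgen.shapegen.models.autoencoders import ShapeVAE

vae = ShapeVAE.from_pretrained('tencent/Hunyuan3D-2', subfolder='hunyuan3d-vae-v2-0')

# Adaptive KV selection -> FlashVDMVolumeDecoding plus the requested surface extractor.
vae.enable_flashvdm_decoder(enabled=True, adaptive_kv_selection=True,
                            topk_mode='mean', mc_algo='dmc')

# enabled=False falls back to VanillaVolumeDecoder + MCSurfaceExtractor.
vae.enable_flashvdm_decoder(enabled=False)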
hy3dgen/shapegen/models/autoencoders/volume_decoders.py
CHANGED
@@ -8,9 +8,111 @@ from einops import repeat
 from tqdm import tqdm
 
 from .attention_blocks import CrossAttentionDecoder
+from .attention_processors import FlashVDMCrossAttentionProcessor, FlashVDMTopMCrossAttentionProcessor
 from ...utils import logger
 
 
+def extract_near_surface_volume_fn(input_tensor: torch.Tensor, alpha: float):
+    """
+    PyTorch implementation with the dimension issues fixed
+    Args:
+        input_tensor: shape [D, D, D], torch.float16
+        alpha: scalar offset value
+    Returns:
+        mask: shape [D, D, D], torch.int32 surface mask
+    """
+    device = input_tensor.device
+    D = input_tensor.shape[0]
+    signed_val = 0.0
+
+    # Add the offset and handle invalid values
+    val = input_tensor + alpha
+    valid_mask = val > -9000  # assume -9000 marks invalid values
+
+    # Improved neighbour fetch that keeps the dimensions consistent
+    def get_neighbor(t, shift, axis):
+        """Shift along the given axis while keeping the dimensions consistent"""
+        if shift == 0:
+            return t.clone()
+
+        # Determine the padding axis (the [D, D, D] input corresponds to the z, y, x axes)
+        pad_dims = [0, 0, 0, 0, 0, 0]  # format: [x_before, x_after, y_before, y_after, z_before, z_after]
+
+        # Set the padding according to the axis
+        if axis == 0:  # x axis (last dimension)
+            pad_idx = 0 if shift > 0 else 1
+            pad_dims[pad_idx] = abs(shift)
+        elif axis == 1:  # y axis (middle dimension)
+            pad_idx = 2 if shift > 0 else 3
+            pad_dims[pad_idx] = abs(shift)
+        elif axis == 2:  # z axis (first dimension)
+            pad_idx = 4 if shift > 0 else 5
+            pad_dims[pad_idx] = abs(shift)
+
+        # Apply the padding (add batch and channel dims to fit F.pad)
+        padded = F.pad(t.unsqueeze(0).unsqueeze(0), pad_dims[::-1], mode='replicate')  # reversed order for F.pad
+
+        # Build the dynamic slice indices
+        slice_dims = [slice(None)] * 3  # initialise to full slices
+        if axis == 0:  # x axis (dim=2)
+            if shift > 0:
+                slice_dims[0] = slice(shift, None)
+            else:
+                slice_dims[0] = slice(None, shift)
+        elif axis == 1:  # y axis (dim=1)
+            if shift > 0:
+                slice_dims[1] = slice(shift, None)
+            else:
+                slice_dims[1] = slice(None, shift)
+        elif axis == 2:  # z axis (dim=0)
+            if shift > 0:
+                slice_dims[2] = slice(shift, None)
+            else:
+                slice_dims[2] = slice(None, shift)
+
+        # Apply the slice and restore the dimensions
+        padded = padded.squeeze(0).squeeze(0)
+        sliced = padded[slice_dims]
+        return sliced
+
+    # Fetch the neighbours in every direction (dimensions stay consistent)
+    left = get_neighbor(val, 1, axis=0)  # x direction
+    right = get_neighbor(val, -1, axis=0)
+    back = get_neighbor(val, 1, axis=1)  # y direction
+    front = get_neighbor(val, -1, axis=1)
+    down = get_neighbor(val, 1, axis=2)  # z direction
+    up = get_neighbor(val, -1, axis=2)
+
+    # Handle invalid boundary values (use where to keep the dimensions consistent)
+    def safe_where(neighbor):
+        return torch.where(neighbor > -9000, neighbor, val)
+
+    left = safe_where(left)
+    right = safe_where(right)
+    back = safe_where(back)
+    front = safe_where(front)
+    down = safe_where(down)
+    up = safe_where(up)
+
+    # Compute sign consistency (cast to float32 for precision)
+    sign = torch.sign(val.to(torch.float32))
+    neighbors_sign = torch.stack([
+        torch.sign(left.to(torch.float32)),
+        torch.sign(right.to(torch.float32)),
+        torch.sign(back.to(torch.float32)),
+        torch.sign(front.to(torch.float32)),
+        torch.sign(down.to(torch.float32)),
+        torch.sign(up.to(torch.float32))
+    ], dim=0)
+
+    # Check whether all the signs agree
+    same_sign = torch.all(neighbors_sign == sign, dim=0)
+
+    # Produce the final mask
+    mask = (~same_sign).to(torch.int32)
+    return mask * valid_mask.to(torch.int32)
+
+
 def generate_dense_grid_points(
     bbox_min: np.ndarray,
     bbox_max: np.ndarray,
@@ -74,3 +176,254 @@ class VanillaVolumeDecoder:
         return grid_logits
 
 
+class HierarchicalVolumeDecoding:
+    @torch.no_grad()
+    def __call__(
+        self,
+        latents: torch.FloatTensor,
+        geo_decoder: Callable,
+        bounds: Union[Tuple[float], List[float], float] = 1.01,
+        num_chunks: int = 10000,
+        mc_level: float = 0.0,
+        octree_resolution: int = None,
+        min_resolution: int = 63,
+        enable_pbar: bool = True,
+        **kwargs,
+    ):
+        device = latents.device
+        dtype = latents.dtype
+
+        resolutions = []
+        if octree_resolution < min_resolution:
+            resolutions.append(octree_resolution)
+        while octree_resolution >= min_resolution:
+            resolutions.append(octree_resolution)
+            octree_resolution = octree_resolution // 2
+        resolutions.reverse()
+
+        # 1. generate query points
+        if isinstance(bounds, float):
+            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
+        bbox_min = np.array(bounds[0:3])
+        bbox_max = np.array(bounds[3:6])
+        bbox_size = bbox_max - bbox_min
+
+        xyz_samples, grid_size, length = generate_dense_grid_points(
+            bbox_min=bbox_min,
+            bbox_max=bbox_max,
+            octree_resolution=resolutions[0],
+            indexing="ij"
+        )
+
+        dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype)
+        dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device))
+
+        grid_size = np.array(grid_size)
+        xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3)
+
+        # 2. latents to 3d volume
+        batch_logits = []
+        batch_size = latents.shape[0]
+        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
+                          desc=f"Hierarchical Volume Decoding [r{resolutions[0] + 1}]"):
+            queries = xyz_samples[start: start + num_chunks, :]
+            batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
+            logits = geo_decoder(queries=batch_queries, latents=latents)
+            batch_logits.append(logits)
+
+        grid_logits = torch.cat(batch_logits, dim=1).view((batch_size, grid_size[0], grid_size[1], grid_size[2]))
+
+        for octree_depth_now in resolutions[1:]:
+            grid_size = np.array([octree_depth_now + 1] * 3)
+            resolution = bbox_size / octree_depth_now
+            next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device)
+            next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device)
+            curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level)
+            curr_points += grid_logits.squeeze(0).abs() < 0.95
+
+            if octree_depth_now == resolutions[-1]:
+                expand_num = 0
+            else:
+                expand_num = 1
+            for i in range(expand_num):
+                curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0)
+            (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0)
+            next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1
+            for i in range(2 - expand_num):
+                next_index = dilate(next_index.unsqueeze(0)).squeeze(0)
+            nidx = torch.where(next_index > 0)

+            next_points = torch.stack(nidx, dim=1)
+            next_points = (next_points * torch.tensor(resolution, dtype=next_points.dtype, device=device) +
+                           torch.tensor(bbox_min, dtype=next_points.dtype, device=device))
+            batch_logits = []
+            for start in tqdm(range(0, next_points.shape[0], num_chunks),
+                              desc=f"Hierarchical Volume Decoding [r{octree_depth_now + 1}]"):
+                queries = next_points[start: start + num_chunks, :]
+                batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
+                logits = geo_decoder(queries=batch_queries.to(latents.dtype), latents=latents)
+                batch_logits.append(logits)
+            grid_logits = torch.cat(batch_logits, dim=1)
+            next_logits[nidx] = grid_logits[0, ..., 0]
+            grid_logits = next_logits.unsqueeze(0)
+            grid_logits[grid_logits == -10000.] = float('nan')
+
+        return grid_logits
+
+
+class FlashVDMVolumeDecoding:
+    def __init__(self, topk_mode='mean'):
+        if topk_mode not in ['mean', 'merge']:
+            raise ValueError(f'Unsupported topk_mode {topk_mode}, available: {["mean", "merge"]}')
+
+        if topk_mode == 'mean':
+            self.processor = FlashVDMCrossAttentionProcessor()
+        else:
+            self.processor = FlashVDMTopMCrossAttentionProcessor()
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        latents: torch.FloatTensor,
+        geo_decoder: CrossAttentionDecoder,
+        bounds: Union[Tuple[float], List[float], float] = 1.01,
+        num_chunks: int = 10000,
+        mc_level: float = 0.0,
+        octree_resolution: int = None,
+        min_resolution: int = 63,
+        mini_grid_num: int = 4,
+        enable_pbar: bool = True,
+        **kwargs,
+    ):
+        processor = self.processor
+        geo_decoder.set_cross_attention_processor(processor)
+
+        device = latents.device
+        dtype = latents.dtype
+
+        resolutions = []
+        if octree_resolution < min_resolution:
+            resolutions.append(octree_resolution)
+        while octree_resolution >= min_resolution:
+            resolutions.append(octree_resolution)
+            octree_resolution = octree_resolution // 2
+        resolutions.reverse()
+        resolutions[0] = round(resolutions[0] / mini_grid_num) * mini_grid_num - 1
+        for i, resolution in enumerate(resolutions[1:]):
+            resolutions[i + 1] = resolutions[0] * 2 ** (i + 1)
+
+        logger.info(f"FlashVDMVolumeDecoding Resolution: {resolutions}")
+
+        # 1. generate query points
+        if isinstance(bounds, float):
+            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
+        bbox_min = np.array(bounds[0:3])
+        bbox_max = np.array(bounds[3:6])
+        bbox_size = bbox_max - bbox_min
+
+        xyz_samples, grid_size, length = generate_dense_grid_points(
+            bbox_min=bbox_min,
+            bbox_max=bbox_max,
+            octree_resolution=resolutions[0],
+            indexing="ij"
+        )
+
+        dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype)
+        dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device))
+
+        grid_size = np.array(grid_size)
+
+        # 2. latents to 3d volume
+        xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype)
+        batch_size = latents.shape[0]
+        mini_grid_size = xyz_samples.shape[0] // mini_grid_num
+        xyz_samples = xyz_samples.view(
+            mini_grid_num, mini_grid_size,
+            mini_grid_num, mini_grid_size,
+            mini_grid_num, mini_grid_size, 3
+        ).permute(
+            0, 2, 4, 1, 3, 5, 6
+        ).reshape(
+            -1, mini_grid_size * mini_grid_size * mini_grid_size, 3
+        )
+        batch_logits = []
+        num_batchs = max(num_chunks // xyz_samples.shape[1], 1)
+        for start in tqdm(range(0, xyz_samples.shape[0], num_batchs),
+                          desc=f"FlashVDM Volume Decoding", disable=not enable_pbar):
+            queries = xyz_samples[start: start + num_batchs, :]
+            batch = queries.shape[0]
+            batch_latents = repeat(latents.squeeze(0), "p c -> b p c", b=batch)
+            processor.topk = True
+            logits = geo_decoder(queries=queries, latents=batch_latents)
+            batch_logits.append(logits)
+        grid_logits = torch.cat(batch_logits, dim=0).reshape(
+            mini_grid_num, mini_grid_num, mini_grid_num,
+            mini_grid_size, mini_grid_size,
+            mini_grid_size
+        ).permute(0, 3, 1, 4, 2, 5).contiguous().view(
+            (batch_size, grid_size[0], grid_size[1], grid_size[2])
+        )
+
+        for octree_depth_now in resolutions[1:]:
+            grid_size = np.array([octree_depth_now + 1] * 3)
+            resolution = bbox_size / octree_depth_now
+            next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device)
+            next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device)
+            curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level)
+            curr_points += grid_logits.squeeze(0).abs() < 0.95
+
+            if octree_depth_now == resolutions[-1]:
+                expand_num = 0
+            else:
+                expand_num = 1
+            for i in range(expand_num):
+                curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0)
+            (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0)
+
+            next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1
+            for i in range(2 - expand_num):
+                next_index = dilate(next_index.unsqueeze(0)).squeeze(0)
+            nidx = torch.where(next_index > 0)
+
+            next_points = torch.stack(nidx, dim=1)
+            next_points = (next_points * torch.tensor(resolution, dtype=torch.float32, device=device) +
+                           torch.tensor(bbox_min, dtype=torch.float32, device=device))
+
+            query_grid_num = 6
+            min_val = next_points.min(axis=0).values
+            max_val = next_points.max(axis=0).values
+            vol_queries_index = (next_points - min_val) / (max_val - min_val) * (query_grid_num - 0.001)
+            index = torch.floor(vol_queries_index).long()
+            index = index[..., 0] * (query_grid_num ** 2) + index[..., 1] * query_grid_num + index[..., 2]
+            index = index.sort()
+            next_points = next_points[index.indices].unsqueeze(0).contiguous()
+            unique_values = torch.unique(index.values, return_counts=True)
+            grid_logits = torch.zeros((next_points.shape[1]), dtype=latents.dtype, device=latents.device)
+            input_grid = [[], []]
+            logits_grid_list = []
+            start_num = 0
+            sum_num = 0
+            for grid_index, count in zip(unique_values[0].cpu().tolist(), unique_values[1].cpu().tolist()):
+                if sum_num + count < num_chunks or sum_num == 0:
+                    sum_num += count
+                    input_grid[0].append(grid_index)
+                    input_grid[1].append(count)
+                else:
+                    processor.topk = input_grid
+                    logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents)
+                    start_num = start_num + sum_num
+                    logits_grid_list.append(logits_grid)
+                    input_grid = [[grid_index], [count]]
+                    sum_num = count
+            if sum_num > 0:
+                processor.topk = input_grid
+                logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents)
+                logits_grid_list.append(logits_grid)
+            logits_grid = torch.cat(logits_grid_list, dim=1)
+            grid_logits[index.indices] = logits_grid.squeeze(0).squeeze(-1)
+            next_logits[nidx] = grid_logits
+            grid_logits = next_logits.unsqueeze(0)
+
+        grid_logits[grid_logits == -10000.] = float('nan')
+
+        return grid_logits
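Note: both new decoders use extract_near_surface_volume_fn to find voxels whose sign flips against a 6-neighbour, i.e. cells crossed by the mc_level iso-surface, and refine only those cells at the next resolution. A small sketch of that helper on a toy sphere SDF; it assumes the hy3dgen package from this commit is importable, and the grid size is illustrative.

import torch

from hy3dgen.shapegen.models.autoencoders.volume_decoders import extract_near_surface_volume_fn

# Toy signed-distance-like grid: a sphere of radius 0.5 sampled on a 65^3 lattice.
D = 65
coords = torch.linspace(-1, 1, D)
z, y, x = torch.meshgrid(coords, coords, coords, indexing='ij')
sdf = (x ** 2 + y ** 2 + z ** 2).sqrt() - 0.5

# alpha shifts the level being extracted; 0.0 keeps the zero level set.
mask = extract_near_surface_volume_fn(sdf, alpha=0.0)
print(mask.shape, int(mask.sum()))  # [65, 65, 65] and a thin shell of flagged voxels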
hy3dgen/shapegen/models/conditioner.py
CHANGED
@@ -22,7 +22,6 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
-import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
@@ -34,26 +33,6 @@ from transformers import (
 )
 
 
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position
-    pos: a list of positions to be encoded: size (M,)
-    out: (M, D)
-    """
-    assert embed_dim % 2 == 0
-    omega = np.arange(embed_dim // 2, dtype=np.float64)
-    omega /= embed_dim / 2.
-    omega = 1. / 10000 ** omega  # (D/2,)
-
-    pos = pos.reshape(-1)  # (M,)
-    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
-
-    emb_sin = np.sin(out)  # (M, D/2)
-    emb_cos = np.cos(out)  # (M, D/2)
-
-    return np.concatenate([emb_sin, emb_cos], axis=1)
-
-
 class ImageEncoder(nn.Module):
     def __init__(
         self,
@@ -88,7 +67,7 @@ class ImageEncoder(nn.Module):
            ]
        )
 
-    def forward(self, image, mask=None, value_range=(-1, 1)
+    def forward(self, image, mask=None, value_range=(-1, 1)):
        if value_range is not None:
            low, high = value_range
            image = (image - low) / (high - low)
@@ -103,7 +82,7 @@ class ImageEncoder(nn.Module):
 
        return last_hidden_state
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
        device = next(self.model.parameters()).device
        dtype = next(self.model.parameters()).dtype
        zero = torch.zeros(
@@ -131,82 +110,11 @@ class DinoImageEncoder(ImageEncoder):
    std = [0.229, 0.224, 0.225]
 
 
-class DinoImageEncoderMV(DinoImageEncoder):
-    def __init__(
-        self,
-        version=None,
-        config=None,
-        use_cls_token=True,
-        image_size=224,
-        view_num=4,
-        **kwargs,
-    ):
-        super().__init__(version, config, use_cls_token, image_size, **kwargs)
-        self.view_num = view_num
-        self.num_patches = self.num_patches
-        pos = np.arange(self.view_num, dtype=np.float32)
-        view_embedding = torch.from_numpy(
-            get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
-
-        view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
-        self.view_embed = view_embedding.unsqueeze(0)
-
-    def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
-        if value_range is not None:
-            low, high = value_range
-            image = (image - low) / (high - low)
-
-        image = image.to(self.model.device, dtype=self.model.dtype)
-
-        bs, num_views, c, h, w = image.shape
-        image = image.view(bs * num_views, c, h, w)
-
-        inputs = self.transform(image)
-        outputs = self.model(inputs)
-
-        last_hidden_state = outputs.last_hidden_state
-        last_hidden_state = last_hidden_state.view(
-            bs, num_views, last_hidden_state.shape[-2],
-            last_hidden_state.shape[-1]
-        )
-
-        view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
-        if view_idxs is not None:
-            assert len(view_idxs) == bs
-            view_embeddings = []
-            for i in range(bs):
-                view_idx = view_idxs[i]
-                assert num_views == len(view_idx)
-                view_embeddings.append(self.view_embed[:, view_idx, ...])
-            view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
-
-        if num_views != self.view_num:
-            view_embedding = view_embedding[:, :num_views, ...]
-        last_hidden_state = last_hidden_state + view_embedding
-        last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
-                                                   last_hidden_state.shape[-1])
-        return last_hidden_state
-
-    def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
-        device = next(self.model.parameters()).device
-        dtype = next(self.model.parameters()).dtype
-        zero = torch.zeros(
-            batch_size,
-            self.num_patches * len(view_idxs[0]),
-            self.model.config.hidden_size,
-            device=device,
-            dtype=dtype,
-        )
-        return zero
-
-
 def build_image_encoder(config):
    if config['type'] == 'CLIPImageEncoder':
        return CLIPImageEncoder(**config['kwargs'])
    elif config['type'] == 'DinoImageEncoder':
        return DinoImageEncoder(**config['kwargs'])
-    elif config['type'] == 'DinoImageEncoderMV':
-        return DinoImageEncoderMV(**config['kwargs'])
    else:
        raise ValueError(f'Unknown image encoder type: {config["type"]}')
 
@@ -221,17 +129,17 @@ class DualImageEncoder(nn.Module):
        self.main_image_encoder = build_image_encoder(main_image_encoder)
        self.additional_image_encoder = build_image_encoder(additional_image_encoder)
 
-    def forward(self, image, mask=None
+    def forward(self, image, mask=None):
        outputs = {
-            'main': self.main_image_encoder(image, mask=mask
-            'additional': self.additional_image_encoder(image, mask=mask
+            'main': self.main_image_encoder(image, mask=mask),
+            'additional': self.additional_image_encoder(image, mask=mask),
        }
        return outputs
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size
+            'main': self.main_image_encoder.unconditional_embedding(batch_size),
+            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
        }
        return outputs
 
@@ -244,14 +152,14 @@ class SingleImageEncoder(nn.Module):
        super().__init__()
        self.main_image_encoder = build_image_encoder(main_image_encoder)
 
-    def forward(self, image, mask=None
+    def forward(self, image, mask=None):
        outputs = {
-            'main': self.main_image_encoder(image, mask=mask
+            'main': self.main_image_encoder(image, mask=mask),
        }
        return outputs
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size
+            'main': self.main_image_encoder.unconditional_embedding(batch_size),
        }
        return outputs
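Note: the helper removed above builds standard 1-D sin/cos positional embeddings; the deleted DinoImageEncoderMV used it to tag each input view. Reproduced verbatim below with a quick shape check; the 4 views and 1024-dim hidden size are illustrative numbers, not values from the repo.

import numpy as np

def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    # One sin/cos embedding row per position, total width embed_dim.
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega            # (D/2,)
    pos = pos.reshape(-1)                  # (M,)
    out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)

emb = get_1d_sincos_pos_embed_from_grid(1024, np.arange(4, dtype=np.float32))
print(emb.shape)  # (4, 1024): one embedding per view index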
hy3dgen/shapegen/models/denoisers/__init__.py
CHANGED
@@ -1 +1,2 @@
 from .hunyuan3ddit import Hunyuan3DDiT
+from .hunyuandit import HunYuanDiTPlain
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py
CHANGED
@@ -70,15 +70,6 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     return embedding
 
 
-class GELU(nn.Module):
-    def __init__(self, approximate='tanh'):
-        super().__init__()
-        self.approximate = approximate
-
-    def forward(self, x: Tensor) -> Tensor:
-        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
-
-
 class MLPEmbedder(nn.Module):
     def __init__(self, in_dim: int, hidden_dim: int):
         super().__init__()
@@ -181,7 +172,7 @@ class DoubleStreamBlock(nn.Module):
         self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.img_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            GELU(approximate="tanh"),
+            nn.GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -192,7 +183,7 @@ class DoubleStreamBlock(nn.Module):
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            GELU(approximate="tanh"),
+            nn.GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -258,7 +249,7 @@ class SingleStreamBlock(nn.Module):
         self.hidden_size = hidden_size
         self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
 
-        self.mlp_act = GELU(approximate="tanh")
+        self.mlp_act = nn.GELU(approximate="tanh")
         self.modulation = Modulation(hidden_size, double=False)
 
     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
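Note: the removed GELU class was a thin wrapper around torch.nn.functional.gelu, and the diff swaps it for the built-in nn.GELU(approximate="tanh"). A quick numerical check that the two modules agree:

import torch
import torch.nn as nn

class OldGELU(nn.Module):
    # The wrapper deleted by this commit, reproduced for comparison.
    def __init__(self, approximate='tanh'):
        super().__init__()
        self.approximate = approximate

    def forward(self, x):
        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)

x = torch.randn(4, 16)
print(torch.allclose(OldGELU()(x), nn.GELU(approximate="tanh")(x)))  # True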
hy3dgen/shapegen/pipelines.py
CHANGED
@@ -34,12 +34,11 @@ import trimesh
 import yaml
 from PIL import Image
 from diffusers.utils.torch_utils import randn_tensor
-from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
 from tqdm import tqdm
 
 from .models.autoencoders import ShapeVAE
 from .models.autoencoders import SurfaceExtractors
-from .utils import logger, synchronize_timer
+from .utils import logger, synchronize_timer
 
 
 def retrieve_timesteps(
@@ -138,9 +137,6 @@ def instantiate_from_config(config, **kwargs):
 
 
 class Hunyuan3DDiTPipeline:
-    model_cpu_offload_seq = "conditioner->model->vae"
-    _exclude_from_cpu_offload = []
-
     @classmethod
     @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
     def from_single_file(
@@ -221,12 +217,34 @@
             dtype=dtype,
             device=device,
         )
-
-
-
-
-
-        )
+        original_model_path = model_path
+        # try local path
+        base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
+        model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
+        logger.info(f'Try to load model from local path: {model_path}')
+        if not os.path.exists(model_path):
+            logger.info('Model path not exists, try to download from huggingface')
+            try:
+                import huggingface_hub
+                # download from huggingface
+                path = huggingface_hub.snapshot_download(repo_id=original_model_path)
+                model_path = os.path.join(path, subfolder)
+            except ImportError:
+                logger.warning(
+                    "You need to install HuggingFace Hub to load models from the hub."
+                )
+                raise RuntimeError(f"Model path {model_path} not found")
+            except Exception as e:
+                raise e
+
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(f"Model path {original_model_path} not found")
+
+        extension = 'ckpt' if not use_safetensors else 'safetensors'
+        variant = '' if variant is None else f'.{variant}'
+        ckpt_name = f'model{variant}.{extension}'
+        config_path = os.path.join(model_path, 'config.yaml')
+        ckpt_path = os.path.join(model_path, ckpt_name)
         return cls.from_single_file(
             ckpt_path,
             config_path,
@@ -271,18 +289,12 @@
         if enabled:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             turbo_vae_mapping = {
-                'Hunyuan3D-2':
-                'Hunyuan3D-
-                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
+                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
+                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s-turbo'
             }
             model_name = model_path.split('/')[-1]
             if replace_vae and model_name in turbo_vae_mapping:
-                model_path, subfolder
-                self.vae = ShapeVAE.from_pretrained(
-                    model_path, subfolder=subfolder,
-                    use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
-                    device=self.device,
-                )
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=turbo_vae_mapping[model_name])
             self.vae.enable_flashvdm_decoder(
                 enabled=enabled,
                 adaptive_kv_selection=adaptive_kv_selection,
@@ -292,146 +304,33 @@
         else:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             vae_mapping = {
-                'Hunyuan3D-2':
-                'Hunyuan3D-
-                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
+                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
+                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s'
             }
             model_name = model_path.split('/')[-1]
             if model_name in vae_mapping:
-                model_path, subfolder
-                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=vae_mapping[model_name])
             self.vae.enable_flashvdm_decoder(enabled=False)
 
     def to(self, device=None, dtype=None):
-        if dtype is not None:
-            self.dtype = dtype
-            self.vae.to(dtype=dtype)
-            self.model.to(dtype=dtype)
-            self.conditioner.to(dtype=dtype)
         if device is not None:
             self.device = torch.device(device)
             self.vae.to(device)
             self.model.to(device)
             self.conditioner.to(device)
-
-
-
-
-
-        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
-        Accelerate's module hooks.
-        """
-        for name, model in self.components.items():
-            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
-                continue
-
-            if not hasattr(model, "_hf_hook"):
-                return self.device
-            for module in model.modules():
-                if (
-                    hasattr(module, "_hf_hook")
-                    and hasattr(module._hf_hook, "execution_device")
-                    and module._hf_hook.execution_device is not None
-                ):
-                    return torch.device(module._hf_hook.execution_device)
-        return self.device
-
-    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
-        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-
-        Arguments:
-            gpu_id (`int`, *optional*):
-                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
-                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                default to "cuda".
-        """
-        if self.model_cpu_offload_seq is None:
-            raise ValueError(
-                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
-            )
-
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        torch_device = torch.device(device)
-        device_index = torch_device.index
-
-        if gpu_id is not None and device_index is not None:
-            raise ValueError(
-                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
-                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
-            )
-
-        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
-        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
-
-        device_type = torch_device.type
-        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu")
-            device_mod = getattr(torch, self.device.type, None)
-            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
-                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
-
-        self._all_hooks = []
-        hook = None
-        for model_str in self.model_cpu_offload_seq.split("->"):
-            model = all_model_components.pop(model_str, None)
-            if not isinstance(model, torch.nn.Module):
-                continue
-
-            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
-            self._all_hooks.append(hook)
-
-        # CPU offload models that are not in the seq chain unless they are explicitly excluded
-        # these models will stay on CPU until maybe_free_model_hooks is called
-        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
-        for name, model in all_model_components.items():
-            if not isinstance(model, torch.nn.Module):
-                continue
-
-            if name in self._exclude_from_cpu_offload:
-                model.to(device)
-            else:
-                _, hook = cpu_offload_with_hook(model, device)
-                self._all_hooks.append(hook)
-
-    def maybe_free_model_hooks(self):
-        r"""
-        Function that offloads all components, removes all model hooks that were added when using
-        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
-        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
-        functions correctly when applying enable_model_cpu_offload.
-        """
-        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
-            # `enable_model_cpu_offload` has not be called, so silently do nothing
-            return
-
-        for hook in self._all_hooks:
-            # offload model and remove hook from model
-            hook.offload()
-            hook.remove()
-
-        # make sure the model is in the same state as before calling it
-        self.enable_model_cpu_offload()
+        if dtype is not None:
+            self.dtype = dtype
+            self.vae.to(dtype=dtype)
+            self.model.to(dtype=dtype)
+            self.conditioner.to(dtype=dtype)
 
     @synchronize_timer('Encode cond')
-    def encode_cond(self, image,
+    def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
         bsz = image.shape[0]
-        cond = self.conditioner(image=image,
+        cond = self.conditioner(image=image, mask=mask)
 
         if do_classifier_free_guidance:
-            un_cond = self.conditioner.unconditional_embedding(bsz
+            un_cond = self.conditioner.unconditional_embedding(bsz)
 
             if dual_guidance:
                 un_cond_drop_main = copy.deepcopy(un_cond)
@@ -447,7 +346,7 @@
 
                 cond = cat_recursive(cond, un_cond_drop_main, un_cond)
             else:
-                un_cond = self.conditioner.unconditional_embedding(bsz
+                un_cond = self.conditioner.unconditional_embedding(bsz)
 
                 def cat_recursive(a, b):
                     if isinstance(a, torch.Tensor):
@@ -494,27 +393,25 @@
         latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         return latents
 
-    def prepare_image(self, image)
+    def prepare_image(self, image):
        if isinstance(image, str) and not os.path.exists(image):
            raise FileNotFoundError(f"Couldn't find image at path {image}")
 
        if not isinstance(image, list):
            image = [image]
-
-
+        image_pts = []
+        mask_pts = []
        for img in image:
-
-
-
-        cond_input = {k: [] for k in outputs[0].keys()}
-        for output in outputs:
-            for key, value in output.items():
-                cond_input[key].append(value)
-        for key, value in cond_input.items():
-            if isinstance(value[0], torch.Tensor):
-                cond_input[key] = torch.cat(value, dim=0)
+            image_pt, mask_pt = self.image_processor(img, return_mask=True)
+            image_pts.append(image_pt)
+            mask_pts.append(mask_pt)
 
-
+        image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
+        if mask_pts[0] is not None:
+            mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
+        else:
+            mask_pts = None
+        return image_pts, mask_pts
 
    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
@@ -589,6 +486,7 @@
 
        image, mask = self.prepare_image(image)
        cond = self.encode_cond(image=image,
+                                mask=mask,
                                do_classifier_free_guidance=do_classifier_free_guidance,
                                dual_guidance=dual_guidance)
        batch_size = image.shape[0]
@@ -648,17 +546,7 @@
            box_v, mc_level, num_chunks, octree_resolution, mc_algo,
        )
 
-    def _export(
-        self,
-        latents,
-        output_type='trimesh',
-        box_v=1.01,
-        mc_level=0.0,
-        num_chunks=20000,
-        octree_resolution=256,
-        mc_algo='mc',
-        enable_pbar=True
-    ):
+    def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo, enable_pbar=True):
        if not output_type == "latent":
            latents = 1. / self.vae.scale_factor * latents
            latents = self.vae(latents)
@@ -685,7 +573,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
    @torch.inference_mode()
    def __call__(
        self,
-        image: Union[str, List[str], Image.Image
+        image: Union[str, List[str], Image.Image] = None,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
@@ -713,11 +601,10 @@
            self.model.guidance_embed is True
        )
 
-
-        image = cond_inputs.pop('image')
+        image, mask = self.prepare_image(image)
        cond = self.encode_cond(
            image=image,
-
+            mask=mask,
            do_classifier_free_guidance=do_classifier_free_guidance,
            dual_guidance=False,
        )
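Note: the new from_pretrained path above first looks for the checkpoint under $HY3DGEN_MODELS (default ~/.cache/hy3dgen) and only then falls back to a Hugging Face Hub snapshot. A standalone sketch of that lookup order; the repo id and subfolder in the commented call are illustrative, not values asserted by this diff.

import os

def resolve_model_dir(model_path: str, subfolder: str) -> str:
    # Local cache first, as in the diff.
    base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
    local_dir = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
    if os.path.exists(local_dir):
        return local_dir
    # Optional dependency: only needed when the checkpoint is not cached locally.
    import huggingface_hub
    snapshot = huggingface_hub.snapshot_download(repo_id=model_path)
    return os.path.join(snapshot, subfolder)

# Example (requires network access on a cache miss):
# resolve_model_dir('tencent/Hunyuan3D-2', 'hunyuan3d-dit-v2-0')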
hy3dgen/shapegen/postprocessors.py
CHANGED
@@ -22,16 +22,13 @@
22 | # fine-tuning enabling code and other elements of the foregoing made publicly available
23 | # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24 |
25 | - import os
26 | import tempfile
27 | from typing import Union
28 |
29 | - import numpy as np
30 | import pymeshlab
31 | - import torch
32 | import trimesh
33 |
34 | - from .models.
35 | from .utils import synchronize_timer
36 |
37 |
@@ -165,62 +162,3 @@ class DegenerateFaceRemover:
165 |
166 | mesh = export_mesh(mesh, ms)
167 | return mesh
168 | -
169 | -
170 | - def import_pymeshlab_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet:
171 | - if isinstance(mesh, str):
172 | - mesh = load_mesh(mesh)
173 | - elif isinstance(mesh, Latent2MeshOutput):
174 | - mesh = pymeshlab.MeshSet()
175 | - mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f)
176 | - mesh.add_mesh(mesh_pymeshlab, "converted_mesh")
177 | -
178 | - if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
179 | - mesh = trimesh2pymeshlab(mesh)
180 | -
181 | - return mesh
182 | -
183 | -
184 | - def mesh_normalize(mesh):
185 | - """
186 | - Normalize mesh vertices to sphere
187 | - """
188 | - scale_factor = 1.2
189 | - vtx_pos = np.asarray(mesh.vertices)
190 | - max_bb = (vtx_pos - 0).max(0)[0]
191 | - min_bb = (vtx_pos - 0).min(0)[0]
192 | -
193 | - center = (max_bb + min_bb) / 2
194 | -
195 | - scale = torch.norm(torch.tensor(vtx_pos - center, dtype=torch.float32), dim=1).max() * 2.0
196 | -
197 | - vtx_pos = (vtx_pos - center) * (scale_factor / float(scale))
198 | - mesh.vertices = vtx_pos
199 | -
200 | - return mesh
201 | -
202 | -
203 | - class MeshSimplifier:
204 | - def __init__(self, executable: str = None):
205 | - if executable is None:
206 | - CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
207 | - executable = os.path.join(CURRENT_DIR, "mesh_simplifier.bin")
208 | - self.executable = executable
209 | -
210 | - @synchronize_timer('MeshSimplifier')
211 | - def __call__(
212 | - self,
213 | - mesh: Union[trimesh.Trimesh],
214 | - ) -> Union[trimesh.Trimesh]:
215 | - with tempfile.NamedTemporaryFile(suffix='.obj', delete=True) as temp_input:
216 | - with tempfile.NamedTemporaryFile(suffix='.obj', delete=True) as temp_output:
217 | - mesh.export(temp_input.name)
218 | - os.system(f'{self.executable} {temp_input.name} {temp_output.name}')
219 | - ms = trimesh.load(temp_output.name, process=False)
220 | - if isinstance(ms, trimesh.Scene):
221 | - combined_mesh = trimesh.Trimesh()
222 | - for geom in ms.geometry.values():
223 | - combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
224 | - ms = combined_mesh
225 | - ms = mesh_normalize(ms)
226 | - return ms

22 | # fine-tuning enabling code and other elements of the foregoing made publicly available
23 | # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24 |
25 | import tempfile
26 | from typing import Union
27 |
28 | import pymeshlab
29 | import trimesh
30 |
31 | + from .models.vae import Latent2MeshOutput
32 | from .utils import synchronize_timer
33 |
34 |

162 |
163 | mesh = export_mesh(mesh, ms)
164 | return mesh
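Note: this removes import_pymeshlab_mesh, mesh_normalize, and the external-binary MeshSimplifier, which is also why the numpy and torch imports are dropped. If the sphere normalization is still needed downstream, a numpy-only sketch of an equivalent helper (hypothetical, not part of this upload) is:

import numpy as np
import trimesh

def normalize_to_sphere(mesh: trimesh.Trimesh, scale_factor: float = 1.2) -> trimesh.Trimesh:
    # Center vertices on the bounding-box midpoint, then scale so the farthest
    # vertex fits inside a sphere of diameter `scale_factor`. (Centering is
    # per-axis here; the removed helper derived a scalar center from the
    # first coordinate only.)
    vtx = np.asarray(mesh.vertices, dtype=np.float64)
    center = (vtx.max(axis=0) + vtx.min(axis=0)) / 2.0
    scale = np.linalg.norm(vtx - center, axis=1).max() * 2.0
    mesh.vertices = (vtx - center) * (scale_factor / float(scale))
    return mesh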
hy3dgen/shapegen/preprocessors.py
CHANGED
@@ -96,7 +96,7 @@ class ImageProcessorV2:
96 | mask = mask.clip(0, 255).astype(np.uint8)
97 | return result, mask
98 |
99 | - def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs):
100 | if self.border_ratio is not None:
101 | border_ratio = self.border_ratio
102 | if isinstance(image, str):
@@ -115,74 +115,13 @@ class ImageProcessorV2:
115 | if to_tensor:
116 | image = array_to_tensor(image)
117 | mask = array_to_tensor(mask)
118 | -
119 | -
120 | -
121 | - 'mask': mask
122 | - }
123 | - return outputs
124 | -
125 | -
126 | - class MVImageProcessorV2(ImageProcessorV2):
127 | - """
128 | - view order: front, front clockwise 90, back, front clockwise 270
129 | - """
130 | - return_view_idx = True
131 | -
132 | - def __init__(self, size=512, border_ratio=None):
133 | - super().__init__(size, border_ratio)
134 | - self.view2idx = {
135 | - 'front': 0,
136 | - 'left': 1,
137 | - 'back': 2,
138 | - 'right': 3
139 | - }
140 | -
141 | - def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
142 | - if self.border_ratio is not None:
143 | - border_ratio = self.border_ratio
144 | -
145 | - images = []
146 | - masks = []
147 | - view_idxs = []
148 | - for idx, (view_tag, image) in enumerate(image_dict.items()):
149 | - view_idxs.append(self.view2idx[view_tag])
150 | - if isinstance(image, str):
151 | - image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
152 | - image, mask = self.recenter(image, border_ratio=border_ratio)
153 | - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
154 | - elif isinstance(image, Image.Image):
155 | - image = image.convert("RGBA")
156 | - image = np.asarray(image)
157 | - image, mask = self.recenter(image, border_ratio=border_ratio)
158 | -
159 | - image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
160 | - mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST)
161 | - mask = mask[..., np.newaxis]
162 | -
163 | - if to_tensor:
164 | - image = array_to_tensor(image)
165 | - mask = array_to_tensor(mask)
166 | - images.append(image)
167 | - masks.append(mask)
168 | -
169 | - zipped_lists = zip(view_idxs, images, masks)
170 | - sorted_zipped_lists = sorted(zipped_lists)
171 | - view_idxs, images, masks = zip(*sorted_zipped_lists)
172 | -
173 | - image = torch.cat(images, 0).unsqueeze(0)
174 | - mask = torch.cat(masks, 0).unsqueeze(0)
175 | - outputs = {
176 | - 'image': image,
177 | - 'mask': mask,
178 | - 'view_idxs': view_idxs
179 | - }
180 | - return outputs
181 |
182 |
183 | IMAGE_PROCESSORS = {
184 | "v2": ImageProcessorV2,
185 | - 'mv_v2': MVImageProcessorV2,
186 | }
187 |
188 | DEFAULT_IMAGEPROCESSOR = 'v2'

96 | mask = mask.clip(0, 255).astype(np.uint8)
97 | return result, mask
98 |
99 | + def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
100 | if self.border_ratio is not None:
101 | border_ratio = self.border_ratio
102 | if isinstance(image, str):

115 | if to_tensor:
116 | image = array_to_tensor(image)
117 | mask = array_to_tensor(mask)
118 | + if return_mask:
119 | + return image, mask
120 | + return image
121 |
122 |
123 | IMAGE_PROCESSORS = {
124 | "v2": ImageProcessorV2,
125 | }
126 |
127 | DEFAULT_IMAGEPROCESSOR = 'v2'
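Note: ImageProcessorV2.__call__ now returns only the image tensor by default, and the (image, mask) pair when return_mask=True, which is how the pipeline's prepare_image calls it. A brief usage sketch (constructor arguments and asset path are assumptions, not shown in this diff):

from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512)                          # args assumed from this file
image = processor('assets/demo.png')                            # default: image tensor only
image, mask = processor('assets/demo.png', return_mask=True)    # as used by prepare_image()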
hy3dgen/shapegen/utils.py
CHANGED
@@ -70,40 +70,3 @@ class synchronize_timer:
70 | return result
71 |
72 | return wrapper
73 | -
74 | -
75 | - def smart_load_model(
76 | - model_path,
77 | - subfolder,
78 | - use_safetensors,
79 | - variant,
80 | - ):
81 | - original_model_path = model_path
82 | - # try local path
83 | - base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
84 | - model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
85 | - logger.info(f'Try to load model from local path: {model_path}')
86 | - if not os.path.exists(model_path):
87 | - logger.info('Model path not exists, try to download from huggingface')
88 | - try:
89 | - import huggingface_hub
90 | - # download from huggingface
91 | - path = huggingface_hub.snapshot_download(repo_id=original_model_path)
92 | - model_path = os.path.join(path, subfolder)
93 | - except ImportError:
94 | - logger.warning(
95 | - "You need to install HuggingFace Hub to load models from the hub."
96 | - )
97 | - raise RuntimeError(f"Model path {model_path} not found")
98 | - except Exception as e:
99 | - raise e
100 | -
101 | - if not os.path.exists(model_path):
102 | - raise FileNotFoundError(f"Model path {original_model_path} not found")
103 | -
104 | - extension = 'ckpt' if not use_safetensors else 'safetensors'
105 | - variant = '' if variant is None else f'.{variant}'
106 | - ckpt_name = f'model{variant}.{extension}'
107 | - config_path = os.path.join(model_path, 'config.yaml')
108 | - ckpt_path = os.path.join(model_path, ckpt_name)
109 | - return config_path, ckpt_path

70 | return result
71 |
72 | return wrapper
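Note: smart_load_model is removed from utils.py. Callers that still need to resolve a local-cache-or-Hub checkpoint can do so directly. The sketch below follows the removed helper's assumptions (HY3DGEN_MODELS cache directory, config.yaml plus a single weights file per subfolder) and is not part of this upload.

import os
from huggingface_hub import snapshot_download

def resolve_checkpoint(repo_id: str, subfolder: str, use_safetensors: bool = True, variant: str = None):
    # Look in the local cache first, mirroring the removed helper's behaviour.
    base_dir = os.path.expanduser(os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen'))
    model_path = os.path.join(base_dir, repo_id, subfolder)
    if not os.path.exists(model_path):
        # Fall back to downloading the full snapshot from the Hugging Face Hub.
        model_path = os.path.join(snapshot_download(repo_id=repo_id), subfolder)
    extension = 'safetensors' if use_safetensors else 'ckpt'
    suffix = '' if variant is None else f'.{variant}'
    config_path = os.path.join(model_path, 'config.yaml')
    ckpt_path = os.path.join(model_path, f'model{suffix}.{extension}')
    return config_path, ckpt_path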