# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import os

import torch
import torch.nn as nn
import yaml

from .attention_blocks import FourierEmbedder, Transformer, CrossAttentionDecoder
from .surface_extractors import MCSurfaceExtractor, SurfaceExtractors
from .volume_decoders import VanillaVolumeDecoder, FlashVDMVolumeDecoding, HierarchicalVolumeDecoding
from ...utils import logger, synchronize_timer, smart_load_model


class VectsetVAE(nn.Module):

    @classmethod
    def from_single_file(
        cls,
        ckpt_path,
        config_path,
        device='cuda',
        dtype=torch.float16,
        use_safetensors=None,
        **kwargs,
    ):
        # Load the model configuration.
        with open(config_path, 'r') as f:
            config = yaml.safe_load(f)

        # Load the checkpoint, optionally from a .safetensors file.
        if use_safetensors:
            ckpt_path = ckpt_path.replace('.ckpt', '.safetensors')
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError(f"Model file {ckpt_path} not found")

        logger.info(f"Loading model from {ckpt_path}")
        if use_safetensors:
            import safetensors.torch
            ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
        else:
            ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)

        # Build the model from the config params (overridable via kwargs) and load the weights.
        model_kwargs = config['params']
        model_kwargs.update(kwargs)

        model = cls(**model_kwargs)
        model.load_state_dict(ckpt)
        model.to(device=device, dtype=dtype)
        return model
    @classmethod
    def from_pretrained(
        cls,
        model_path,
        device='cuda',
        dtype=torch.float16,
        use_safetensors=True,
        variant='fp16',
        subfolder='hunyuan3d-vae-v2-0',
        **kwargs,
    ):
        # Resolve the config and checkpoint paths, then delegate to from_single_file.
        config_path, ckpt_path = smart_load_model(
            model_path,
            subfolder=subfolder,
            use_safetensors=use_safetensors,
            variant=variant
        )
        return cls.from_single_file(
            ckpt_path,
            config_path,
            device=device,
            dtype=dtype,
            use_safetensors=use_safetensors,
            **kwargs
        )
    def __init__(
        self,
        volume_decoder=None,
        surface_extractor=None
    ):
        super().__init__()
        if volume_decoder is None:
            volume_decoder = VanillaVolumeDecoder()
        if surface_extractor is None:
            surface_extractor = MCSurfaceExtractor()
        self.volume_decoder = volume_decoder
        self.surface_extractor = surface_extractor

    def latents2mesh(self, latents: torch.FloatTensor, **kwargs):
        # Decode the latent set to a dense grid of logits, then extract a surface mesh.
        # Note: self.geo_decoder is expected to be defined by subclasses such as ShapeVAE.
        with synchronize_timer('Volume decoding'):
            grid_logits = self.volume_decoder(latents, self.geo_decoder, **kwargs)
        with synchronize_timer('Surface extraction'):
            outputs = self.surface_extractor(grid_logits, **kwargs)
        return outputs
    def enable_flashvdm_decoder(
        self,
        enabled: bool = True,
        adaptive_kv_selection=True,
        topk_mode='mean',
        mc_algo='dmc',
    ):
        if enabled:
            # Swap in the FlashVDM decoding path, either with adaptive top-k KV
            # selection or with plain hierarchical decoding.
            if adaptive_kv_selection:
                self.volume_decoder = FlashVDMVolumeDecoding(topk_mode)
            else:
                self.volume_decoder = HierarchicalVolumeDecoding()
            if mc_algo not in SurfaceExtractors.keys():
                raise ValueError(f'Unsupported mc_algo {mc_algo}, available: {list(SurfaceExtractors.keys())}')
            self.surface_extractor = SurfaceExtractors[mc_algo]()
        else:
            # Restore the default dense decoder and marching-cubes surface extractor.
            self.volume_decoder = VanillaVolumeDecoder()
            self.surface_extractor = MCSurfaceExtractor()
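
# Illustrative note (not in the original source): the FlashVDM path is an opt-in
# replacement for the default volume decoder, toggled on an already-constructed VAE.
# A typical toggle might look like:
#
#     vae.enable_flashvdm_decoder(enabled=True, mc_algo='dmc')   # switch to FlashVDM decoding
#     vae.enable_flashvdm_decoder(enabled=False)                 # restore the defaults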

class ShapeVAE(VectsetVAE):
    def __init__(
        self,
        *,
        num_latents: int,
        embed_dim: int,
        width: int,
        heads: int,
        num_decoder_layers: int,
        geo_decoder_downsample_ratio: int = 1,
        geo_decoder_mlp_expand_ratio: int = 4,
        geo_decoder_ln_post: bool = True,
        num_freqs: int = 8,
        include_pi: bool = True,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        label_type: str = "binary",
        drop_path_rate: float = 0.0,
        scale_factor: float = 1.0,
    ):
        super().__init__()
        self.geo_decoder_ln_post = geo_decoder_ln_post

        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)

        self.post_kl = nn.Linear(embed_dim, width)

        self.transformer = Transformer(
            n_ctx=num_latents,
            width=width,
            layers=num_decoder_layers,
            heads=heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            drop_path_rate=drop_path_rate
        )

        self.geo_decoder = CrossAttentionDecoder(
            fourier_embedder=self.fourier_embedder,
            out_channels=1,
            num_latents=num_latents,
            mlp_expand_ratio=geo_decoder_mlp_expand_ratio,
            downsample_ratio=geo_decoder_downsample_ratio,
            enable_ln_post=self.geo_decoder_ln_post,
            width=width // geo_decoder_downsample_ratio,
            heads=heads // geo_decoder_downsample_ratio,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            label_type=label_type,
        )

        self.scale_factor = scale_factor
        self.latent_shape = (num_latents, embed_dim)

    def forward(self, latents):
        latents = self.post_kl(latents)
        latents = self.transformer(latents)
        return latents
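

# Minimal usage sketch (illustrative, not part of the original module). The repository
# id, the device/dtype choices, and the rescaling of latents by 1 / scale_factor are
# assumptions about how the surrounding pipeline calls this VAE, not guarantees.
if __name__ == '__main__':
    vae = ShapeVAE.from_pretrained('tencent/Hunyuan3D-2')

    # Random latents stand in for the output of the shape diffusion model.
    latents = torch.randn(1, *vae.latent_shape, device='cuda', dtype=torch.float16)
    latents = vae(latents / vae.scale_factor)

    # Decode the latent set to a mesh; extra kwargs are forwarded to the volume
    # decoder and surface extractor.
    mesh = vae.latents2mesh(latents)
    print(mesh)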