Spaces:
Running
on
L4
Running
on
L4
File size: 12,319 Bytes
81ecb2b fb96ff6 81ecb2b 2c17c74 81ecb2b 692682d 81ecb2b 6cf1b17 81ecb2b 6cf1b17 81ecb2b 02e04ed 81ecb2b 6cf1b17 81ecb2b 692682d 81ecb2b 692682d 81ecb2b e1329d4 81ecb2b 0ad853d 81ecb2b 1936a56 81ecb2b 93bf50d 2c17c74 cb029d0 93bf50d 2c17c74 cb029d0 81ecb2b 80596aa 1936a56 81ecb2b ec9a91e cb029d0 81ecb2b 6cf1b17 2c17c74 1936a56 2c17c74 1936a56 cb029d0 1936a56 81ecb2b 93bf50d 81ecb2b fb96ff6 49ab044 fb96ff6 93bf50d a4912eb 93bf50d a4912eb e3760d0 a4912eb 93bf50d ec9a91e 0ad853d 1936a56 81ecb2b 6cf1b17 81ecb2b 0ad853d 81ecb2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
import os
import imageio
import numpy as np
os.system("bash install.sh")
from omegaconf import OmegaConf
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms.functional as TF
import rembg
import gradio as gr
from gradio_litmodel3d import LitModel3D
from dva.io import load_from_config
from dva.ray_marcher import RayMarcher
from dva.visualize import visualize_primvolume, visualize_video_primvolume
from inference import remove_background, resize_foreground, extract_texmesh
from models.diffusion import create_diffusion
from huggingface_hub import hf_hub_download
ckpt_path = hf_hub_download(repo_id="frozenburning/3DTopia-XL", filename="model_sview_dit_fp16.pt")
vae_ckpt_path = hf_hub_download(repo_id="frozenburning/3DTopia-XL", filename="model_vae_fp16.pt")
GRADIO_PRIM_VIDEO_PATH = 'prim.mp4'
GRADIO_RGB_VIDEO_PATH = 'rgb.mp4'
GRADIO_MAT_VIDEO_PATH = 'mat.mp4'
GRADIO_GLB_PATH = 'pbr_mesh.glb'
CONFIG_PATH = "./configs/inference_dit.yml"
config = OmegaConf.load(CONFIG_PATH)
config.checkpoint_path = ckpt_path
config.model.vae_checkpoint_path = vae_ckpt_path
# model
model = load_from_config(config.model.generator)
state_dict = torch.load(config.checkpoint_path, map_location='cpu')
model.load_state_dict(state_dict['ema'])
vae = load_from_config(config.model.vae)
vae_state_dict = torch.load(config.model.vae_checkpoint_path, map_location='cpu')
vae.load_state_dict(vae_state_dict['model_state_dict'])
conditioner = load_from_config(config.model.conditioner)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vae = vae.to(device)
conditioner = conditioner.to(device)
model = model.to(device)
model.eval()
amp = True
precision_dtype = torch.float16
rm = RayMarcher(
256,
256,
**config.rm,
).to(device)
perchannel_norm = False
if "latent_mean" in config.model:
latent_mean = torch.Tensor(config.model.latent_mean)[None, None, :].to(device)
latent_std = torch.Tensor(config.model.latent_std)[None, None, :].to(device)
assert latent_mean.shape[-1] == config.model.generator.in_channels
perchannel_norm = True
latent_nf = config.model.latent_nf
config.diffusion.pop("timestep_respacing")
config.model.pop("vae")
config.model.pop("vae_checkpoint_path")
config.model.pop("conditioner")
config.model.pop("generator")
config.model.pop("latent_nf")
config.model.pop("latent_mean")
config.model.pop("latent_std")
model_primx = load_from_config(config.model)
# load rembg
rembg_session = rembg.new_session()
# background removal function
def background_remove_process(input_image):
input_image = remove_background(input_image, rembg_session)
input_image = resize_foreground(input_image, 0.85)
input_cond_preview_pil = input_image
raw_image = np.array(input_image)
mask = (raw_image[..., -1][..., None] > 0) * 1
raw_image = raw_image[..., :3] * mask
input_cond = torch.from_numpy(np.array(raw_image)[None, ...]).to(device)
return gr.update(interactive=True), input_cond, input_cond_preview_pil
# process function
def process(input_cond, input_num_steps, input_seed=42, input_cfg=6.0):
# seed
torch.manual_seed(input_seed)
os.makedirs(config.output_dir, exist_ok=True)
output_rgb_video_path = os.path.join(config.output_dir, GRADIO_RGB_VIDEO_PATH)
output_prim_video_path = os.path.join(config.output_dir, GRADIO_PRIM_VIDEO_PATH)
output_mat_video_path = os.path.join(config.output_dir, GRADIO_MAT_VIDEO_PATH)
respacing = "ddim{}".format(input_num_steps)
diffusion = create_diffusion(timestep_respacing=respacing, **config.diffusion)
sample_fn = diffusion.ddim_sample_loop_progressive
fwd_fn = model.forward_with_cfg
# text-conditioned
if input_cond is None:
raise NotImplementedError
with torch.no_grad():
latent = torch.randn(1, config.model.num_prims, 1, 4, 4, 4)
batch = {}
inf_bs = 1
inf_x = torch.randn(inf_bs, config.model.num_prims, 68).to(device)
y = conditioner.encoder(input_cond)
model_kwargs = dict(y=y[:inf_bs, ...], precision_dtype=precision_dtype, enable_amp=amp)
if input_cfg >= 0:
model_kwargs['cfg_scale'] = input_cfg
for samples in sample_fn(fwd_fn, inf_x.shape, inf_x, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device):
final_samples = samples
recon_param = final_samples["sample"].reshape(inf_bs, config.model.num_prims, -1)
if perchannel_norm:
recon_param = recon_param / latent_nf * latent_std + latent_mean
recon_srt_param = recon_param[:, :, 0:4]
recon_feat_param = recon_param[:, :, 4:] # [8, 2048, 64]
recon_feat_param_list = []
# one-by-one to avoid oom
for inf_bidx in range(inf_bs):
if not perchannel_norm:
decoded = vae.decode(recon_feat_param[inf_bidx, ...].reshape(1*config.model.num_prims, *latent.shape[-4:]) / latent_nf)
else:
decoded = vae.decode(recon_feat_param[inf_bidx, ...].reshape(1*config.model.num_prims, *latent.shape[-4:]))
recon_feat_param_list.append(decoded.detach())
recon_feat_param = torch.concat(recon_feat_param_list, dim=0)
# invert normalization
if not perchannel_norm:
recon_srt_param[:, :, 0:1] = (recon_srt_param[:, :, 0:1] / 10) + 0.05
recon_feat_param[:, 0:1, ...] /= 5.
recon_feat_param[:, 1:, ...] = (recon_feat_param[:, 1:, ...] + 1) / 2.
recon_feat_param = recon_feat_param.reshape(inf_bs, config.model.num_prims, -1)
recon_param = torch.concat([recon_srt_param, recon_feat_param], dim=-1)
visualize_video_primvolume(config.output_dir, batch, recon_param, 15, rm, device)
prim_params = {'srt_param': recon_srt_param[0].detach().cpu(), 'feat_param': recon_feat_param[0].detach().cpu()}
return output_rgb_video_path, output_prim_video_path, output_mat_video_path, prim_params
def export_mesh(prim_params, uv_unwrap="Faster", remesh="No", mc_resolution=256):
# exporting GLB mesh
output_glb_path = os.path.join(config.output_dir, GRADIO_GLB_PATH)
if remesh == "No":
config.inference.remesh = False
elif remesh == "Yes":
config.inference.remesh = True
if uv_unwrap == "Faster":
config.inference.fast_unwrap = True
elif uv_unwrap == "Better":
config.inference.fast_unwrap = False
config.inference.mc_resolution = mc_resolution
config.inference.batch_size = 8192
model_primx.load_state_dict(prim_params)
model_primx.to(device)
model_primx.eval()
with torch.no_grad():
model_primx.srt_param[:, 1:4] *= 0.85
extract_texmesh(config.inference, model_primx, config.output_dir, device)
return output_glb_path, gr.update(visible=True), gr.update(interactive=True), gr.update(value="assets/hdri/metro_noord_1k.hdr")
# gradio UI
_TITLE = '''3DTopia-XL'''
_DESCRIPTION = '''
<div>
<a style="display:inline-block" href="https://frozenburning.github.io/projects/3DTopia-XL/"><img src='https://img.shields.io/badge/public_website-8A2BE2'></a>
<a style="display:inline-block; margin-left: .5em" href="https://github.com/3DTopia/3DTopia-XL"><img src='https://img.shields.io/github/stars/3DTopia/3DTopia-XL?style=social'/></a>
</div>
* Now we offer 1) single image conditioned model, we will release 2) multiview images conditioned model and 3) pure text conditioned model in the future!
* If you find the output unsatisfying, try using different seeds!
'''
block = gr.Blocks(title=_TITLE).queue()
with block:
current_fg_state = gr.State()
prim_param_state = gr.State()
with gr.Row():
with gr.Column(scale=1):
gr.Markdown('# ' + _TITLE)
gr.Markdown(_DESCRIPTION)
with gr.Row(variant='panel'):
with gr.Column(scale=1):
with gr.Row():
# input image
input_image = gr.Image(label="image", type='pil')
# background removal
removal_previewer = gr.Image(label="Background Removal Preview", type='pil', interactive=False)
with gr.Row():
# inference steps
input_num_steps = gr.Radio(choices=[25, 50, 100, 200], label="DDIM steps", value=25, info="Larger for robustness but slower.")
# random seed
input_cfg = gr.Slider(label="CFG scale", minimum=0, maximum=15, step=0.5, value=6, info="Typically CFG in a range of 4-7")
# random seed
input_seed = gr.Slider(label="random seed", minimum=0, maximum=10000, step=1, value=42, info="Try different seed if the result is not satisfying as this is a generative model!")
with gr.Row():
input_mc_resolution = gr.Radio(choices=[128, 256], label="MC Resolution", value=128, info="Cube resolution for mesh extraction. Larger for better quality but slower.")
input_remesh = gr.Radio(choices=["No", "Yes"], label="Remesh", value="No", info="Remesh or not?")
input_unwrap = gr.Radio(choices=["Faster", "Better"], label="UV Unwrap", value="Better", info="UV unwrapping algorithm. Trade-off between quality and speed.")
# gen button
with gr.Row():
button_gen = gr.Button(value="Generate", interactive=False)
export_glb_btn = gr.Button(value="Export Current GLB", interactive=False)
with gr.Column(scale=1):
with gr.Row():
# final video results
output_rgb_video = gr.Video(label="RGB")
output_prim_video = gr.Video(label="Primitives")
output_mat_video = gr.Video(label="Material")
with gr.Row():
# glb file
output_glb = LitModel3D(
label="3D GLB Model",
visible=True,
clear_color=[0.0, 0.0, 0.0, 0.0],
camera_position=(90, None, None),
tonemapping="aces",
contrast=1.0,
scale=1.0,
)
with gr.Column(visible=False, scale=1.0) as hdr_row:
gr.Markdown("""## HDR Environment Map
Select / Upload an HDR environment map to relight the 3D model.
""")
with gr.Row():
example_hdris = [
os.path.join("assets/hdri", f)
for f in os.listdir("assets/hdri")
]
hdr_illumination_file = gr.File(
label="HDR Envmap", file_types=[".hdr"], file_count="single"
)
hdr_illumination_example = gr.Examples(
examples=example_hdris,
inputs=hdr_illumination_file,
)
hdr_illumination_file.change(
lambda x: gr.update(env_map=x.name if x is not None else None),
inputs=hdr_illumination_file,
outputs=[output_glb],
)
input_image.change(background_remove_process, inputs=[input_image], outputs=[button_gen, current_fg_state, removal_previewer])
button_gen.click(process, inputs=[current_fg_state, input_num_steps, input_seed, input_cfg], outputs=[output_rgb_video, output_prim_video, output_mat_video, prim_param_state])
prim_param_state.change(export_mesh, inputs=[prim_param_state, input_unwrap, input_remesh, input_mc_resolution], outputs=[output_glb, hdr_row, export_glb_btn, hdr_illumination_file])
export_glb_btn.click(export_mesh, inputs=[prim_param_state, input_unwrap, input_remesh, input_mc_resolution], outputs=[output_glb, hdr_row, export_glb_btn, hdr_illumination_file])
gr.Examples(
examples=[
os.path.join("assets/examples", f)
for f in os.listdir("assets/examples")
],
inputs=[input_image],
outputs=[output_rgb_video, output_prim_video, output_mat_video, prim_param_state],
fn=lambda x: process(input_image=x),
cache_examples=False,
label='Single Image to 3D PBR Asset'
)
block.launch(server_name="0.0.0.0", share=True) |