import spaces
import os
import gradio as gr
import numpy as np
import torch
from PIL import Image
import trimesh
import random
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
from huggingface_hub import hf_hub_download, snapshot_download
import subprocess
import shutil
# Install extra dependencies that are not listed in the Space's requirements
subprocess.run("pip install spandrel==0.4.1 --no-deps", shell=True, check=True)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16
print("DEVICE: ", DEVICE)
DEFAULT_FACE_NUMBER = 100000
MAX_SEED = np.iinfo(np.int32).max
TRIPOSG_REPO_URL = "https://github.com/VAST-AI-Research/TripoSG.git"
MV_ADAPTER_REPO_URL = "https://github.com/huanngzh/MV-Adapter.git"
RMBG_PRETRAINED_MODEL = "checkpoints/RMBG-1.4"
TRIPOSG_PRETRAINED_MODEL = "checkpoints/TripoSG"
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
os.makedirs(TMP_DIR, exist_ok=True)
TRIPOSG_CODE_DIR = "./triposg"
if not os.path.exists(TRIPOSG_CODE_DIR):
os.system(f"git clone {TRIPOSG_REPO_URL} {TRIPOSG_CODE_DIR}")
MV_ADAPTER_CODE_DIR = "./mv_adapter"
if not os.path.exists(MV_ADAPTER_CODE_DIR):
os.system(f"git clone {MV_ADAPTER_REPO_URL} {MV_ADAPTER_CODE_DIR}")
import sys
sys.path.append(TRIPOSG_CODE_DIR)
sys.path.append(os.path.join(TRIPOSG_CODE_DIR, "scripts"))
sys.path.append(MV_ADAPTER_CODE_DIR)
sys.path.append(os.path.join(MV_ADAPTER_CODE_DIR, "scripts"))
HEADER = """
# 🔮 Image to 3D with [TripoSG](https://github.com/VAST-AI-Research/TripoSG)
## State-of-the-art Open Source 3D Generation Using Large-Scale Rectified Flow Transformers
<p style="font-size: 1.1em;">By <a href="https://www.tripo3d.ai/" style="color: #1E90FF; text-decoration: none; font-weight: bold;">Tripo</a></p>
## 📋 Quick Start Guide:
1. **Upload an image** (single object works best)
2. Click **Generate Shape** to create the 3D mesh
3. Click **Apply Texture** to add textures
4. Use **Download GLB** to save your 3D model
5. Adjust parameters under **Generation Settings** for fine-tuning
Best results come from clean, well-lit images with clear subject isolation. Try it now!
<p style="font-size: 0.9em; margin-top: 10px;">Texture generation powered by <a href="https://github.com/huanngzh/MV-Adapter" style="color: #1E90FF; text-decoration: none;">MV-Adapter</a> - a versatile multi-view adapter for consistent texture generation. Try the <a href="https://huggingface.co/spaces/VAST-AI/MV-Adapter-I2MV-SDXL" style="color: #1E90FF; text-decoration: none;">MV-Adapter demo</a> for multi-view image generation.</p>
"""
# TripoSG: segmentation (RMBG) and shape-generation pipelines
from image_process import prepare_image
from briarmbg import BriaRMBG
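# Download RMBG-1.4 weights and load the background-removal network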
snapshot_download("briaai/RMBG-1.4", local_dir=RMBG_PRETRAINED_MODEL)
rmbg_net = BriaRMBG.from_pretrained(RMBG_PRETRAINED_MODEL).to(DEVICE)
rmbg_net.eval()
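# Download TripoSG weights and load the image-to-shape pipeline (fp16)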
from triposg.pipelines.pipeline_triposg import TripoSGPipeline
snapshot_download("VAST-AI/TripoSG", local_dir=TRIPOSG_PRETRAINED_MODEL)
triposg_pipe = TripoSGPipeline.from_pretrained(TRIPOSG_PRETRAINED_MODEL).to(DEVICE, DTYPE)
# MV-Adapter: multi-view image generation used for texturing
NUM_VIEWS = 6
from inference_ig2mv_sdxl import prepare_pipeline, preprocess_image, remove_bg
from mvadapter.utils import get_orthogonal_camera, tensor_to_image, make_image_grid
from mvadapter.utils.render import NVDiffRastContextWrapper, load_mesh, render
mv_adapter_pipe = prepare_pipeline(
base_model="stabilityai/stable-diffusion-xl-base-1.0",
vae_model="madebyollin/sdxl-vae-fp16-fix",
unet_model=None,
lora_model=None,
adapter_path="huanngzh/mv-adapter",
scheduler=None,
num_views=NUM_VIEWS,
device=DEVICE,
dtype=torch.float16,
)
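# BiRefNet handles background removal for the texture reference image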
birefnet = AutoModelForImageSegmentation.from_pretrained(
"ZhengPeng7/BiRefNet", trust_remote_code=True
)
birefnet.to(DEVICE)
transform_image = transforms.Compose(
[
transforms.Resize((1024, 1024)),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
remove_bg_fn = lambda x: remove_bg(x, birefnet, transform_image, DEVICE)
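# Checkpoints used by the texture pipeline: Real-ESRGAN for view upscaling,
# big-lama for inpainting unseen texture regions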
if not os.path.exists("checkpoints/RealESRGAN_x2plus.pth"):
hf_hub_download("dtarnow/UPscaler", filename="RealESRGAN_x2plus.pth", local_dir="checkpoints")
if not os.path.exists("checkpoints/big-lama.pt"):
subprocess.run("wget -P checkpoints/ https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt", shell=True, check=True)
def start_session(req: gr.Request):
save_dir = os.path.join(TMP_DIR, str(req.session_hash))
os.makedirs(save_dir, exist_ok=True)
print("start session, mkdir", save_dir)
def end_session(req: gr.Request):
    save_dir = os.path.join(TMP_DIR, str(req.session_hash))
    # ignore_errors guards against double-unload or a dir that was never created
    shutil.rmtree(save_dir, ignore_errors=True)
def get_random_hex():
random_bytes = os.urandom(8)
random_hex = random_bytes.hex()
return random_hex
def get_random_seed(randomize_seed, seed):
if randomize_seed:
seed = random.randint(0, MAX_SEED)
return seed
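# End-to-end path (segmentation -> shape -> texture) with fixed settings,
# used to pre-compute the cached examples below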
@spaces.GPU(duration=180)
def run_full(image: str, req: gr.Request):
seed = 0
num_inference_steps = 50
guidance_scale = 7.5
simplify = True
target_face_num = DEFAULT_FACE_NUMBER
image_seg = prepare_image(image, bg_color=np.array([1.0, 1.0, 1.0]), rmbg_net=rmbg_net)
outputs = triposg_pipe(
image=image_seg,
generator=torch.Generator(device=triposg_pipe.device).manual_seed(seed),
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale
).samples[0]
print("mesh extraction done")
mesh = trimesh.Trimesh(outputs[0].astype(np.float32), np.ascontiguousarray(outputs[1]))
if simplify:
print("start simplify")
from utils import simplify_mesh
mesh = simplify_mesh(mesh, target_face_num)
save_dir = os.path.join(TMP_DIR, "examples")
os.makedirs(save_dir, exist_ok=True)
mesh_path = os.path.join(save_dir, f"triposg_{get_random_hex()}.glb")
mesh.export(mesh_path)
print("save to ", mesh_path)
torch.cuda.empty_cache()
height, width = 768, 768
# Prepare cameras
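    # Six orthogonal views: four side views at 0/90/180/270 deg azimuth plus
    # near-top/near-bottom views (+/-89.99 deg, presumably to avoid a
    # degenerate up vector at exactly +/-90 deg)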
cameras = get_orthogonal_camera(
elevation_deg=[0, 0, 0, 0, 89.99, -89.99],
distance=[1.8] * NUM_VIEWS,
left=-0.55,
right=0.55,
bottom=-0.55,
top=0.55,
azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
device=DEVICE,
)
ctx = NVDiffRastContextWrapper(device=DEVICE, context_type="cuda")
mesh = load_mesh(mesh_path, rescale=True, device=DEVICE)
render_out = render(
ctx,
mesh,
cameras,
height=height,
width=width,
render_attr=False,
normal_background=0.0,
)
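    # Concatenate rendered position and normal maps channel-wise as the
    # geometric control signal, then convert NHWC -> NCHW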
control_images = (
torch.cat(
[
(render_out.pos + 0.5).clamp(0, 1),
(render_out.normal / 2 + 0.5).clamp(0, 1),
],
dim=-1,
)
.permute(0, 3, 1, 2)
.to(DEVICE)
)
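    # Reference image for texturing: load, strip background, resize to render size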
image = Image.open(image)
image = remove_bg_fn(image)
image = preprocess_image(image, height, width)
pipe_kwargs = {}
if seed != -1 and isinstance(seed, int):
pipe_kwargs["generator"] = torch.Generator(device=DEVICE).manual_seed(seed)
images = mv_adapter_pipe(
"high quality",
height=height,
width=width,
num_inference_steps=15,
guidance_scale=3.0,
num_images_per_prompt=NUM_VIEWS,
control_image=control_images,
control_conditioning_scale=1.0,
reference_image=image,
reference_conditioning_scale=1.0,
negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
cross_attention_kwargs={"scale": 1.0},
**pipe_kwargs,
).images
torch.cuda.empty_cache()
mv_image_path = os.path.join(save_dir, f"mv_adapter_{get_random_hex()}.png")
make_image_grid(images, rows=1).save(mv_image_path)
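    # TexturePipeline / ModProcessConfig are imported lazily from the cloned
    # repos' script dirs added to sys.path above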
from texture import TexturePipeline, ModProcessConfig
texture_pipe = TexturePipeline(
upscaler_ckpt_path="checkpoints/RealESRGAN_x2plus.pth",
inpaint_ckpt_path="checkpoints/big-lama.pt",
device=DEVICE,
)
textured_glb_path = texture_pipe(
mesh_path=mesh_path,
save_dir=save_dir,
save_name=f"texture_mesh_{get_random_hex()}.glb",
uv_unwarp=True,
uv_size=4096,
rgb_path=mv_image_path,
rgb_process_config=ModProcessConfig(view_upscale=True, inpaint_mode="view"),
camera_azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
)
return image_seg, mesh_path, textured_glb_path
@spaces.GPU()
@torch.no_grad()
def run_segmentation(image: str):
image = prepare_image(image, bg_color=np.array([1.0, 1.0, 1.0]), rmbg_net=rmbg_net)
return image
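# Shape generation from the segmented image; exports a GLB into the session dir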
@spaces.GPU(duration=90)
@torch.no_grad()
def image_to_3d(
image: Image.Image,
seed: int,
num_inference_steps: int,
guidance_scale: float,
simplify: bool,
target_face_num: int,
req: gr.Request
):
outputs = triposg_pipe(
image=image,
generator=torch.Generator(device=triposg_pipe.device).manual_seed(seed),
num_inference_steps=num_inference_steps,
guidance_scale=guidance_scale
).samples[0]
print("mesh extraction done")
mesh = trimesh.Trimesh(outputs[0].astype(np.float32), np.ascontiguousarray(outputs[1]))
if simplify:
print("start simplify")
from utils import simplify_mesh
mesh = simplify_mesh(mesh, target_face_num)
save_dir = os.path.join(TMP_DIR, str(req.session_hash))
mesh_path = os.path.join(save_dir, f"triposg_{get_random_hex()}.glb")
mesh.export(mesh_path)
print("save to ", mesh_path)
torch.cuda.empty_cache()
return mesh_path
@spaces.GPU(duration=120)
@torch.no_grad()
def run_texture(image: str, mesh_path: str, seed: int, req: gr.Request):
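    # Mirrors the texturing stage of run_full, but for a user-supplied mesh;
    # `image` is the input image's filepath (opened below)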
height, width = 768, 768
# Prepare cameras
cameras = get_orthogonal_camera(
elevation_deg=[0, 0, 0, 0, 89.99, -89.99],
distance=[1.8] * NUM_VIEWS,
left=-0.55,
right=0.55,
bottom=-0.55,
top=0.55,
azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
device=DEVICE,
)
ctx = NVDiffRastContextWrapper(device=DEVICE, context_type="cuda")
mesh = load_mesh(mesh_path, rescale=True, device=DEVICE)
render_out = render(
ctx,
mesh,
cameras,
height=height,
width=width,
render_attr=False,
normal_background=0.0,
)
control_images = (
torch.cat(
[
(render_out.pos + 0.5).clamp(0, 1),
(render_out.normal / 2 + 0.5).clamp(0, 1),
],
dim=-1,
)
.permute(0, 3, 1, 2)
.to(DEVICE)
)
image = Image.open(image)
image = remove_bg_fn(image)
image = preprocess_image(image, height, width)
pipe_kwargs = {}
if seed != -1 and isinstance(seed, int):
pipe_kwargs["generator"] = torch.Generator(device=DEVICE).manual_seed(seed)
images = mv_adapter_pipe(
"high quality",
height=height,
width=width,
num_inference_steps=15,
guidance_scale=3.0,
num_images_per_prompt=NUM_VIEWS,
control_image=control_images,
control_conditioning_scale=1.0,
reference_image=image,
reference_conditioning_scale=1.0,
negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
cross_attention_kwargs={"scale": 1.0},
**pipe_kwargs,
).images
torch.cuda.empty_cache()
save_dir = os.path.join(TMP_DIR, str(req.session_hash))
mv_image_path = os.path.join(save_dir, f"mv_adapter_{get_random_hex()}.png")
make_image_grid(images, rows=1).save(mv_image_path)
from texture import TexturePipeline, ModProcessConfig
texture_pipe = TexturePipeline(
upscaler_ckpt_path="checkpoints/RealESRGAN_x2plus.pth",
inpaint_ckpt_path="checkpoints/big-lama.pt",
device=DEVICE,
)
textured_glb_path = texture_pipe(
mesh_path=mesh_path,
save_dir=save_dir,
save_name=f"texture_mesh_{get_random_hex()}.glb",
uv_unwarp=True,
uv_size=4096,
rgb_path=mv_image_path,
rgb_process_config=ModProcessConfig(view_upscale=True, inpaint_mode="view"),
camera_azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]],
)
return textured_glb_path
with gr.Blocks(title="TripoSG") as demo:
gr.Markdown(HEADER)
with gr.Row():
with gr.Column():
with gr.Row():
image_prompts = gr.Image(label="Input Image", type="filepath")
seg_image = gr.Image(
label="Segmentation Result", type="pil", format="png", interactive=False
)
with gr.Accordion("Generation Settings", open=True):
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
                    step=1,
value=0
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
num_inference_steps = gr.Slider(
label="Number of inference steps",
minimum=8,
maximum=50,
step=1,
value=50,
)
guidance_scale = gr.Slider(
label="CFG scale",
minimum=0.0,
maximum=20.0,
step=0.1,
value=7.0,
)
with gr.Row():
reduce_face = gr.Checkbox(label="Simplify Mesh", value=True)
                target_face_num = gr.Slider(minimum=10000, maximum=1000000, value=DEFAULT_FACE_NUMBER, label="Target Face Number")
gen_button = gr.Button("Generate Shape", variant="primary")
gen_texture_button = gr.Button("Apply Texture", interactive=False)
with gr.Column():
model_output = gr.Model3D(label="Generated GLB", interactive=False)
textured_model_output = gr.Model3D(label="Textured GLB", interactive=False)
with gr.Row():
examples = gr.Examples(
examples=[
f"{TRIPOSG_CODE_DIR}/assets/example_data/{image}"
for image in os.listdir(f"{TRIPOSG_CODE_DIR}/assets/example_data")
],
fn=run_full,
inputs=[image_prompts],
outputs=[seg_image, model_output, textured_model_output],
cache_examples=True,
)
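    # Event chain: segmentation -> resolve seed -> shape generation -> enable texturing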
gen_button.click(
run_segmentation,
inputs=[image_prompts],
outputs=[seg_image]
).then(
get_random_seed,
inputs=[randomize_seed, seed],
outputs=[seed],
).then(
image_to_3d,
inputs=[
seg_image,
seed,
num_inference_steps,
guidance_scale,
reduce_face,
target_face_num
],
outputs=[model_output]
).then(lambda: gr.Button(interactive=True), outputs=[gen_texture_button])
gen_texture_button.click(
run_texture,
inputs=[image_prompts, model_output, seed],
outputs=[textured_model_output]
)
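    # Per-session temp dirs: created on page load, removed on unload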
demo.load(start_session)
demo.unload(end_session)
demo.launch()