Spaces:

3DAIGC
/

LHM

Running on Zero

App Files Files Community

LHM / app.py

DyrusQZ

fix app.py

8485652 2 months ago

raw

history blame contribute delete

33.4 kB

	# Copyright (c) 2023-2024, Qi Zuo
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	import os
	# Environment bootstrap: each call below shells out at import time, before the
	# third-party imports that follow. Order matters, so keep this sequence intact.
	# Remove the /data-nvme/zerogpu-offload/ directory (recursively, no prompt).
	os.system("rm -rf /data-nvme/zerogpu-offload/")
	os.system("pip install chumpy")
	# Swap the installed basicsr for the fork hosted at hitsz-zuoqi/BasicSR.
	os.system("pip uninstall -y basicsr")
	os.system("pip install git+https://github.com/hitsz-zuoqi/BasicSR/")
	# Pin numpy to 1.23.0.
	os.system("pip install numpy==1.23.0")
	# Force-reinstall the bundled cp310 / linux_x86_64 wheels shipped in this repo.
	os.system("pip install ./wheels_new/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall")
	os.system("pip install ./wheels/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall")
	# Install pytorch3d only from the given wheel index (py310 / cu121 / pyt240).
	os.system("pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt240/download.html")
	import cv2
	import time
	from PIL import Image
	import numpy as np
	import gradio as gr
	import base64
	import spaces
	import torch
	torch._dynamo.config.disable = True
	import subprocess
	import os
	import argparse
	from omegaconf import OmegaConf
	from rembg import remove
	from engine.pose_estimation.pose_estimator import PoseEstimator
	from LHM.utils.face_detector import VGGHeadDetector
	from LHM.utils.hf_hub import wrap_model_hub
	from LHM.runners.infer.utils import (
	calc_new_tgt_size_by_aspect,
	center_crop_according_to_mask,
	prepare_motion_seqs,
	resize_image_keepaspect_np,
	)
	from LHM.utils.ffmpeg_utils import images_to_video
	from engine.SegmentAPI.base import Bbox

	def get_bbox(mask):
	    """Return the foreground bounding box of ``mask``, enlarged by a factor of 1.1.

	    Args:
	        mask: 2-D array (height, width). A pixel counts as foreground when
	            ``mask / 255.0 >= 0.5``.

	    Returns:
	        The ``Bbox`` built from ``[x_min, y_min, x_max, y_max]`` of the
	        foreground pixels, after ``Bbox.scale(1.1, width=..., height=...)``.
	        (Presumably ``scale`` clamps to the image size -- confirm in
	        engine.SegmentAPI.base.)

	    Raises:
	        ValueError: if the mask contains no foreground pixel.
	    """
	    height, width = mask.shape

	    # Binarise on a fresh array so the caller's mask is left untouched.
	    foreground = (mask / 255.0) >= 0.5
	    rows, cols = np.where(foreground)

	    # min()/max() on an empty selection would fail with an opaque numpy
	    # reduction error; report the real cause instead (same exception type).
	    if rows.size == 0:
	        raise ValueError("mask contains no foreground pixels; cannot compute a bounding box")

	    whwh = [
	        cols.min().item(),
	        rows.min().item(),
	        cols.max().item(),
	        rows.max().item(),
	    ]
	    box = Bbox(whwh)

	    # enlarge the box by 10% (scale factor 1.1)
	    return box.scale(1.1, width=width, height=height)

	# def infer_preprocess_image(
	# rgb_path,
	# mask,
	# intr,
	# pad_ratio,
	# bg_color,
	# max_tgt_size,
	# aspect_standard,
	# enlarge_ratio,
	# render_tgt_size,
	# multiply,
	# need_mask=True,
	# ):
	# """inferece
	# image, _, _ = preprocess_image(image_path, mask_path=None, intr=None, pad_ratio=0, bg_color=1.0,
	# max_tgt_size=896, aspect_standard=aspect_standard, enlarge_ratio=[1.0, 1.0],
	# render_tgt_size=source_size, multiply=14, need_mask=True)

	# """

	# rgb = np.array(Image.open(rgb_path))
	# rgb_raw = rgb.copy()

	# bbox = get_bbox(mask)
	# bbox_list = bbox.get_box()

	# rgb = rgb[bbox_list[1] : bbox_list[3], bbox_list[0] : bbox_list[2]]
	# mask = mask[bbox_list[1] : bbox_list[3], bbox_list[0] : bbox_list[2]]

	# h, w, _ = rgb.shape
	# assert w < h
	# cur_ratio = h / w
	# scale_ratio = cur_ratio / aspect_standard

	# target_w = int(min(w * scale_ratio, h))
	# offset_w = (target_w - w) // 2
	# # resize to target ratio.
	# if offset_w > 0:
	# rgb = np.pad(
	# rgb,
	# ((0, 0), (offset_w, offset_w), (0, 0)),
	# mode="constant",
	# constant_values=255,
	# )
	# mask = np.pad(
	# mask,
	# ((0, 0), (offset_w, offset_w)),
	# mode="constant",
	# constant_values=0,
	# )
	# else:
	# offset_w = -offset_w
	# rgb = rgb[:,offset_w:-offset_w,:]
	# mask = mask[:,offset_w:-offset_w]

	# # resize to target ratio.

	# rgb = np.pad(
	# rgb,
	# ((0, 0), (offset_w, offset_w), (0, 0)),
	# mode="constant",
	# constant_values=255,
	# )

	# mask = np.pad(
	# mask,
	# ((0, 0), (offset_w, offset_w)),
	# mode="constant",
	# constant_values=0,
	# )

	# rgb = rgb / 255.0 # normalize to [0, 1]
	# mask = mask / 255.0

	# mask = (mask > 0.5).astype(np.float32)
	# rgb = rgb[:, :, :3] * mask[:, :, None] + bg_color * (1 - mask[:, :, None])

	# # resize to specific size require by preprocessor of smplx-estimator.
	# rgb = resize_image_keepaspect_np(rgb, max_tgt_size)
	# mask = resize_image_keepaspect_np(mask, max_tgt_size)

	# # crop image to enlarge human area.
	# rgb, mask, offset_x, offset_y = center_crop_according_to_mask(
	# rgb, mask, aspect_standard, enlarge_ratio
	# )
	# if intr is not None:
	# intr[0, 2] -= offset_x
	# intr[1, 2] -= offset_y

	# # resize to render_tgt_size for training

	# tgt_hw_size, ratio_y, ratio_x = calc_new_tgt_size_by_aspect(
	# cur_hw=rgb.shape[:2],
	# aspect_standard=aspect_standard,
	# tgt_size=render_tgt_size,
	# multiply=multiply,
	# )

	# rgb = cv2.resize(
	# rgb, dsize=(tgt_hw_size[1], tgt_hw_size[0]), interpolation=cv2.INTER_AREA
	# )
	# mask = cv2.resize(
	# mask, dsize=(tgt_hw_size[1], tgt_hw_size[0]), interpolation=cv2.INTER_AREA
	# )

	# if intr is not None:

	# # ****************** Merge ********************* #
	# intr = scale_intrs(intr, ratio_x=ratio_x, ratio_y=ratio_y)
	# assert (
	# abs(intr[0, 2] * 2 - rgb.shape[1]) < 2.5
	# ), f"{intr[0, 2] * 2}, {rgb.shape[1]}"
	# assert (
	# abs(intr[1, 2] * 2 - rgb.shape[0]) < 2.5
	# ), f"{intr[1, 2] * 2}, {rgb.shape[0]}"

	# # ****************** Merge ********************* #
	# intr[0, 2] = rgb.shape[1] // 2
	# intr[1, 2] = rgb.shape[0] // 2

	# rgb = torch.from_numpy(rgb).float().permute(2, 0, 1).unsqueeze(0) # [1, 3, H, W]
	# mask = (
	# torch.from_numpy(mask[:, :, None]).float().permute(2, 0, 1).unsqueeze(0)
	# ) # [1, 1, H, W]
	# return rgb, mask, intr

	def infer_preprocess_image(
	    rgb_path,
	    mask,
	    intr,
	    pad_ratio,
	    bg_color,
	    max_tgt_size,
	    aspect_standard,
	    enlarge_ratio,
	    render_tgt_size,
	    multiply,
	    need_mask=True,
	):
	    """Crop, pad, composite and resize a reference image for inference.

	    Typical call::

	        image, _, _ = infer_preprocess_image(
	            image_path, mask=mask, intr=None, pad_ratio=0, bg_color=1.0,
	            max_tgt_size=896, aspect_standard=aspect_standard,
	            enlarge_ratio=[1.0, 1.0], render_tgt_size=source_size,
	            multiply=14, need_mask=True)

	    Args:
	        rgb_path: path of the image file to load.
	        mask: 2-D foreground mask in the 0-255 range, same height/width as
	            the image.
	        intr: optional camera intrinsics; when given, its principal point
	            entries ``[0, 2]`` / ``[1, 2]`` are updated in place.
	        pad_ratio: accepted for signature compatibility; not used here.
	        bg_color: value blended into background pixels (image is in [0, 1]).
	        max_tgt_size: forwarded to ``resize_image_keepaspect_np``.
	        aspect_standard: target height / width ratio.
	        enlarge_ratio: forwarded to ``center_crop_according_to_mask``.
	        render_tgt_size: forwarded to ``calc_new_tgt_size_by_aspect``.
	        multiply: forwarded to ``calc_new_tgt_size_by_aspect``.
	        need_mask: accepted for signature compatibility; not used here.

	    Returns:
	        ``(rgb, mask, intr)`` where ``rgb`` is a float tensor [1, 3, H, W],
	        ``mask`` is a float tensor [1, 1, H, W] and ``intr`` is the adjusted
	        intrinsics (or ``None``).

	    Raises:
	        AssertionError: if the cropped person region is not taller than wide.
	    """
	    # Load eagerly and close the file handle right away.
	    with Image.open(rgb_path) as img:
	        rgb = np.array(img)

	    bbox_list = get_bbox(mask).get_box()
	    top, bottom = bbox_list[1], bbox_list[3]
	    left, right = bbox_list[0], bbox_list[2]
	    rgb = rgb[top:bottom, left:right]
	    mask = mask[top:bottom, left:right]

	    h, w, _ = rgb.shape
	    assert w < h, f"expected a portrait crop (w < h), got w={w}, h={h}"
	    cur_ratio = h / w
	    scale_ratio = cur_ratio / aspect_standard

	    target_w = int(min(w * scale_ratio, h))
	    if target_w - w > 0:
	        # Crop is too narrow for the target aspect: pad left and right.
	        offset_w = (target_w - w) // 2
	        pad_hw = ((0, 0), (offset_w, offset_w))
	    else:
	        # Crop is too short for the target aspect: pad the top only.
	        target_h = w * aspect_standard
	        # int() truncation of target_w can land here with target_h slightly
	        # below h; clamp so np.pad never receives a negative width.
	        offset_h = max(int(target_h - h), 0)
	        pad_hw = ((offset_h, 0), (0, 0))

	    rgb = np.pad(rgb, pad_hw + ((0, 0),), mode="constant", constant_values=255)
	    mask = np.pad(mask, pad_hw, mode="constant", constant_values=0)

	    rgb = rgb / 255.0  # normalize to [0, 1]
	    mask = mask / 255.0

	    mask = (mask > 0.5).astype(np.float32)
	    # Composite the foreground over a flat bg_color background (drops alpha).
	    rgb = rgb[:, :, :3] * mask[:, :, None] + bg_color * (1 - mask[:, :, None])

	    # resize to the size required by the preprocessor of the smplx-estimator.
	    rgb = resize_image_keepaspect_np(rgb, max_tgt_size)
	    mask = resize_image_keepaspect_np(mask, max_tgt_size)

	    # crop image to enlarge human area.
	    rgb, mask, offset_x, offset_y = center_crop_according_to_mask(
	        rgb, mask, aspect_standard, enlarge_ratio
	    )
	    if intr is not None:
	        intr[0, 2] -= offset_x
	        intr[1, 2] -= offset_y

	    # resize to render_tgt_size
	    tgt_hw_size, ratio_y, ratio_x = calc_new_tgt_size_by_aspect(
	        cur_hw=rgb.shape[:2],
	        aspect_standard=aspect_standard,
	        tgt_size=render_tgt_size,
	        multiply=multiply,
	    )
	    dsize = (tgt_hw_size[1], tgt_hw_size[0])  # cv2 expects (width, height)
	    rgb = cv2.resize(rgb, dsize=dsize, interpolation=cv2.INTER_AREA)
	    mask = cv2.resize(mask, dsize=dsize, interpolation=cv2.INTER_AREA)

	    if intr is not None:
	        # scale_intrs is not imported at module level, so this branch used to
	        # fail with NameError. NOTE(review): presumably it lives next to the
	        # other helpers in LHM.runners.infer.utils -- confirm.
	        from LHM.runners.infer.utils import scale_intrs

	        intr = scale_intrs(intr, ratio_x=ratio_x, ratio_y=ratio_y)
	        assert (
	            abs(intr[0, 2] * 2 - rgb.shape[1]) < 2.5
	        ), f"{intr[0, 2] * 2}, {rgb.shape[1]}"
	        assert (
	            abs(intr[1, 2] * 2 - rgb.shape[0]) < 2.5
	        ), f"{intr[1, 2] * 2}, {rgb.shape[0]}"

	        # Snap the principal point to the image centre.
	        intr[0, 2] = rgb.shape[1] // 2
	        intr[1, 2] = rgb.shape[0] // 2

	    rgb = torch.from_numpy(rgb).float().permute(2, 0, 1).unsqueeze(0)  # [1, 3, H, W]
	    mask = (
	        torch.from_numpy(mask[:, :, None]).float().permute(2, 0, 1).unsqueeze(0)
	    )  # [1, 1, H, W]
	    return rgb, mask, intr

	def parse_configs():
	    """Build the runtime config from CLI arguments, environment and YAML files.

	    Sources, in the order they are applied:
	      * ``--config`` / ``--infer`` CLI options; unknown CLI tokens are parsed
	        with ``OmegaConf.from_cli``.
	      * ``APP_INFER`` overrides ``--infer``; ``APP_MODEL_NAME`` sets
	        ``model_name``.
	      * When ``--config`` is absent, the inference YAML is used in its place.

	    Returns:
	        ``(cfg, cfg_train)``. ``cfg_train`` is the loaded config file, or
	        ``None`` when no config path was provided at all.

	    Raises:
	        AssertionError: if ``model_name`` ends up unset.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--config", type=str)
	    parser.add_argument("--infer", type=str)
	    args, unknown = parser.parse_known_args()

	    cfg = OmegaConf.create()
	    cli_cfg = OmegaConf.from_cli(unknown)

	    # parse from ENV
	    env_infer = os.environ.get("APP_INFER")
	    if env_infer is not None:
	        args.infer = env_infer
	    env_model_name = os.environ.get("APP_MODEL_NAME")
	    if env_model_name is not None:
	        cli_cfg.model_name = env_model_name

	    args.config = args.infer if args.config is None else args.config

	    # Bound up front: previously the final return raised UnboundLocalError
	    # whenever no config path was available.
	    cfg_train = None
	    if args.config is not None:
	        cfg_train = OmegaConf.load(args.config)
	        cfg.source_size = cfg_train.dataset.source_image_res
	        try:
	            cfg.src_head_size = cfg_train.dataset.src_head_size
	        except Exception:
	            # Config without src_head_size: fall back to 112.
	            cfg.src_head_size = 112
	        cfg.render_size = cfg_train.dataset.render_image.high
	        # NOTE(review): a model_name ending in "/" has an empty basename, so
	        # the last path component below becomes "" -- confirm this is intended.
	        _relative_path = os.path.join(
	            cfg_train.experiment.parent,
	            cfg_train.experiment.child,
	            os.path.basename(cli_cfg.model_name).split("_")[-1],
	        )

	        cfg.save_tmp_dump = os.path.join("exps", "save_tmp", _relative_path)
	        cfg.image_dump = os.path.join("exps", "images", _relative_path)
	        cfg.video_dump = os.path.join("exps", "videos", _relative_path)  # output path

	    if args.infer is not None:
	        cfg_infer = OmegaConf.load(args.infer)
	        cfg.merge_with(cfg_infer)
	        cfg.setdefault(
	            "save_tmp_dump", os.path.join("exps", cli_cfg.model_name, "save_tmp")
	        )
	        cfg.setdefault("image_dump", os.path.join("exps", cli_cfg.model_name, "images"))
	        cfg.setdefault(
	            "video_dump", os.path.join("dumps", cli_cfg.model_name, "videos")
	        )
	        cfg.setdefault("mesh_dump", os.path.join("dumps", cli_cfg.model_name, "meshes"))

	    cfg.motion_video_read_fps = 6
	    # CLI / environment values are merged last, so they override the files.
	    cfg.merge_with(cli_cfg)

	    cfg.setdefault("logger", "INFO")

	    assert cfg.model_name is not None, "model_name is required"

	    return cfg, cfg_train

	def _build_model(cfg):
	    """Instantiate the pretrained model named by ``cfg.model_name``.

	    Looks up the "human_lrm_sapdino_bh_sd3_5" entry of ``LHM.models.model_dict``,
	    wraps it with ``wrap_model_hub`` and loads it via ``from_pretrained``.
	    """
	    # Imported lazily, as in the original, so LHM.models loads only on demand.
	    from LHM.models import model_dict

	    base_cls = model_dict["human_lrm_sapdino_bh_sd3_5"]
	    hub_cls = wrap_model_hub(base_cls)
	    return hub_cls.from_pretrained(cfg.model_name)

	def launch_pretrained():
	    """Download and unpack the assets and model weights from the Hugging Face Hub.

	    Two tar archives from ``3DAIGC/LHM`` are fetched into the current directory,
	    extracted and deleted; then ``config.json`` and ``model.safetensors`` from
	    ``3DAIGC/LHM-500M-HF`` are placed in the step_060000 release directory.
	    """
	    from huggingface_hub import hf_hub_download

	    # (The LHM-0.5B.tar archive used to be fetched here as well; it is disabled.)
	    for archive in ("assets.tar", "LHM_prior_model.tar"):
	        hf_hub_download(repo_id="3DAIGC/LHM", repo_type='model', filename=archive, local_dir="./")
	        os.system(f"tar -xf {archive} && rm {archive}")

	    # replace the weight of full body
	    weights_dir = "./exps/releases/video_human_benchmark/human-lrm-500M/step_060000/"
	    for weight_file in ("config.json", "model.safetensors"):
	        hf_hub_download(repo_id="3DAIGC/LHM-500M-HF", repo_type='model', filename=weight_file, local_dir=weights_dir)

	def launch_env_not_compile_with_cuda():
	    """Run the pip commands for the packages that need no CUDA compilation."""
	    pip_commands = (
	        "pip install chumpy",
	        "pip uninstall -y basicsr",
	        "pip install git+https://github.com/hitsz-zuoqi/BasicSR/",
	        "pip install numpy==1.23.0",
	    )
	    # Executed strictly in order: basicsr is removed before the fork is installed.
	    for command in pip_commands:
	        os.system(command)

	def animation_infer(renderer, gs_model_list, query_points, smplx_params, render_c2ws, render_intrs, render_bg_colors):
	    """Render every target view from already-built Gaussian models.

	    The views are rendered one at a time with ``renderer.forward_animate_gs``,
	    so the reconstruction forward pass does not have to be repeated.

	    Args:
	        renderer: object exposing ``forward_animate_gs`` and
	            ``get_single_view_smpl_data``.
	        gs_model_list: passed through to the renderer.
	        query_points: passed through to the renderer.
	        smplx_params: per-view parameters; sliced with
	            ``renderer.get_single_view_smpl_data(smplx_params, view_idx)``.
	        render_c2ws: tensor whose dimension 1 indexes the views.
	        render_intrs: tensor indexed as ``[0, 0, 1, 2]`` / ``[0, 0, 0, 2]`` to
	            derive the render height / width (twice the principal point).
	        render_bg_colors: tensor whose dimension 1 indexes the views.

	    Returns:
	        A dict keyed like the renderer output. Tensor values are concatenated
	        along dim 1; "comp_rgb", "comp_mask" and "comp_depth" are further
	        reduced to ``[Nv, H, W, C]``. Other values are collected in a list
	        with one entry per view.
	    """
	    # Imported locally: the module never imported defaultdict, so the original
	    # body raised NameError as soon as it reached the aggregation step.
	    from collections import defaultdict

	    render_h = int(render_intrs[0, 0, 1, 2] * 2)
	    render_w = int(render_intrs[0, 0, 0, 2] * 2)

	    num_views = render_c2ws.shape[1]
	    start_time = time.time()

	    # render target views
	    render_res_list = []
	    for view_idx in range(num_views):
	        view_slice = slice(view_idx, view_idx + 1)
	        render_res = renderer.forward_animate_gs(
	            gs_model_list,
	            query_points,
	            renderer.get_single_view_smpl_data(smplx_params, view_idx),
	            render_c2ws[:, view_slice],
	            render_intrs[:, view_slice],
	            render_h,
	            render_w,
	            render_bg_colors[:, view_slice],
	        )
	        render_res_list.append(render_res)
	    # max(..., 1) keeps the timing report from dividing by zero with no views.
	    print(
	        f"time elpased(animate gs model per frame):{(time.time() - start_time)/max(num_views, 1)}"
	    )

	    out = defaultdict(list)
	    for res in render_res_list:
	        for k, v in res.items():
	            # Test the value itself: it is the value that gets detached.
	            if isinstance(v, torch.Tensor):
	                out[k].append(v.detach().cpu())
	            else:
	                out[k].append(v)

	    # Only existing keys are reassigned, so iterating the dict here is safe.
	    for k, v in out.items():
	        if isinstance(v[0], torch.Tensor):
	            out[k] = torch.concat(v, dim=1)
	            if k in ["comp_rgb", "comp_mask", "comp_depth"]:
	                out[k] = out[k][0].permute(
	                    0, 2, 3, 1
	                )  # [1, Nv, 3, H, W] -> [Nv, 3, H, W] -> [Nv, H, W, 3]
	        else:
	            out[k] = v
	    return out

	def assert_input_image(input_image):
	    """Raise a Gradio error when no input image was provided; otherwise do nothing."""
	    if input_image is not None:
	        return
	    raise gr.Error("No image selected or uploaded!")

	def prepare_working_dir():
	    """Create a fresh temporary directory and return its ``TemporaryDirectory`` handle."""
	    from tempfile import TemporaryDirectory

	    return TemporaryDirectory()

	def init_preprocessor():
	    """Create the module-level ``preprocessor`` used by ``preprocess_fn``."""
	    global preprocessor
	    # Lazy import, as in the original: LHM.utils.preprocess loads only on demand.
	    from LHM.utils.preprocess import Preprocessor as _PreprocessorCls

	    preprocessor = _PreprocessorCls()

	def preprocess_fn(image_in: np.ndarray, remove_bg: bool, recenter: bool, working_dir):
	    """Save ``image_in`` into the working directory and run the global preprocessor.

	    Writes ``raw.png``, asks ``preprocessor.preprocess`` to produce
	    ``rembg.png`` and returns the path of the latter. Raises AssertionError
	    when the preprocessor reports failure.
	    """
	    workdir = working_dir.name
	    image_raw = os.path.join(workdir, "raw.png")
	    image_out = os.path.join(workdir, "rembg.png")

	    with Image.fromarray(image_in) as img:
	        img.save(image_raw)

	    success = preprocessor.preprocess(
	        image_path=image_raw,
	        save_path=image_out,
	        rmbg=remove_bg,
	        recenter=recenter,
	    )
	    assert success, f"Failed under preprocess_fn!"
	    return image_out

	def get_image_base64(path):
	    """Read the file at ``path`` and return it as a ``data:image/png;base64,...`` URI."""
	    with open(path, "rb") as image_file:
	        raw_bytes = image_file.read()
	    payload = base64.b64encode(raw_bytes).decode()
	    return "data:image/png;base64," + payload


	def demo_lhm(pose_estimator, face_detector, lhm, cfg):
	    """Build and launch the Gradio demo.

	    Args:
	        pose_estimator: callable taking an image path; its result exposes
	            ``is_full_body``, ``msg`` and ``beta``.
	        face_detector: object exposing ``detect_face(rgb_tensor)``.
	        lhm: model exposing ``infer_single_view`` and ``animation_infer``.
	        cfg: config providing ``source_size``, ``render_size``,
	            ``src_head_size`` and the optional ``motion_img_need_mask`` /
	            ``vis_motion`` flags.

	    The function blocks in ``demo.launch()``.
	    """

	    @spaces.GPU(duration=80)
	    def core_fn(image: np.ndarray, video_params, working_dir):
	        """Reconstruct the avatar from ``image`` and animate it.

	        Args:
	            image: input image array coming from the Gradio image widget.
	            video_params: path of the selected sample motion video; only the
	                part of its basename before the first "_" is used, to locate
	                ``./assets/sample_motion/<name>/smplx_params``.
	            working_dir: ``TemporaryDirectory`` receiving all outputs.

	        Returns:
	            ``(processed_image_path, rendered_video_path)``.
	        """
	        # os.path.basename(None) would raise a TypeError; report it to the user.
	        if video_params is None:
	            raise gr.Error("No motion video selected!")

	        image_raw = os.path.join(working_dir.name, "raw.png")
	        with Image.fromarray(image) as img:
	            img.save(image_raw)

	        base_vid = os.path.basename(video_params).split("_")[0]
	        motion_seqs_dir = os.path.join("./assets/sample_motion", base_vid, "smplx_params")

	        dump_video_path = os.path.join(working_dir.name, "output.mp4")
	        dump_image_path = os.path.join(working_dir.name, "output.png")

	        # prepare dump paths (kept for the debug print below)
	        omit_prefix = os.path.dirname(image_raw)
	        uid = os.path.basename(image_raw).split(".")[0]
	        subdir_path = os.path.dirname(image_raw).replace(omit_prefix, "")
	        subdir_path = (
	            subdir_path[1:] if subdir_path.startswith("/") else subdir_path
	        )
	        print("subdir_path and uid:", subdir_path, uid)

	        dump_image_dir = os.path.dirname(dump_image_path)
	        os.makedirs(dump_image_dir, exist_ok=True)

	        print(image_raw, motion_seqs_dir, dump_image_dir, dump_video_path)

	        dump_tmp_dir = dump_image_dir

	        shape_pose = pose_estimator(image_raw)
	        # Raised as gr.Error (like assert_input_image) so the check is not
	        # stripped under ``python -O``.
	        if not shape_pose.is_full_body:
	            raise gr.Error(f"The input image is illegal, {shape_pose.msg}")

	        if os.path.exists(dump_video_path):
	            return dump_image_path, dump_video_path

	        source_size = cfg.source_size
	        render_size = cfg.render_size
	        render_fps = 30

	        aspect_standard = 5.0 / 3
	        motion_img_need_mask = cfg.get("motion_img_need_mask", False)  # False
	        vis_motion = cfg.get("vis_motion", False)  # False

	        # Foreground mask = alpha channel of the background-removed image.
	        input_np = cv2.imread(image_raw)
	        output_np = remove(input_np)
	        parsing_mask = output_np[:, :, 3]

	        # prepare reference image
	        image, _, _ = infer_preprocess_image(
	            image_raw,
	            mask=parsing_mask,
	            intr=None,
	            pad_ratio=0,
	            bg_color=1.0,
	            max_tgt_size=896,
	            aspect_standard=aspect_standard,
	            enlarge_ratio=[1.0, 1.0],
	            render_tgt_size=source_size,
	            multiply=14,
	            need_mask=True,
	        )

	        # Head crop. The original read an undefined ``image_path``, so the
	        # NameError was swallowed and the head input was always all zeros.
	        # Convert to RGB so the crop has the 3 channels of the zero fallback.
	        try:
	            with Image.open(image_raw) as raw_img:
	                rgb = np.array(raw_img.convert("RGB"))
	            rgb = torch.from_numpy(rgb).permute(2, 0, 1)
	            bbox = face_detector.detect_face(rgb)
	            head_rgb = rgb[:, int(bbox[1]) : int(bbox[3]), int(bbox[0]) : int(bbox[2])]
	            head_rgb = head_rgb.permute(1, 2, 0)
	            src_head_rgb = head_rgb.cpu().numpy()
	        except Exception:
	            # Best effort: any detection failure falls back to an empty head.
	            print("w/o head input!")
	            src_head_rgb = np.zeros((112, 112, 3), dtype=np.uint8)

	        # resize to dino size
	        try:
	            src_head_rgb = cv2.resize(
	                src_head_rgb,
	                dsize=(cfg.src_head_size, cfg.src_head_size),
	                interpolation=cv2.INTER_AREA,
	            )
	        except Exception:
	            # e.g. an empty crop that cv2 cannot resize
	            src_head_rgb = np.zeros(
	                (cfg.src_head_size, cfg.src_head_size, 3), dtype=np.uint8
	            )

	        src_head_rgb = (
	            torch.from_numpy(src_head_rgb / 255.0).float().permute(2, 0, 1).unsqueeze(0)
	        )  # [1, 3, H, W]

	        # Save the preprocessed reference image; this is the "processed image".
	        save_ref_img_path = os.path.join(dump_tmp_dir, "output.png")
	        vis_ref_img = (image[0].permute(1, 2, 0).cpu().detach().numpy() * 255).astype(
	            np.uint8
	        )
	        Image.fromarray(vis_ref_img).save(save_ref_img_path)

	        # read motion seq
	        # NOTE(review): enlarge_ratio has three entries ([1.0, 1, 0]); this looks
	        # like a typo for [1.0, 1.0] -- left unchanged, confirm against
	        # prepare_motion_seqs.
	        motion_seq = prepare_motion_seqs(
	            motion_seqs_dir,
	            None,
	            save_root=dump_tmp_dir,
	            fps=render_fps,
	            bg_color=1.0,
	            aspect_standard=aspect_standard,
	            enlarge_ratio=[1.0, 1, 0],
	            render_image_res=render_size,
	            multiply=16,
	            need_mask=motion_img_need_mask,
	            vis_motion=vis_motion,
	        )

	        camera_size = len(motion_seq["motion_seqs"])
	        shape_param = shape_pose.beta

	        device = "cuda"
	        dtype = torch.float32
	        shape_param = torch.tensor(shape_param, dtype=dtype).unsqueeze(0)

	        lhm.to(dtype)

	        smplx_params = motion_seq['smplx_params']
	        smplx_params['betas'] = shape_param.to(device)

	        gs_model_list, query_points, transform_mat_neutral_pose = lhm.infer_single_view(
	            image.unsqueeze(0).to(device, dtype),
	            src_head_rgb.unsqueeze(0).to(device, dtype),
	            None,
	            None,
	            render_c2ws=motion_seq["render_c2ws"].to(device),
	            render_intrs=motion_seq["render_intrs"].to(device),
	            render_bg_colors=motion_seq["render_bg_colors"].to(device),
	            smplx_params={
	                k: v.to(device) for k, v in smplx_params.items()
	            },
	        )

	        # rendering
	        start_time = time.time()
	        batch_dict = dict()
	        batch_size = 80  # render in chunks to avoid running out of memory

	        # Per-frame SMPL-X entries that are sliced for every chunk.
	        keys = [
	            "root_pose",
	            "body_pose",
	            "jaw_pose",
	            "leye_pose",
	            "reye_pose",
	            "lhand_pose",
	            "rhand_pose",
	            "trans",
	            "focal",
	            "princpt",
	            "img_size_wh",
	            "expr",
	        ]

	        for batch_i in range(0, camera_size, batch_size):
	            chunk = slice(batch_i, batch_i + batch_size)
	            with torch.no_grad():
	                # TODO check device and dtype
	                batch_smplx_params = dict()
	                batch_smplx_params["betas"] = shape_param.to(device)
	                batch_smplx_params['transform_mat_neutral_pose'] = transform_mat_neutral_pose
	                for key in keys:
	                    batch_smplx_params[key] = motion_seq["smplx_params"][key][
	                        :, chunk
	                    ].to(device)

	                res = lhm.animation_infer(
	                    gs_model_list,
	                    query_points,
	                    batch_smplx_params,
	                    render_c2ws=motion_seq["render_c2ws"][:, chunk].to(device),
	                    render_intrs=motion_seq["render_intrs"][:, chunk].to(device),
	                    render_bg_colors=motion_seq["render_bg_colors"][:, chunk].to(device),
	                )

	            for accumulate_key in ["comp_rgb", "comp_mask"]:
	                if accumulate_key not in batch_dict:
	                    batch_dict[accumulate_key] = []
	                batch_dict[accumulate_key].append(res[accumulate_key].detach().cpu())
	            # Release GPU memory before the next chunk.
	            del res
	            torch.cuda.empty_cache()

	        for accumulate_key in ["comp_rgb", "comp_mask"]:
	            batch_dict[accumulate_key] = torch.cat(batch_dict[accumulate_key], dim=0)

	        print(f"time elapsed: {time.time() - start_time}")
	        rgb = batch_dict["comp_rgb"].detach().cpu().numpy()  # [Nv, H, W, 3], 0-1
	        mask = batch_dict["comp_mask"].detach().cpu().numpy()
	        mask[mask < 0.5] = 0.0

	        # Composite over a white background and convert to uint8 frames.
	        rgb = rgb * mask + (1 - mask) * 1
	        rgb = np.clip(rgb * 255, 0, 255).astype(np.uint8)

	        if vis_motion:
	            # Put render, driving motion and reference image side by side.
	            vis_ref_img = np.tile(
	                cv2.resize(vis_ref_img, (rgb[0].shape[1], rgb[0].shape[0]))[
	                    None, :, :, :
	                ],
	                (rgb.shape[0], 1, 1, 1),
	            )
	            rgb = np.concatenate(
	                [rgb, motion_seq["vis_motion_render"], vis_ref_img], axis=2
	            )

	        os.makedirs(os.path.dirname(dump_video_path), exist_ok=True)

	        images_to_video(
	            rgb,
	            output_path=dump_video_path,
	            fps=render_fps,
	            gradio_codec=False,
	            verbose=True,
	        )

	        return dump_image_path, dump_video_path

	    with gr.Blocks(analytics_enabled=False, delete_cache=[3600,3600]) as demo:

	        logo_url = "./assets/rgba_logo_new.png"
	        logo_base64 = get_image_base64(logo_url)
	        gr.HTML(
	            f"""
	            <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
	            <div>
	            <h1> <img src="{logo_base64}" style='height:35px; display:inline-block;'/> Large Animatable Human Model </h1>
	            </div>
	            </div>
	            """
	        )

	        gr.HTML(
	            """
	            <div style="display: flex; justify-content: center; align-items: center; text-align: center; margin: 20px; gap: 10px;">
	            <a class="flex-item" href="https://arxiv.org/abs/2503.10625" target="_blank">
	            <img src="https://img.shields.io/badge/Paper-arXiv-darkred.svg" alt="arXiv Paper">
	            </a>
	            <a class="flex-item" href="https://lingtengqiu.github.io/LHM/" target="_blank">
	            <img src="https://img.shields.io/badge/Project-LHM-blue" alt="Project Page">
	            </a>
	            <a class="flex-item" href="https://github.com/aigc3d/LHM" target="_blank">
	            <img src="https://img.shields.io/github/stars/aigc3d/LHM?label=Github%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
	            </a>
	            <a class="flex-item" href="https://www.youtube.com/watch?v=tivEpz_yiEo" target="_blank">
	            <img src="https://img.shields.io/badge/Youtube-Video-red.svg" alt="Video">
	            </a>
	            </div>
	            """
	        )

	        gr.HTML(
	            """<p><h4 style="color: red;"> Notes 1: Glad to tell you that we have supported both full-body or half-body input! Try to test the robustness with half-body images!.</h4></p>"""
	        )
	        gr.HTML(
	            """<p><h4 style="color: green;"> Notes 2: We drop ComfyUI Nodes of LHM on https://github.com/aigc3d/LHM/tree/feat/comfyui which support any character and any driven videos as input. Try it!</h4></p>"""
	        )

	        # DISPLAY
	        with gr.Row():

	            with gr.Column(variant='panel', scale=1):
	                with gr.Tabs(elem_id="openlrm_input_image"):
	                    with gr.TabItem('Input Image'):
	                        with gr.Row():
	                            input_image = gr.Image(label="Input Image", image_mode="RGBA", height=480, width=270, sources="upload", type="numpy", elem_id="content_image")
	                # EXAMPLES
	                with gr.Row():
	                    examples = [
	                        ['assets/sample_input/joker.jpg'],
	                        ['assets/sample_input/anime.png'],
	                        ['assets/sample_input/basket.png'],
	                        ['assets/sample_input/ai_woman1.JPG'],
	                        ['assets/sample_input/anime2.JPG'],
	                        ['assets/sample_input/anime3.JPG'],
	                        ['assets/sample_input/boy1.png'],
	                        ['assets/sample_input/choplin.jpg'],
	                        ['assets/sample_input/eins.JPG'],
	                        ['assets/sample_input/girl1.png'],
	                        ['assets/sample_input/girl2.png'],
	                        ['assets/sample_input/robot.jpg'],
	                    ]
	                    gr.Examples(
	                        examples=examples,
	                        inputs=[input_image],
	                        examples_per_page=20,
	                    )

	            with gr.Column():
	                with gr.Tabs(elem_id="openlrm_input_video"):
	                    with gr.TabItem('Input Video'):
	                        with gr.Row():
	                            video_input = gr.Video(label="Input Video",height=480, width=270, interactive=False)

	                examples = [
	                    # './assets/sample_motion/danaotiangong/danaotiangong_origin.mp4',
	                    './assets/sample_motion/ex5/ex5_origin.mp4',
	                    # './assets/sample_motion/girl2/girl2_origin.mp4',
	                    # './assets/sample_motion/jntm/jntm_origin.mp4',
	                    './assets/sample_motion/mimo1/mimo1_origin.mp4',
	                    './assets/sample_motion/mimo2/mimo2_origin.mp4',
	                    './assets/sample_motion/mimo4/mimo4_origin.mp4',
	                    './assets/sample_motion/mimo5/mimo5_origin.mp4',
	                    './assets/sample_motion/mimo6/mimo6_origin.mp4',
	                    './assets/sample_motion/nezha/nezha_origin.mp4',
	                    './assets/sample_motion/taiji/taiji_origin.mp4'
	                ]

	                gr.Examples(
	                    examples=examples,
	                    inputs=[video_input],
	                    examples_per_page=20,
	                )

	            with gr.Column(variant='panel', scale=1):
	                with gr.Tabs(elem_id="openlrm_processed_image"):
	                    with gr.TabItem('Processed Image'):
	                        with gr.Row():
	                            processed_image = gr.Image(label="Processed Image", image_mode="RGBA", type="filepath", elem_id="processed_image", height=480, width=270, interactive=False)

	        # SETTING
	        with gr.Row():
	            with gr.Column(variant='panel', scale=1):
	                submit = gr.Button('Generate', elem_id="openlrm_generate", variant='primary')

	        # show video (the 3D model viewer is currently disabled)
	        with gr.Row():
	            with gr.Column(variant='panel', scale=1):
	                with gr.Tabs(elem_id="openlrm_render_video"):
	                    with gr.TabItem('Rendered Video'):
	                        with gr.Row():
	                            output_video = gr.Video(label="Rendered Video", format="mp4", height=480, width=270, autoplay=True)

	        # Pipeline: validate input -> create working dir -> run inference.
	        working_dir = gr.State()
	        submit.click(
	            fn=assert_input_image,
	            inputs=[input_image],
	            queue=False,
	        ).success(
	            fn=prepare_working_dir,
	            outputs=[working_dir],
	            queue=False,
	        ).success(
	            fn=core_fn,
	            inputs=[input_image, video_input, working_dir],  # video_params refer to smpl dir
	            outputs=[processed_image, output_video],
	        )

	    demo.queue(max_size=1)
	    demo.launch()


	def launch_gradio_app():
	    """Set the APP_* environment, load all models onto CUDA and start the demo."""
	    env_overrides = {
	        "APP_ENABLED": "1",
	        "APP_MODEL_NAME": "./exps/releases/video_human_benchmark/human-lrm-500M/step_060000/",
	        "APP_INFER": "./configs/inference/human-lrm-500M.yaml",
	        "APP_TYPE": "infer.human_lrm",
	        "NUMBA_THREADING_LAYER": 'omp',
	    }
	    # Must be in place before parse_configs(), which reads APP_INFER / APP_MODEL_NAME.
	    for env_key, env_value in env_overrides.items():
	        os.environ[env_key] = env_value

	    # Each helper model is constructed on CPU, then moved to the GPU.
	    head_detector = VGGHeadDetector(
	        "./pretrained_models/gagatracker/vgghead/vgg_heads_l.trcd",
	        device='cpu',
	    )
	    head_detector.to('cuda')

	    estimator = PoseEstimator(
	        "./pretrained_models/human_model_files/", device='cpu'
	    )
	    estimator.to('cuda')
	    estimator.device = 'cuda'

	    cfg, _cfg_train = parse_configs()
	    model = _build_model(cfg)
	    model.to('cuda')

	    demo_lhm(estimator, head_detector, model, cfg)

	if __name__ == "__main__":
	    # Startup sequence: fetch weights/assets, install the pure-Python
	    # dependencies, then build the models and serve the demo.
	    for _startup_step in (
	        launch_pretrained,
	        launch_env_not_compile_with_cuda,
	        launch_gradio_app,
	    ):
	        _startup_step()