# PE3R / app.py
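"""PE3R Gradio demo.

Reconstructs a 3D scene from a set of input images using MASt3R pairwise
inference followed by global alignment, and exports the result as a .glb
file. The semantic-segmentation branch (SAM1/SAM2, SigLIP, YOLOv8) is kept
below as commented-out code and is disabled in this demo.
"""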
import os
import sys
sys.path.append(os.path.abspath('./modules'))
# import math
import tempfile
import gradio
import torch
import spaces
import numpy as np
import functools
import trimesh
import copy
# from PIL import Image
from scipy.spatial.transform import Rotation
from modules.pe3r.images import Images
from modules.dust3r.inference import inference
from modules.dust3r.image_pairs import make_pairs
from modules.dust3r.utils.image import load_images #, rgb
from modules.dust3r.utils.device import to_numpy
from modules.dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
from modules.dust3r.cloud_opt import global_aligner, GlobalAlignerMode
# from copy import deepcopy
# import cv2
# from typing import Any, Dict, Generator,List
# import matplotlib.pyplot as pl
# from modules.mobilesamv2.utils.transforms import ResizeLongestSide
# from modules.pe3r.models import Models
import torchvision.transforms as tvf
# sys.path.append(os.path.abspath('./modules/ultralytics'))
# from transformers import AutoTokenizer, AutoModel, AutoProcessor, SamModel
# from modules.mast3r.model import AsymmetricMASt3R
# from modules.sam2.build_sam import build_sam2_video_predictor
# from modules.mobilesamv2.promt_mobilesamv2 import ObjectAwareModel
# from modules.mobilesamv2 import sam_model_registry
# from sam2.sam2_video_predictor import SAM2VideoPredictor
from modules.mast3r.model import AsymmetricMASt3R
silent = False
# device = 'cpu' #'cuda' if torch.cuda.is_available() else 'cpu' # #
# pe3r = Models('cpu') # 'cpu' #
# print(device)
def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
cam_color=None, as_pointcloud=False,
transparent_cams=False):
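    """Export the reconstruction to a .glb file: either a colored point
    cloud or a fused mesh, plus one camera frustum per input image."""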
assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
pts3d = to_numpy(pts3d)
imgs = to_numpy(imgs)
focals = to_numpy(focals)
cams2world = to_numpy(cams2world)
scene = trimesh.Scene()
# full pointcloud
if as_pointcloud:
pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
scene.add_geometry(pct)
else:
meshes = []
for i in range(len(imgs)):
meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i]))
mesh = trimesh.Trimesh(**cat_meshes(meshes))
scene.add_geometry(mesh)
# add each camera
for i, pose_c2w in enumerate(cams2world):
if isinstance(cam_color, list):
camera_edge_color = cam_color[i]
else:
camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
add_scene_cam(scene, pose_c2w, camera_edge_color,
None if transparent_cams else imgs[i], focals[i],
imsize=imgs[i].shape[1::-1], screen_width=cam_size)
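    # Re-orient the world so it is viewed from the first camera, following
    # the OPENGL axis convention (with an extra 180-degree rotation about y).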
rot = np.eye(4)
rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
outfile = os.path.join(outdir, 'scene.glb')
if not silent:
print('(exporting 3D scene to', outfile, ')')
scene.export(file_obj=outfile)
return outfile
def get_3D_model_from_scene(outdir, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
clean_depth=False, transparent_cams=False, cam_size=0.05):
"""
extract 3D_model (glb file) from a reconstructed scene
"""
if scene is None:
return None
# post processes
if clean_depth:
scene = scene.clean_pointcloud()
if mask_sky:
scene = scene.mask_sky()
# get optimized values from scene
rgbimg = scene.ori_imgs
focals = scene.get_focals().cpu()
cams2world = scene.get_im_poses().cpu()
# 3D pointcloud from depthmap, poses and intrinsics
pts3d = to_numpy(scene.get_pts3d())
scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
msk = to_numpy(scene.get_masks())
return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
transparent_cams=transparent_cams, cam_size=cam_size)
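# ---------------------------------------------------------------------------
# The helpers below (mask NMS, box utilities, SLERP feature merging, and the
# SAM1/SAM2 mask extraction) belong to the disabled segmentation branch and
# are kept commented out for reference.
# ---------------------------------------------------------------------------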
# def mask_nms(masks, threshold=0.8):
# keep = []
# mask_num = len(masks)
# suppressed = np.zeros((mask_num), dtype=np.int64)
# for i in range(mask_num):
# if suppressed[i] == 1:
# continue
# keep.append(i)
# for j in range(i + 1, mask_num):
# if suppressed[j] == 1:
# continue
# intersection = (masks[i] & masks[j]).sum()
# if min(intersection / masks[i].sum(), intersection / masks[j].sum()) > threshold:
# suppressed[j] = 1
# return keep
# def filter(masks, keep):
# ret = []
# for i, m in enumerate(masks):
# if i in keep: ret.append(m)
# return ret
# def mask_to_box(mask):
# if mask.sum() == 0:
# return np.array([0, 0, 0, 0])
# # Get the rows and columns where the mask is 1
# rows = np.any(mask, axis=1)
# cols = np.any(mask, axis=0)
# # Get top, bottom, left, right edges
# top = np.argmax(rows)
# bottom = len(rows) - 1 - np.argmax(np.flip(rows))
# left = np.argmax(cols)
# right = len(cols) - 1 - np.argmax(np.flip(cols))
# return np.array([left, top, right, bottom])
# def box_xyxy_to_xywh(box_xyxy):
# box_xywh = deepcopy(box_xyxy)
# box_xywh[2] = box_xywh[2] - box_xywh[0]
# box_xywh[3] = box_xywh[3] - box_xywh[1]
# return box_xywh
# def get_seg_img(mask, box, image):
# image = image.copy()
# x, y, w, h = box
# # image[mask == 0] = np.array([0, 0, 0], dtype=np.uint8)
# box_area = w * h
# mask_area = mask.sum()
# if 1 - (mask_area / box_area) < 0.2:
# image[mask == 0] = np.array([0, 0, 0], dtype=np.uint8)
# else:
# random_values = np.random.randint(0, 255, size=image.shape, dtype=np.uint8)
# image[mask == 0] = random_values[mask == 0]
# seg_img = image[y:y+h, x:x+w, ...]
# return seg_img
# def pad_img(img):
# h, w, _ = img.shape
# l = max(w,h)
# pad = np.zeros((l,l,3), dtype=np.uint8) #
# if h > w:
# pad[:,(h-w)//2:(h-w)//2 + w, :] = img
# else:
# pad[(w-h)//2:(w-h)//2 + h, :, :] = img
# return pad
# def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
# assert len(args) > 0 and all(
# len(a) == len(args[0]) for a in args
# ), "Batched iteration must have inputs of all the same size."
# n_batches = len(args[0]) // batch_size + int(len(args[0]) % batch_size != 0)
# for b in range(n_batches):
# yield [arg[b * batch_size : (b + 1) * batch_size] for arg in args]
# def slerp(u1, u2, t):
# """
# Perform spherical linear interpolation (Slerp) between two unit vectors.
# Args:
# - u1 (torch.Tensor): First unit vector, shape (1024,)
# - u2 (torch.Tensor): Second unit vector, shape (1024,)
# - t (float): Interpolation parameter
# Returns:
# - torch.Tensor: Interpolated vector, shape (1024,)
# """
# # Compute the dot product
# dot_product = torch.sum(u1 * u2)
# # Ensure the dot product is within the valid range [-1, 1]
# dot_product = torch.clamp(dot_product, -1.0, 1.0)
# # Compute the angle between the vectors
# theta = torch.acos(dot_product)
# # Compute the coefficients for the interpolation
# sin_theta = torch.sin(theta)
# if sin_theta == 0:
# # Vectors are parallel, return a linear interpolation
# return u1 + t * (u2 - u1)
# s1 = torch.sin((1 - t) * theta) / sin_theta
# s2 = torch.sin(t * theta) / sin_theta
# # Perform the interpolation
# return s1 * u1 + s2 * u2
# def slerp_multiple(vectors, t_values):
# """
# Perform spherical linear interpolation (Slerp) for multiple vectors.
# Args:
# - vectors (torch.Tensor): Tensor of vectors, shape (n, 1024)
# - a_values (torch.Tensor): Tensor of values corresponding to each vector, shape (n,)
# Returns:
# - torch.Tensor: Interpolated vector, shape (1024,)
# """
# n = vectors.shape[0]
# # Initialize the interpolated vector with the first vector
# interpolated_vector = vectors[0]
# # Perform Slerp iteratively
# for i in range(1, n):
# # Perform Slerp between the current interpolated vector and the next vector
# t = t_values[i] / (t_values[i] + t_values[i-1])
# interpolated_vector = slerp(interpolated_vector, vectors[i], t)
# return interpolated_vector
# @torch.no_grad
# def get_mask_from_img_sam1(yolov8, mobilesamv2, sam1_image, yolov8_image, original_size, input_size, transform):
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# sam_mask=[]
# img_area = original_size[0] * original_size[1]
# obj_results = yolov8(yolov8_image,device=device,retina_masks=False,imgsz=1024,conf=0.25,iou=0.95,verbose=False)
# input_boxes1 = obj_results[0].boxes.xyxy
# input_boxes1 = input_boxes1.cpu().numpy()
# input_boxes1 = transform.apply_boxes(input_boxes1, original_size)
# input_boxes = torch.from_numpy(input_boxes1).to(device)
# # obj_results = yolov8(yolov8_image,device=device,retina_masks=False,imgsz=512,conf=0.25,iou=0.9,verbose=False)
# # input_boxes2 = obj_results[0].boxes.xyxy
# # input_boxes2 = input_boxes2.cpu().numpy()
# # input_boxes2 = transform.apply_boxes(input_boxes2, original_size)
# # input_boxes2 = torch.from_numpy(input_boxes2).to(device)
# # input_boxes = torch.cat((input_boxes1, input_boxes2), dim=0)
# input_image = mobilesamv2.preprocess(sam1_image)
# image_embedding = mobilesamv2.image_encoder(input_image)['last_hidden_state']
# image_embedding=torch.repeat_interleave(image_embedding, 320, dim=0)
# prompt_embedding=mobilesamv2.prompt_encoder.get_dense_pe()
# prompt_embedding=torch.repeat_interleave(prompt_embedding, 320, dim=0)
# for (boxes,) in batch_iterator(320, input_boxes):
# with torch.no_grad():
# image_embedding=image_embedding[0:boxes.shape[0],:,:,:]
# prompt_embedding=prompt_embedding[0:boxes.shape[0],:,:,:]
# sparse_embeddings, dense_embeddings = mobilesamv2.prompt_encoder(
# points=None,
# boxes=boxes,
# masks=None,)
# low_res_masks, _ = mobilesamv2.mask_decoder(
# image_embeddings=image_embedding,
# image_pe=prompt_embedding,
# sparse_prompt_embeddings=sparse_embeddings,
# dense_prompt_embeddings=dense_embeddings,
# multimask_output=False,
# simple_type=True,
# )
# low_res_masks=mobilesamv2.postprocess_masks(low_res_masks, input_size, original_size)
# sam_mask_pre = (low_res_masks > mobilesamv2.mask_threshold)
# for mask in sam_mask_pre:
# if mask.sum() / img_area > 0.002:
# sam_mask.append(mask.squeeze(1))
# sam_mask=torch.cat(sam_mask)
# sorted_sam_mask = sorted(sam_mask, key=(lambda x: x.sum()), reverse=True)
# keep = mask_nms(sorted_sam_mask)
# ret_mask = filter(sorted_sam_mask, keep)
# return ret_mask
# @torch.no_grad
# def get_cog_feats(images, sam2, siglip, siglip_processor, yolov8, mobilesamv2):
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# cog_seg_maps = []
# rev_cog_seg_maps = []
# inference_state = sam2.init_state(images=images.sam2_images, video_height=images.sam2_video_size[0], video_width=images.sam2_video_size[1])
# mask_num = 0
# sam1_images = images.sam1_images
# sam1_images_size = images.sam1_images_size
# np_images = images.np_images
# np_images_size = images.np_images_size
# sam1_masks = get_mask_from_img_sam1(yolov8, mobilesamv2, sam1_images[0], np_images[0], np_images_size[0], sam1_images_size[0], images.sam1_transform)
# for mask in sam1_masks:
# _, _, _ = sam2.add_new_mask(
# inference_state=inference_state,
# frame_idx=0,
# obj_id=mask_num,
# mask=mask,
# )
# mask_num += 1
# video_segments = {} # video_segments contains the per-frame segmentation results
# for out_frame_idx, out_obj_ids, out_mask_logits in sam2.propagate_in_video(inference_state):
# sam2_masks = (out_mask_logits > 0.0).squeeze(1)
# video_segments[out_frame_idx] = {
# out_obj_id: sam2_masks[i].cpu().numpy()
# for i, out_obj_id in enumerate(out_obj_ids)
# }
# if out_frame_idx == 0:
# continue
# sam1_masks = get_mask_from_img_sam1(yolov8, mobilesamv2, sam1_images[out_frame_idx], np_images[out_frame_idx], np_images_size[out_frame_idx], sam1_images_size[out_frame_idx], images.sam1_transform)
# for sam1_mask in sam1_masks:
# flg = 1
# for sam2_mask in sam2_masks:
# # print(sam1_mask.shape, sam2_mask.shape)
# area1 = sam1_mask.sum()
# area2 = sam2_mask.sum()
# intersection = (sam1_mask & sam2_mask).sum()
# if min(intersection / area1, intersection / area2) > 0.25:
# flg = 0
# break
# if flg:
# video_segments[out_frame_idx][mask_num] = sam1_mask.cpu().numpy()
# mask_num += 1
# multi_view_clip_feats = torch.zeros((mask_num+1, 1024))
# multi_view_clip_feats_map = {}
# multi_view_clip_area_map = {}
# for now_frame in range(0, len(video_segments), 1):
# image = np_images[now_frame]
# seg_img_list = []
# out_obj_id_list = []
# out_obj_mask_list = []
# out_obj_area_list = []
# # NOTE: background: -1
# rev_seg_map = -np.ones(image.shape[:2], dtype=np.int64)
# sorted_dict_items = sorted(video_segments[now_frame].items(), key=lambda x: np.count_nonzero(x[1]), reverse=False)
# for out_obj_id, mask in sorted_dict_items:
# if mask.sum() == 0:
# continue
# rev_seg_map[mask] = out_obj_id
# rev_cog_seg_maps.append(rev_seg_map)
# seg_map = -np.ones(image.shape[:2], dtype=np.int64)
# sorted_dict_items = sorted(video_segments[now_frame].items(), key=lambda x: np.count_nonzero(x[1]), reverse=True)
# for out_obj_id, mask in sorted_dict_items:
# if mask.sum() == 0:
# continue
# box = np.int32(box_xyxy_to_xywh(mask_to_box(mask)))
# if box[2] == 0 and box[3] == 0:
# continue
# # print(box)
# seg_img = get_seg_img(mask, box, image)
# pad_seg_img = cv2.resize(pad_img(seg_img), (256,256))
# seg_img_list.append(pad_seg_img)
# seg_map[mask] = out_obj_id
# out_obj_id_list.append(out_obj_id)
# out_obj_area_list.append(np.count_nonzero(mask))
# out_obj_mask_list.append(mask)
# if len(seg_img_list) == 0:
# cog_seg_maps.append(seg_map)
# continue
# seg_imgs = np.stack(seg_img_list, axis=0) # b,H,W,3
# seg_imgs = torch.from_numpy(seg_imgs).permute(0,3,1,2) # / 255.0
# inputs = siglip_processor(images=seg_imgs, return_tensors="pt")
# inputs = {key: value.to(device) for key, value in inputs.items()}
# image_features = siglip.get_image_features(**inputs)
# image_features = image_features / image_features.norm(dim=-1, keepdim=True)
# image_features = image_features.detach().cpu()
# for i in range(len(out_obj_mask_list)):
# for j in range(i + 1, len(out_obj_mask_list)):
# mask1 = out_obj_mask_list[i]
# mask2 = out_obj_mask_list[j]
# intersection = np.logical_and(mask1, mask2).sum()
# area1 = out_obj_area_list[i]
# area2 = out_obj_area_list[j]
# if min(intersection / area1, intersection / area2) > 0.025:
# conf1 = area1 / (area1 + area2)
# # conf2 = area2 / (area1 + area2)
# image_features[j] = slerp(image_features[j], image_features[i], conf1)
# for i, clip_feat in enumerate(image_features):
# id = out_obj_id_list[i]
# if id in multi_view_clip_feats_map.keys():
# multi_view_clip_feats_map[id].append(clip_feat)
# multi_view_clip_area_map[id].append(out_obj_area_list[i])
# else:
# multi_view_clip_feats_map[id] = [clip_feat]
# multi_view_clip_area_map[id] = [out_obj_area_list[i]]
# cog_seg_maps.append(seg_map)
# del image_features
# for i in range(mask_num):
# if i in multi_view_clip_feats_map.keys():
# clip_feats = multi_view_clip_feats_map[i]
# mask_area = multi_view_clip_area_map[i]
# multi_view_clip_feats[i] = slerp_multiple(torch.stack(clip_feats), np.stack(mask_area))
# else:
# multi_view_clip_feats[i] = torch.zeros((1024))
# multi_view_clip_feats[mask_num] = torch.zeros((1024))
# return cog_seg_maps, rev_cog_seg_maps, multi_view_clip_feats
@spaces.GPU(duration=30)
def get_reconstructed_scene(outdir, filelist, schedule='linear', niter=300, min_conf_thr=3.0,
as_pointcloud=True, mask_sky=False, clean_depth=True, transparent_cams=True, cam_size=0.05,
scenegraph_type='complete', winsize=1, refid=0):
"""
from a list of images, run dust3r inference, global aligner.
then run get_3D_model_from_scene
"""
device = 'cuda' if torch.cuda.is_available() else 'cpu'
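    # Load the MASt3R checkpoint inside the handler so the weights end up on
    # the device allocated for this call by @spaces.GPU.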
MAST3R_CKP = 'naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric'
mast3r = AsymmetricMASt3R.from_pretrained(MAST3R_CKP).to(device)
# sam2 = SAM2VideoPredictor.from_pretrained('facebook/sam2.1-hiera-large', device=device)
# siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
# siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")
# SAM1_DECODER_CKP = './checkpoints/Prompt_guided_Mask_Decoder.pt'
# mobilesamv2 = sam_model_registry['sam_vit_h'](None)
# sam1 = SamModel.from_pretrained('facebook/sam-vit-huge')
# image_encoder = sam1.vision_encoder
# prompt_encoder, mask_decoder = sam_model_registry['prompt_guided_decoder'](SAM1_DECODER_CKP)
# mobilesamv2.prompt_encoder = prompt_encoder
# mobilesamv2.mask_decoder = mask_decoder
# mobilesamv2.image_encoder=image_encoder
# mobilesamv2.to(device=device)
# mobilesamv2.eval()
# YOLO8_CKP='./checkpoints/ObjectAwareModel.pt'
# yolov8 = ObjectAwareModel(YOLO8_CKP)
    if filelist is None or len(filelist) < 2:
        raise gradio.Error("Please input at least 2 images.")
images = Images(filelist=filelist, device=device)
# try:
# cog_seg_maps, rev_cog_seg_maps, cog_feats = get_cog_feats(images, sam2, siglip, siglip_processor, yolov8, mobilesamv2)
# imgs = load_images(images, rev_cog_seg_maps, size=512, verbose=not silent)
# except Exception as e:
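    # Segmentation is disabled in this demo: every pixel is marked as
    # background (-1) and a single all-zero 1024-d feature replaces the
    # per-object SigLIP features.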
rev_cog_seg_maps = []
for tmp_img in images.np_images:
rev_seg_map = -np.ones(tmp_img.shape[:2], dtype=np.int64)
rev_cog_seg_maps.append(rev_seg_map)
cog_seg_maps = rev_cog_seg_maps
cog_feats = torch.zeros((1, 1024))
imgs = load_images(images, rev_cog_seg_maps, size=512, verbose=not silent)
if len(imgs) == 1:
imgs = [imgs[0], copy.deepcopy(imgs[0])]
imgs[1]['idx'] = 1
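    # make_pairs expects the window size / reference id to be encoded in the
    # scene-graph string, e.g. "swin-3" or "oneref-0".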
if scenegraph_type == "swin":
scenegraph_type = scenegraph_type + "-" + str(winsize)
elif scenegraph_type == "oneref":
scenegraph_type = scenegraph_type + "-" + str(refid)
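    # First pass: pairwise MASt3R inference, then global alignment with
    # semantic tuning enabled (tune_flg=True).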
pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
output = inference(pairs, mast3r, device, batch_size=1, verbose=not silent)
mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer
scene_1 = global_aligner(output, cog_seg_maps, rev_cog_seg_maps, cog_feats, device=device, mode=mode, verbose=not silent)
lr = 0.01
# if mode == GlobalAlignerMode.PointCloudOptimizer:
loss = scene_1.compute_global_alignment(tune_flg=True, init='mst', niter=niter, schedule=schedule, lr=lr)
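    # Second pass: feed the first pass's refined views back in, re-run
    # inference and alignment, and fall back to scene_1 if anything fails.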
    ori_imgs = scene_1.ori_imgs  # defined up front so the except branch below cannot hit a NameError
    try:
ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
for i in range(len(imgs)):
# print(imgs[i]['img'].shape, scene.imgs[i].shape, ImgNorm(scene.imgs[i])[None])
imgs[i]['img'] = ImgNorm(scene_1.imgs[i])[None]
pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
output = inference(pairs, mast3r, device, batch_size=1, verbose=not silent)
mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer
scene = global_aligner(output, cog_seg_maps, rev_cog_seg_maps, cog_feats, device=device, mode=mode, verbose=not silent)
ori_imgs = scene.ori_imgs
lr = 0.01
# if mode == GlobalAlignerMode.PointCloudOptimizer:
loss = scene.compute_global_alignment(tune_flg=False, init='mst', niter=niter, schedule=schedule, lr=lr)
except Exception as e:
scene = scene_1
scene.imgs = ori_imgs
scene.ori_imgs = ori_imgs
print(e)
outfile = get_3D_model_from_scene(outdir, scene, min_conf_thr, as_pointcloud, mask_sky,
clean_depth, transparent_cams, cam_size)
scene.to('cpu')
torch.cuda.empty_cache()
return scene, outfile
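# Minimal local-usage sketch (hypothetical paths; the Space itself drives this
# function through the Gradio UI below):
#
#   import tempfile
#   scene, glb_path = get_reconstructed_scene(tempfile.mkdtemp(),
#                                             ['img1.jpg', 'img2.jpg'])
#   print('exported to', glb_path)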
# @spaces.GPU(duration=30)
# def get_3D_object_from_scene(outdir, text, threshold, scene, min_conf_thr, as_pointcloud,
# mask_sky, clean_depth, transparent_cams, cam_size):
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
# siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
# texts = [text]
# inputs = siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
# inputs = {key: value.to(device) for key, value in inputs.items()}
# with torch.no_grad():
# text_feats =siglip.get_text_features(**inputs)
# text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
# scene.render_image(text_feats, threshold)
# scene.ori_imgs = scene.rendered_imgs
# outfile = get_3D_model_from_scene(outdir, scene, min_conf_thr, as_pointcloud, mask_sky,
# clean_depth, transparent_cams, cam_size)
# return outfile
tmpdirname = tempfile.mkdtemp(suffix='pe3r_gradio_demo')
recon_fun = functools.partial(get_reconstructed_scene, tmpdirname)
# model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname)
# get_3D_object_from_scene_fun = functools.partial(get_3D_object_from_scene, tmpdirname)
with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="PE3R Demo") as demo:
    # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
scene = gradio.State(None)
gradio.HTML('<h2 style="text-align: center;">PE3R Demo</h2>')
with gradio.Column():
inputfiles = gradio.File(file_count="multiple")
run_btn = gradio.Button("Reconstruct")
with gradio.Row():
text_input = gradio.Textbox(label="Query Text")
threshold = gradio.Slider(label="Threshold", value=0.85, minimum=0.0, maximum=1.0, step=0.01)
find_btn = gradio.Button("Find")
outmodel = gradio.Model3D()
# events
run_btn.click(fn=recon_fun,
inputs=[inputfiles],
outputs=[scene, outmodel]) # , outgallery
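    # The "Find" wiring below is disabled along with the segmentation branch;
    # the Query Text box and Threshold slider are placeholders until then.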
# find_btn.click(fn=get_3D_object_from_scene_fun,
# inputs=[text_input, threshold, scene, min_conf_thr, as_pointcloud, mask_sky,
# clean_depth, transparent_cams, cam_size],
# outputs=outmodel)
demo.launch(show_error=True, share=None, server_name=None, server_port=None)
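# Running `python app.py` serves the demo with Gradio's defaults; passing None
# for share/server_name/server_port leaves those choices to Gradio.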