init project

Changed files:
- app.py +81 -33
- modules/pe3r/__pycache__/models.cpython-312.pyc +0 -0
app.py CHANGED

@@ -43,6 +43,7 @@ from modules.mobilesamv2 import sam_model_registry
 from sam2.sam2_video_predictor import SAM2VideoPredictor
 from modules.mast3r.model import AsymmetricMASt3R
 
+from torch.nn.functional import cosine_similarity
 
 silent = False
 
@@ -448,6 +449,44 @@ def get_cog_feats(images, sam2, siglip, siglip_processor, yolov8, mobilesamv2):
     return cog_seg_maps, rev_cog_seg_maps, multi_view_clip_feats
 
 
+class Scene_cpu:
+    def __init__(self, fix_imgs, cogs, focals, cams2world, pts3d, min_conf_thr, msk):
+        self.fix_imgs = fix_imgs
+        self.cogs = cogs
+        self.focals = focals
+        self.cams2world = cams2world
+        self.pts3d = pts3d
+        self.min_conf_thr = min_conf_thr
+        self.msk = msk
+
+    def render_image(self, text_feats, threshold=0.85):
+        self.rendered_imgs = []
+        # Collect all cosine similarities to compute min-max normalization
+        all_similarities = []
+        for each_cog in self.cogs:
+            similarity_map = cosine_similarity(each_cog, text_feats.unsqueeze(1), dim=-1)
+            all_similarities.append(similarity_map.squeeze().numpy())
+        # Flatten and normalize all similarities
+        total_similarities = np.concatenate(all_similarities)
+        min_sim, max_sim = total_similarities.min(), total_similarities.max()
+        normalized_similarities = [(sim - min_sim) / (max_sim - min_sim) for sim in all_similarities]
+        # Process each image with normalized similarities
+        for i, (each_cog, heatmap) in enumerate(zip(self.cogs, normalized_similarities)):
+            mask = heatmap > threshold
+            # Scale heatmap for visualization
+            heatmap = np.uint8(255 * heatmap)
+            heatmap_color = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
+            # Prepare image
+            image = self.fix_imgs[i]
+            image = image * 255.0
+            image = np.clip(image, 0, 255).astype(np.uint8)
+            # Apply mask and overlay the heatmap on masked areas
+            mask_indices = np.where(mask)  # Get indices where mask is True
+            heatmap_color[mask_indices[0], mask_indices[1]] = [0, 0, 255]  # Red (BGR) for masked regions
+            superimposed_img = np.where(np.expand_dims(mask, axis=-1), heatmap_color, image) / 255.0
+            self.rendered_imgs.append(superimposed_img)
+
+
 @spaces.GPU(duration=120)
 def get_reconstructed_scene(outdir, filelist, schedule='linear', niter=300, min_conf_thr=3.0,
                             as_pointcloud=True, mask_sky=False, clean_depth=True, transparent_cams=True, cam_size=0.05,
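Note: Scene_cpu.render_image expects L2-normalized text features and per-view feature maps that already live on the CPU. A minimal sketch of how it might be driven, assuming the Scene_cpu class above (and app.py's cv2/numpy imports) are in scope; the shapes and dummy tensors are illustrative assumptions, not values from the repo:

import numpy as np
import torch

H, W, D = 64, 64, 1024                                    # hypothetical feature-map size
cogs = [torch.randn(H, W, D) for _ in range(2)]           # per-pixel features, one map per view
fix_imgs = [np.random.rand(H, W, 3) for _ in range(2)]    # float RGB images in [0, 1]

text_feats = torch.randn(1, D)
text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)  # L2-normalize, as the demo does

scene = Scene_cpu(fix_imgs, cogs, focals=None, cams2world=None,
                  pts3d=None, min_conf_thr=3.0, msk=None)
scene.render_image(text_feats, threshold=0.85)
# scene.rendered_imgs now holds one heatmap-overlaid image per view

Because render_image min-max normalizes the similarities pooled across all views before thresholding, the 0.85 default selects the top of the pooled similarity range rather than an absolute cosine value.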
@@ -540,37 +579,46 @@ def get_reconstructed_scene(outdir, filelist, schedule='linear', niter=300, min_
 
     torch.cuda.empty_cache()
 
+    fix_imgs = []
+    for img in scene.fix_imgs:
+        fix_imgs.append(img)
+    cogs = []
+    for cog in scene.cogs:
+        cog_cpu = cog.cpu()
+        cogs.append(cog_cpu)
+    focals = scene.get_focals().cpu()
+    cams2world = scene.get_im_poses().cpu()
+    pts3d = to_numpy(scene.get_pts3d())
+    min_conf_thr = float(scene.conf_trf(torch.tensor(3.0)))
+    msk = to_numpy(scene.get_masks())
+    scene_cpu = Scene_cpu(fix_imgs, cogs, focals, cams2world, pts3d, min_conf_thr, msk)
 
-    return outfile
+    return scene_cpu, outfile
 
-# def get_3D_object_from_scene(outdir, text, threshold, scene, min_conf_thr=3.0, as_pointcloud=True,
-#                              mask_sky=False, clean_depth=True, transparent_cams=True, cam_size=0.05):
-
-#     device = 'cpu'
-#     siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
-#     siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
-
-#     texts = [text]
-#     inputs = siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
-#     inputs = {key: value.to(device) for key, value in inputs.items()}
-#     with torch.no_grad():
-#         text_feats = siglip.get_text_features(**inputs)
-#         text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
-#     scene.render_image(text_feats, threshold)
-#     scene.ori_imgs = scene.rendered_imgs
-
-#     rgbimg = scene.ori_imgs
-#     focals = scene.get_focals().cpu()
-#     cams2world = scene.get_im_poses().cpu()
-#     # 3D pointcloud from depthmap, poses and intrinsics
-#     pts3d = to_numpy(scene.get_pts3d())
-#     scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
-#     msk = to_numpy(scene.get_masks())
-#     return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
-#                                         transparent_cams=transparent_cams, cam_size=cam_size)
+def get_3D_object_from_scene(outdir, text, threshold, scene, min_conf_thr=3.0, as_pointcloud=True,
+                             mask_sky=False, clean_depth=True, transparent_cams=True, cam_size=0.05):
+
+    device = 'cpu'
+    siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
+    siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+
+    texts = [text]
+    inputs = siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
+    inputs = {key: value.to(device) for key, value in inputs.items()}
+    with torch.no_grad():
+        text_feats = siglip.get_text_features(**inputs)
+        text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
+    scene.render_image(text_feats, threshold)
+    scene.ori_imgs = scene.rendered_imgs
+    rgbimg = scene.ori_imgs
+    focals = scene.focals
+    cams2world = scene.cams2world
+    # 3D pointcloud from depthmap, poses and intrinsics
+    pts3d = scene.pts3d
+    msk = scene.msk
+    return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
+                                        transparent_cams=transparent_cams, cam_size=cam_size)
 
 
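Taken together, this hunk splits the demo into a GPU phase and a CPU phase: get_reconstructed_scene runs under @spaces.GPU and hands back a Scene_cpu holding only CPU tensors, so later text queries can run without a GPU allocation. A hedged sketch of the resulting call flow; the output directory and file names are placeholders, and the remaining keyword arguments are assumed to keep their defaults:

outdir = "/tmp/pe3r_demo"                  # hypothetical output directory
scene_cpu, glb_path = get_reconstructed_scene(outdir, ["view1.png", "view2.png"])
obj_glb = get_3D_object_from_scene(outdir, text="a red chair",
                                   threshold=0.85, scene=scene_cpu)

One design note: get_3D_object_from_scene reloads the SigLIP tokenizer and model on every query; hoisting the two from_pretrained calls to module scope would avoid repeating that cost per click.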
@@ -579,11 +627,11 @@ tmpdirname = tempfile.mkdtemp(suffix='pe3r_gradio_demo')
 
 recon_fun = functools.partial(get_reconstructed_scene, tmpdirname)
 # model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname)
-
+get_3D_object_from_scene_fun = functools.partial(get_3D_object_from_scene, tmpdirname)
 
 with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="PE3R Demo") as demo:
     # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
-
+    scene = gradio.State(None)
 
     gradio.HTML('<h2 style="text-align: center;">PE3R Demo</h2>')
     with gradio.Column():
@@ -602,9 +650,9 @@ with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 10
 
     run_btn.click(fn=recon_fun,
                   inputs=[inputfiles],
-                  outputs=[outmodel])  # , outgallery,
+                  outputs=[scene, outmodel])  # , outgallery,
 
-
-
-
+    find_btn.click(fn=get_3D_object_from_scene_fun,
+                   inputs=[text_input, threshold, scene],
+                   outputs=outmodel)
     demo.launch(show_error=True, share=None, server_name=None, server_port=None)
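The wiring above relies on gradio.State to pass the reconstructed scene from run_btn's callback into find_btn's: the first output of recon_fun fills the state, which is then fed back in as an input (find_btn, text_input, and threshold are assumed to be defined in a part of the Blocks layout not shown in this diff). A self-contained sketch of that handoff pattern, with hypothetical names (make_state/use_state are illustrative, not from the repo):

import gradio

def make_state(x):
    # First return value feeds the State component, second a visible output.
    return {"value": x}, f"stored {x!r}"

def use_state(state):
    return f"state holds {state['value']!r}"

with gradio.Blocks() as demo:
    state = gradio.State(None)        # per-session, survives across callbacks
    inp = gradio.Textbox(label="input")
    out = gradio.Textbox(label="output")
    store_btn = gradio.Button("store")
    read_btn = gradio.Button("read")
    store_btn.click(fn=make_state, inputs=[inp], outputs=[state, out])
    read_btn.click(fn=use_state, inputs=[state], outputs=out)

demo.launch()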
modules/pe3r/__pycache__/models.cpython-312.pyc CHANGED

Binary files a/modules/pe3r/__pycache__/models.cpython-312.pyc and b/modules/pe3r/__pycache__/models.cpython-312.pyc differ