hujiecpp committed
Commit 9225e86 · 1 Parent(s): d1813b6

init project
Files changed (2):
  1. app.py +25 -14
  2. modules/pe3r/models.py +3 -3
app.py CHANGED

@@ -37,10 +37,12 @@ from modules.mobilesamv2.utils.transforms import ResizeLongestSide
 from modules.pe3r.models import Models
 import torchvision.transforms as tvf
 
+from transformers import AutoTokenizer, AutoModel, AutoProcessor
+
 silent = False
-device = 'cuda' if torch.cuda.is_available() else 'cpu' #'cpu' #
-pe3r = Models(device) #
-print(device)
+# device = 'cuda' if torch.cuda.is_available() else 'cpu' #'cpu' #
+pe3r = Models('cpu') #
+# print(device)
 
 def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
                                  cam_color=None, as_pointcloud=False,
@@ -245,7 +247,9 @@ def slerp_multiple(vectors, t_values):
 @torch.no_grad
 def get_mask_from_img_sam1(sam1_image, yolov8_image, original_size, input_size, transform):
 
-    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    pe3r.yolov8.to(device)
+    pe3r.mobilesamv2.to(device)
 
     sam_mask=[]
     img_area = original_size[0] * original_size[1]
@@ -301,7 +305,10 @@ def get_mask_from_img_sam1(sam1_image, yolov8_image, original_size, input_size,
 @torch.no_grad
 def get_cog_feats(images):
 
-    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    pe3r.sam2.to(device)
+    siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+    siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")
 
     cog_seg_maps = []
     rev_cog_seg_maps = []
@@ -395,10 +402,10 @@ def get_cog_feats(images):
     seg_imgs = np.stack(seg_img_list, axis=0) # b,H,W,3
     seg_imgs = torch.from_numpy(seg_imgs).permute(0,3,1,2) # / 255.0
 
-    inputs = pe3r.siglip_processor(images=seg_imgs, return_tensors="pt")
+    inputs = siglip_processor(images=seg_imgs, return_tensors="pt")
     inputs = {key: value.to(device) for key, value in inputs.items()}
 
-    image_features = pe3r.siglip.get_image_features(**inputs)
+    image_features = siglip.get_image_features(**inputs)
     image_features = image_features / image_features.norm(dim=-1, keepdim=True)
     image_features = image_features.detach().cpu()
 
@@ -438,7 +445,7 @@ def get_cog_feats(images):
     return cog_seg_maps, rev_cog_seg_maps, multi_view_clip_feats
 
 
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=60)
 def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
                             as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
                             scenegraph_type, winsize, refid):
@@ -447,7 +454,9 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
     then run get_3D_model_from_scene
     """
 
-    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    pe3r.mast3r.to(device)
 
     if len(filelist) < 2:
         raise gradio.Error("Please input at least 2 images.")
@@ -505,22 +514,24 @@ def get_reconstructed_scene(outdir, filelist, schedule, niter, min_conf_thr,
     outfile = get_3D_model_from_scene(outdir, scene, min_conf_thr, as_pointcloud, mask_sky,
                                       clean_depth, transparent_cams, cam_size)
 
-    scene.to('cpu')
+    # scene.to('cpu')
     torch.cuda.empty_cache()
 
     return scene, outfile
 
-@spaces.GPU(duration=120)
+# @spaces.GPU(duration=60)
 def get_3D_object_from_scene(outdir, text, threshold, scene, min_conf_thr, as_pointcloud,
                              mask_sky, clean_depth, transparent_cams, cam_size):
 
-    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
+    siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
 
     texts = [text]
-    inputs = pe3r.siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
+    inputs = siglip_tokenizer(text=texts, padding="max_length", return_tensors="pt")
     inputs = {key: value.to(device) for key, value in inputs.items()}
     with torch.no_grad():
-        text_feats = pe3r.siglip.get_text_features(**inputs)
+        text_feats = siglip.get_text_features(**inputs)
     text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
     scene.render_image(text_feats, threshold)
     scene.ori_imgs = scene.rendered_imgs
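
Taken together, the app.py changes follow the usual ZeroGPU pattern for Hugging Face Spaces: build every model on CPU at import time (ZeroGPU Spaces expose no CUDA device outside GPU-scoped calls), then move weights to CUDA inside the @spaces.GPU-decorated handlers that actually need them. A minimal sketch of that pattern, assuming the spaces package is installed; MyModels and run are illustrative names, not PE3R code:

import torch
import spaces  # pip install spaces; provided by default on HF ZeroGPU Spaces

class MyModels:  # illustrative stand-in for modules.pe3r.models.Models
    def __init__(self, device):
        # Build everything on CPU: no CUDA device exists at import time
        # on ZeroGPU, only inside @spaces.GPU-decorated calls.
        self.net = torch.nn.Linear(4, 4).to(device)

models = MyModels('cpu')  # module-level init, mirroring pe3r = Models('cpu')

@spaces.GPU(duration=60)  # a GPU is attached only while this call runs
def run(x: torch.Tensor) -> torch.Tensor:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    models.net.to(device)  # move weights onto the just-attached GPU
    return models.net(x.to(device)).cpu()

Requesting the GPU per call, and dropping the decorator's duration from 120 s to 60 s, reduces the quota each invocation reserves; the trade-off is a host-to-device weight transfer on every call.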
modules/pe3r/models.py CHANGED

@@ -47,6 +47,6 @@ class Models:
         self.yolov8 = ObjectAwareModel(YOLO8_CKP)
 
         # -- siglip --
-        self.siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
-        self.siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256", device_map=device)
-        self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+        # self.siglip = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
+        # self.siglip_tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
+        # self.siglip_processor = AutoProcessor.from_pretrained("google/siglip-large-patch16-256")
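
On the models.py side, SigLIP is no longer constructed in Models.__init__; each handler loads it on demand (as get_cog_feats and get_3D_object_from_scene now do above), so no CUDA-placed weights linger on the shared pe3r object between GPU-scoped calls. The commit also stops passing device_map to the tokenizer and processor, which hold no weights to place. A sketch of the on-demand text path, assuming only torch and transformers; embed_text is an illustrative name:

import torch
from transformers import AutoModel, AutoTokenizer

def embed_text(text: str) -> torch.Tensor:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Loaded at call time, inside the GPU-scoped function, rather than
    # cached on a module-level Models object that was built on CPU.
    tokenizer = AutoTokenizer.from_pretrained("google/siglip-large-patch16-256")
    model = AutoModel.from_pretrained("google/siglip-large-patch16-256", device_map=device)
    inputs = tokenizer(text=[text], padding="max_length", return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        feats = model.get_text_features(**inputs)
    return feats / feats.norm(dim=-1, keepdim=True)  # unit-normalize, as app.py does

Reloading per call trades startup latency for a smaller resident footprint; from_pretrained reuses the local Hugging Face cache, so repeat loads avoid re-downloading the weights.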