Bachmann Roman Christian committed
Commit 758da21 · 1 Parent(s): 100478a

Changed number of tokens to percentages of tokens

Files changed (1): app.py +13 -8
app.py CHANGED
@@ -292,6 +292,11 @@ def plot_predictions(input_dict, preds, masks, image_size=224):
 
 
 def inference(img, num_tokens, manual_mode, num_rgb, num_depth, num_semseg, seed):
+    num_tokens = int(588 * num_tokens / 100.0)
+    num_rgb = int(196 * num_rgb / 100.0)
+    num_depth = int(196 * num_depth / 100.0)
+    num_semseg = int(196 * num_semseg / 100.0)
+
     im = Image.open(img)
 
     # Center crop and resize RGB
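Review note: the four added lines map the new 0-100 percentage sliders back to the integer token budgets the model expects. Below is a minimal stand-alone sketch of that arithmetic, assuming MultiMAE's usual 224x224 inputs split into 16x16 patches; the pct_to_tokens helper and the constants are illustrative, not part of app.py.

# (224 // 16) ** 2 = 196 patches per modality; three modalities
# (RGB, depth, semantic segmentation) give 3 * 196 = 588 in total.
PATCHES_PER_MODALITY = (224 // 16) ** 2     # 196
TOTAL_PATCHES = 3 * PATCHES_PER_MODALITY    # 588

def pct_to_tokens(pct, total):
    # Same rounding as the added lines: truncate toward zero via int().
    return int(total * pct / 100.0)

assert pct_to_tokens(15, TOTAL_PATCHES) == 88          # new global default
assert pct_to_tokens(15, PATCHES_PER_MODALITY) == 29   # new per-modality default

With the new 15% default, the global budget becomes 88 of 588 tokens, slightly below the previous default of 98.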
@@ -367,7 +372,7 @@ title = "MultiMAE"
 description = "Gradio demo for MultiMAE: Multi-modal Multi-task Masked Autoencoders. \
 Upload your own images or try one of the examples below to explore the multi-modal masked reconstruction of a pre-trained MultiMAE model. \
 Uploaded images are pseudo labeled using a DPT trained on Omnidata depth, and a Mask2Former trained on COCO. \
-Choose the number of visible tokens using the sliders below and see how MultiMAE reconstructs the modalities!"
+Choose the percentage of visible tokens using the sliders below and see how MultiMAE reconstructs the modalities!"
 
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.01678' \
 target='_blank'>MultiMAE: Multi-modal Multi-task Masked Autoencoders</a> | \
@@ -381,20 +386,20 @@ os.system("wget https://i.imgur.com/KTKgYKi.jpg")
 os.system("wget https://i.imgur.com/lWYuRI7.jpg")
 
 examples = [
-    ['c9ObJdK.jpg', 98, False, 32, 32, 32, 0],
-    ['KTKgYKi.jpg', 98, False, 32, 32, 32, 0],
-    ['lWYuRI7.jpg', 98, False, 32, 32, 32, 0],
+    ['c9ObJdK.jpg', 15, False, 15, 15, 15, 0],
+    ['KTKgYKi.jpg', 15, False, 15, 15, 15, 0],
+    ['lWYuRI7.jpg', 15, False, 15, 15, 15, 0],
 ]
 
 gr.Interface(
     fn=inference,
     inputs=[
         gr.inputs.Image(label='RGB input image', type='filepath'),
-        gr.inputs.Slider(label='Number of input tokens', default=98, step=1, minimum=0, maximum=588),
+        gr.inputs.Slider(label='Percentage of input tokens', default=15, step=0.1, minimum=0, maximum=100),
         gr.inputs.Checkbox(label='Manual mode: Check this to manually set the number of input tokens per modality using the sliders below', default=False),
-        gr.inputs.Slider(label='Number of RGB input tokens (for manual mode only)', default=32, step=1, minimum=0, maximum=196),
-        gr.inputs.Slider(label='Number of depth input tokens (for manual mode only)', default=32, step=1, minimum=0, maximum=196),
-        gr.inputs.Slider(label='Number of semantic input tokens (for manual mode only)', default=32, step=1, minimum=0, maximum=196),
+        gr.inputs.Slider(label='Percentage of RGB input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
+        gr.inputs.Slider(label='Percentage of depth input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
+        gr.inputs.Slider(label='Percentage of semantic input tokens (for manual mode only)', default=15, step=0.1, minimum=0, maximum=100),
         gr.inputs.Number(label='Random seed: Change this to sample different masks (for manual mode only)', default=0),
     ],
     outputs=[
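For reference, here is a minimal runnable sketch of the updated slider wiring, using the same legacy gr.inputs API that app.py itself uses (Gradio 2.x-style); echo_budgets is a hypothetical stand-in for the real inference function and simply reports the converted token counts.

import gradio as gr

def echo_budgets(img, num_tokens, manual_mode, num_rgb, num_depth, num_semseg, seed):
    # Same percentage-to-token conversion as the new code in app.py.
    num_tokens = int(588 * num_tokens / 100.0)
    num_rgb = int(196 * num_rgb / 100.0)
    num_depth = int(196 * num_depth / 100.0)
    num_semseg = int(196 * num_semseg / 100.0)
    return f'{num_tokens} total | {num_rgb} RGB | {num_depth} depth | {num_semseg} semseg'

gr.Interface(
    fn=echo_budgets,
    inputs=[
        gr.inputs.Image(label='RGB input image', type='filepath'),
        gr.inputs.Slider(label='Percentage of input tokens', default=15, step=0.1, minimum=0, maximum=100),
        gr.inputs.Checkbox(label='Manual mode', default=False),
        gr.inputs.Slider(label='Percentage of RGB input tokens', default=15, step=0.1, minimum=0, maximum=100),
        gr.inputs.Slider(label='Percentage of depth input tokens', default=15, step=0.1, minimum=0, maximum=100),
        gr.inputs.Slider(label='Percentage of semantic input tokens', default=15, step=0.1, minimum=0, maximum=100),
        gr.inputs.Number(label='Random seed', default=0),
    ],
    outputs='text',
).launch()

One design note: with step=0.1 on a 0-100 scale, a single slider step moves the global budget by at most one token (0.1% of 588 is about 0.6 tokens), so the percentage sliders are at least as fine-grained as the old integer token sliders.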