Spaces:
Runtime error
Runtime error
File size: 6,621 Bytes
9de012e 0cfc205 9de012e 0cfc205 9de012e 0cfc205 9de012e 0cfc205 9de012e f046265 9de012e 55cc62e 9de012e 55cc62e 9de012e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import gradio as gr
import os
import re
import spaces
from leo.inference import inference
# Directory that holds one mesh file per selectable scene.
MESH_DIR = 'assets/mesh'
# Scene names offered in the dropdown: the base names of the `.glb` files in
# MESH_DIR, sorted. Only `.glb` entries are kept because `change_scene`
# always resolves a name to `<name>.glb`; any other file in the directory
# (hidden files, textures, ...) would otherwise show up as a choice that
# points at a mesh that does not exist.
MESH_NAMES = sorted(
    stem
    for stem, ext in map(os.path.splitext, os.listdir(MESH_DIR))
    if ext == '.glb'
)
# Number of action-step slots (step textbox / 3D viewer pairs) in the UI.
STEP_COUNTS = 6
def change_scene(dropdown_scene: str):
    """Return the mesh path for the scene selected in the dropdown.

    The result is ``<MESH_DIR>/<dropdown_scene>.glb``.
    """
    mesh_filename = '{}.glb'.format(dropdown_scene)
    return os.path.join(MESH_DIR, mesh_filename)
# Page header section: title banner, scene dropdown, 3D viewer and usage hint.
# NOTE(review): the nesting indentation of this `with` block is missing in
# this copy of the file — restore it before running.
with gr.Blocks(title='LEO Demo') as demo:
gr.HTML(value="<h1 align='center'>Task-oriented Sequential Grounding in 3D Scenes </h1>")
with gr.Row():
with gr.Column(scale=5):
# Scene picker; choices come from the file names found in MESH_DIR.
# NOTE(review): the inference wrappers below always use scene0050_00,
# independent of what is selected here.
dropdown_scene = gr.Dropdown(
choices=MESH_NAMES,
value='scene0050_00',
interactive=True,
label='Select a 3D scene',
)
# Viewer for the selected scene mesh; starts on the same default scene as
# the dropdown.
model_3d = gr.Model3D(
value=os.path.join(MESH_DIR, f'scene0050_00.glb'),
clear_color=[0.0, 0.0, 0.0, 0.0],
label='3D Scene',
camera_position=(80, 100, 6),
height=659,
)
# Static hint explaining the mouse controls of the viewer.
gr.HTML(
"""<center><strong>
👆 SCROLL and DRAG on the 3D Scene
to zoom in/out and rotate. Press CTRL and DRAG to pan.
</strong></center>
"""
)
# Reload the viewer with the newly selected scene's mesh path.
dropdown_scene.change(
fn=change_scene,
inputs=[dropdown_scene],
outputs=[model_3d],
# presumably bypasses the request queue for this lightweight callback —
# confirm against the Gradio version in use
queue=False
)
# LEO task-to-plan inference wrapper
@spaces.GPU
def leo_task_to_plan(task_description):
    """Generate the plan text for a task in scene ``scene0050_00``.

    Args:
        task_description: Text describing the task to plan for.

    Returns:
        The ``'pred_plan_text'`` entry of the first element returned by
        ``inference`` when called with ``predict_mode=True``.
    """
    scan_id = "scene0050_00"
    request = {"task_description": task_description, "scan_id": scan_id}
    predictions = inference(scan_id, request, predict_mode=True)
    return predictions[0]['pred_plan_text']
# LEO ground inference wrapper
@spaces.GPU
def leo_plan_to_masks(task_description, *action_steps):
    """Ground the steps of a plan to per-step target-object mask meshes.

    Blank step slots are left out of the model input, and every predicted
    mask is written back to the slot its step came from, so viewer *i*
    always belongs to step *i* even when an earlier slot was left empty.

    Args:
        task_description: Text describing the overall task.
        *action_steps: Step texts in slot order. ``None``, empty and
            whitespace-only entries count as "no step".

    Returns:
        A list of exactly ``STEP_COUNTS`` ``.glb`` paths, one per step slot.
        A slot without a step (or without a prediction) gets the empty mask.
    """
    scan_id = "scene0050_00"
    mask_paths = [f"assets/mask/{scan_id}/{scan_id}_obj_empty.glb"] * STEP_COUNTS

    # Keep the slot index next to each real step so predictions can be put
    # back where the step was entered.
    submitted = [
        (slot, step)
        for slot, step in enumerate(action_steps)
        if step and step.strip()
    ]
    if not submitted:
        # Nothing to ground: skip the model call entirely.
        return mask_paths

    task_input = {
        "task_description": task_description,
        "action_steps": [
            {"action": step, "target_id": "unknown", "label": "unknown"}
            for _, step in submitted
        ],
        "scan_id": scan_id,
    }
    masks = inference(scan_id, task_input, predict_mode=False)

    # Assumes `inference` yields one prediction per submitted step, in order
    # (the same pairing the UI labels rely on) — TODO confirm.
    for (slot, _), prediction in zip(submitted, masks):
        if slot < STEP_COUNTS:  # never return more paths than there are viewers
            object_id = prediction.item()
            mask_paths[slot] = f"assets/mask/{scan_id}/{scan_id}_obj_{object_id}.glb"
    return mask_paths
# LEO task-to-plan and ground inference wrapper
@spaces.GPU
def leo_task_to_plan_and_masks(task_description):
    """Plan a task in scene ``scene0050_00`` and pair each step with a mask.

    Args:
        task_description: Text describing the task to plan for.

    Returns:
        A flat list of ``2 * STEP_COUNTS`` items alternating step text and
        mask ``.glb`` path: ``[step_1, mask_1, step_2, mask_2, ...]``.
        Missing steps are filled with a placeholder sentence and missing
        masks with the empty-mask mesh.
    """
    scan_id = "scene0050_00"
    request = {"task_description": task_description, "scan_id": scan_id}
    prediction = inference(scan_id, request, predict_mode=True)[0]

    # Splitting on a captured "<number>." yields
    # [prefix, "1.", text, "2.", text, ...]; drop the prefix and glue each
    # number back onto the text that follows it.
    tokens = re.split(r'(\d+\.)', prediction['pred_plan_text'])[1:]
    steps = [
        number + text.rstrip()
        for number, text in zip(tokens[0::2], tokens[1::2])
    ]
    missing_steps = STEP_COUNTS - len(steps)
    steps.extend(["### PLANNING HAS ENDED, SEE ABOVE FOR DETAILS ###"] * missing_steps)

    object_ids = prediction['predict_object_id']
    mask_paths = [
        f"assets/mask/scene0050_00/scene0050_00_obj_{object_id}.glb"
        for object_id in object_ids
    ]
    # Fill the remaining slots with the empty mask.
    missing_masks = STEP_COUNTS - len(object_ids)
    mask_paths.extend(["assets/mask/scene0050_00/scene0050_00_obj_empty.glb"] * missing_masks)

    # Interleave the first STEP_COUNTS steps and masks to match the order of
    # the output components.
    return [
        item
        for pair in zip(steps[:STEP_COUNTS], mask_paths[:STEP_COUNTS])
        for item in pair
    ]
# with gr.Tab("LEO Task-to-Plan"):
# gr.Interface(
# fn=leo_task_to_plan,
# inputs=[gr.Textbox(label="Task Description")],
# outputs=["text"],
# examples=[
# ["Freshen up in the bathroom."]
# ],
# title="LEO Task-to-Plan: Input task, Output plan text"
# )
with gr.Tab("LEO Plan-to-Masks"):
    # Components are created in the same order as a single inline call would
    # create them: task box, step boxes, then one viewer per step.
    plan_inputs = [gr.Textbox(label="Task Description")]
    plan_inputs.extend(
        gr.Textbox(label=f"Action Step {i+1}") for i in range(STEP_COUNTS)
    )
    plan_viewers = [
        gr.Model3D(
            clear_color=[0.0, 0.0, 0.0, 0.0],
            camera_position=(80, 100, 6),
            label=f"3D Model for Step {i+1} (if the step exists)",
        )
        for i in range(STEP_COUNTS)
    ]

    # Example row: a task plus four steps, padded with empty strings so it
    # has one value per input component.
    desk_example = [
        "Start Working at the desk.",
        "1. Walk to the desk.",
        "2. Sit on the brown leather sofa chair in front of the desk.",
        "3. Turn on the opened laptop in front of you on the desk.",
        "4. Grab the cup beside the laptop to drink.",
    ]
    desk_example += [""] * (STEP_COUNTS - 4)

    gr.Interface(
        fn=leo_plan_to_masks,
        inputs=plan_inputs,
        outputs=plan_viewers,
        examples=[desk_example],
        title="LEO Plan-to-Masks: Input plan, Output 3D Masks for each step, Red denotes predicted target object",
    )
with gr.Tab("LEO Task-to-Plan and Masks"):
    # Create the components in the same order as a single inline call would:
    # task box first, then all step boxes, then all viewers.
    task_inputs = [gr.Textbox(label="Task Description")]
    step_boxes = [
        gr.Textbox(label=f"Action Step {i+1}") for i in range(STEP_COUNTS)
    ]
    step_viewers = [
        gr.Model3D(
            clear_color=[0.0, 0.0, 0.0, 0.0],
            camera_position=(80, 100, 6),
            label=f"3D Model for Step {i+1} (if the step exists)",
        )
        for i in range(STEP_COUNTS)
    ]

    # Outputs alternate step text and viewer, matching the flat
    # [step, mask, step, mask, ...] list the callback returns.
    paired_outputs = []
    for step_box, step_viewer in zip(step_boxes, step_viewers):
        paired_outputs.extend((step_box, step_viewer))

    gr.Interface(
        fn=leo_task_to_plan_and_masks,
        inputs=task_inputs,
        outputs=paired_outputs,
        examples=[["Start Working at the desk."]],
        title="LEO Task-to-Plan and Masks: Input task, Output plan text and 3D Masks for each step, Red denotes predicted target object",
    )
# Turn on request queuing for the app, then start the server. The `assets`
# directory is listed in allowed_paths; the mesh and mask files returned by
# the callbacks live under it.
_queued_demo = demo.queue()
_queued_demo.launch(share=True, allowed_paths=['assets'])
|