"""Gradio demo: step a GenieSimulator interactively from a prompt image.

On start-up the script makes sure the checkpoint directory exists (downloading
it with gdown when missing), builds one module-level ``GenieSimulator`` and
seeds it with a default prompt frame.  When run as ``__main__`` it serves a
small UI with a prompt-image dropdown and four direction buttons.
"""
import os
import shutil

import gradio as gr
# NOTE(review): `spaces` is kept ahead of the simulator import, as in the
# original ordering; presumably sim.simulator pulls in CUDA libraries -- confirm.
import spaces
import numpy as np
from PIL import Image
import cv2

from sim.simulator import GenieSimulator

CKPT_ROOT = "data/mar_ckpt/"
CKPT_DIR = "data/mar_ckpt/langtable"

if not os.path.exists(CKPT_DIR):
    # download from google drive
    import gdown

    gdown.download_folder("https://drive.google.com/drive/u/2/folders/1XU87cRqV-IMZA6RLiabIR_uZngynvUFN")
    # The original shell command (`mv langtable data/mar_ckpt/`) implies the
    # download lands in ./langtable.  Using os/shutil instead of os.system
    # makes a failed move raise here rather than being silently ignored.
    os.makedirs(CKPT_ROOT, exist_ok=True)
    shutil.move("langtable", CKPT_ROOT)

RES = 512
PROMPT_HORIZON = 3
IMAGE_DIR = "sim/assets/langtable_prompt/"

# Load available images
available_images = sorted([img for img in os.listdir(IMAGE_DIR) if img.endswith(".png")])

# Per-button action passed to genie.step(); values are copied unchanged from
# the original if/elif chain.
_DIRECTION_TO_ACTION = {
    "right": (0, 0.05),
    "left": (0, -0.05),
    "down": (0.05, 0),
    "up": (-0.05, 0),
}


def _make_prompt(image, genie, num_actions):
    """Build the ``(prompt_image, prompt_action)`` pair for ``set_initial_state``.

    Args:
        image: Anything ``np.array`` accepts (here a PIL image).
        genie: Simulator providing ``prompt_horizon`` and ``action_stride``.
        num_actions: Length of the first axis of the all-zero action prompt.

    Returns:
        Tuple of a uint8 array holding ``image`` tiled ``genie.prompt_horizon``
        times along a new leading axis (for an H x W x C image this is
        ``(prompt_horizon, H, W, C)``) and a float32 zero array of shape
        ``(num_actions, genie.action_stride, 2)``.
    """
    prompt_image = np.tile(np.array(image), (genie.prompt_horizon, 1, 1, 1)).astype(np.uint8)
    prompt_action = np.zeros((num_actions, genie.action_stride, 2), dtype=np.float32)
    return prompt_image, prompt_action


# Helper function to reset GenieSimulator with the selected image
@spaces.GPU
def initialize_simulator(image_name, genie):
    """Re-seed ``genie`` from a prompt image and return its first frame.

    Args:
        image_name: File name of an image inside ``IMAGE_DIR``.
        genie: The simulator to reset.

    Returns:
        The frame returned by ``genie.reset()``, resized to ``RES`` x ``RES``
        and wrapped in a PIL image.
    """
    image_path = os.path.join(IMAGE_DIR, image_name)
    image = Image.open(image_path)
    # NOTE(review): this path uses prompt_horizon - 1 action steps while the
    # start-up initialisation below uses prompt_horizon; both are preserved
    # as written -- confirm which length the simulator expects.
    prompt_image, prompt_action = _make_prompt(image, genie, genie.prompt_horizon - 1)
    genie.set_initial_state((prompt_image, prompt_action))
    reset_image = genie.reset()
    reset_image = cv2.resize(reset_image, (RES, RES))
    return Image.fromarray(reset_image)


@spaces.GPU
def model(direction, genie):
    """Advance ``genie`` one step in ``direction`` and return the new frame.

    Args:
        direction: One of ``'right'``, ``'left'``, ``'down'``, ``'up'``.
        genie: The simulator to step.

    Returns:
        ``genie.step(action)['pred_next_frame']`` resized to ``RES`` x ``RES``
        as a PIL image.

    Raises:
        ValueError: If ``direction`` is not one of the four known values.
    """
    delta = _DIRECTION_TO_ACTION.get(direction)
    if delta is None:
        raise ValueError(f"Invalid direction: {direction}")
    action = np.array(delta)
    next_image = genie.step(action)['pred_next_frame']
    next_image = cv2.resize(next_image, (RES, RES))
    return Image.fromarray(next_image)


@spaces.GPU
def handle_input(direction):
    """Button callback: step the module-level simulator in ``direction``."""
    print(f"User clicked: {direction}")
    # Fix: the original called `genie(direction)`, bypassing `model`, which is
    # the function that maps a direction to an action and steps the simulator.
    new_image = model(direction, genie)
    return new_image


@spaces.GPU
def handle_image_selection(image_name, state=None):
    """Callback for "Load Image": reset the simulator from ``image_name``.

    Args:
        image_name: File name chosen in the dropdown.
        state: Simulator to reset.  Defaults to the module-level ``genie`` so
            the UI only needs to pass the dropdown value.

    Returns:
        The reset frame as a PIL image.
    """
    print(f"User selected image: {image_name}")
    simulator = genie if state is None else state
    return initialize_simulator(image_name, simulator)


genie = GenieSimulator(
    image_encoder_type='temporalvae',
    image_encoder_ckpt='stabilityai/stable-video-diffusion-img2vid',
    quantize=False,
    backbone_type='stmar',
    backbone_ckpt=CKPT_DIR,
    prompt_horizon=PROMPT_HORIZON,
    action_stride=1,
    domain='language_table',
    device="cuda"
)

# Seed the simulator with a default prompt frame at start-up.
image = Image.open("sim/assets/langtable_prompt/frame_06.png")
prompt_image, prompt_action = _make_prompt(image, genie, genie.prompt_horizon)
genie.set_initial_state((prompt_image, prompt_action))
genie.device = "cuda"

if __name__ == '__main__':
    with gr.Blocks() as demo:
        with gr.Row():
            gr.Textbox(label='HMA Demo: Select a prompt initial image from the gallery and Interact with arrow keys. \n'
                       'Note: the speed is limited due to free GPU in HF and the interface supports one user at a time.',
                       lines=1)

        with gr.Row():
            image_selector = gr.Dropdown(
                choices=available_images,
                # Avoid an IndexError when the asset directory has no PNGs.
                value=available_images[0] if available_images else None,
                label="Select an Image"
            )
            select_button = gr.Button("Load Image")

        with gr.Row():
            image_display = gr.Image(type="pil", label="Generated Image")

        with gr.Row():
            up = gr.Button("↑ Up")
        with gr.Row():
            left = gr.Button("← Left")
            down = gr.Button("↓ Down")
            right = gr.Button("→ Right")

        # Define interactions
        # Fix: event inputs must be Gradio components, so the simulator object
        # is no longer listed; the handler falls back to the global `genie`.
        select_button.click(
            fn=handle_image_selection,
            inputs=[image_selector],
            outputs=image_display,
            show_progress='hidden'
        )

        up.click(fn=lambda: handle_input("up"), outputs=image_display, show_progress='hidden')
        down.click(fn=lambda: handle_input("down"), outputs=image_display, show_progress='hidden')
        left.click(fn=lambda: handle_input("left"), outputs=image_display, show_progress='hidden')
        right.click(fn=lambda: handle_input("right"), outputs=image_display, show_progress='hidden')

    demo.launch(share=True)