import os
import time

import numpy as np
import gradio as gr
from MonteCarloAgent import MonteCarloAgent
import scipy.ndimage
import cv2

# For the dropdown list of policies
policies_folder = "policies"
try:
    all_policies = [
        file for file in os.listdir(policies_folder) if file.endswith(".npy")
    ]
except FileNotFoundError:
    print("ERROR: No policies folder found!")
    all_policies = []

# All supported agents
agent_map = {
    "MonteCarloAgent": MonteCarloAgent,
    # TODO: Add DP Agent
}

# Global variables to allow changing them on the fly
live_render_fps = 10
live_epsilon = 0.0
live_paused = False


def change_render_fps(x):
    print("Changing render fps:", x)
    global live_render_fps
    live_render_fps = x


def change_epsilon(x):
    print("Changing epsilon:", x)
    global live_epsilon
    live_epsilon = x


def change_paused(x):
    print("Changing paused:", x)
    global live_paused
    live_paused = x
    # Update the button text to reflect the new paused/running state
    return gr.update(value="▶️ Resume" if x else "⏸️ Pause")


def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
    global live_render_fps, live_epsilon
    live_render_fps = render_fps
    live_epsilon = epsilon
    print("Running...")
    print(f"- n_test_episodes: {n_test_episodes}")
    print(f"- max_steps: {max_steps}")
    print(f"- render_fps: {live_render_fps}")

    # Policy filenames follow "<AgentType>_<EnvName>_...", so the agent type
    # and environment name can be recovered from the name itself
    policy_path = os.path.join(policies_folder, policy_fname)
    props = policy_fname.split("_")
    agent_type, env_name = props[0], props[1]

    agent = agent_map[agent_type](env_name, render_mode="rgb_array")
    agent.load_policy(policy_path)

    rgb_array = None
    policy_viz = None
    episode, step = 0, 0
    state, action, reward = 0, 0, 0
    episodes_solved = 0

    def ep_str(episode):
        return f"{episode + 1} / {n_test_episodes} ({(episode + 1) / n_test_episodes * 100:.2f}%)"

    def step_str(step):
        return f"{step + 1}"

    for episode in range(n_test_episodes):
        for step, (episode_hist, solved, rgb_array) in enumerate(
            agent.generate_episode(
                max_steps=max_steps, render=True, override_epsilon=True
            )
        ):
            # Block while the user has paused the demo
            while live_paused:
                time.sleep(0.1)

            if solved:
                episodes_solved += 1
            state, action, reward = episode_hist[-1]

            curr_policy = agent.Pi[state]

            # Render the current state's policy as a horizontal strip:
            # one equal-width cell per action, shaded by its probability
            viz_w = 512
            viz_h = viz_w // len(curr_policy)
            policy_viz = np.zeros((viz_h, viz_w))
            for i, p in enumerate(curr_policy):
                policy_viz[
                    :,
                    i * (viz_w // len(curr_policy)) : (i + 1)
                    * (viz_w // len(curr_policy)),
                ] = p

            # Convert the grayscale strip to RGB for drawing and display
            policy_viz = np.stack([policy_viz] * 3, axis=-1)

            # Label the cell of the action that was actually taken
            text_offset = 15
            cv2.putText(
                policy_viz,
                str(action),
                (
                    int((action + 0.5) * viz_w // len(curr_policy) - text_offset),
                    viz_h // 2 + text_offset,
                ),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.5,
                (255, 255, 255),
                1,
                cv2.LINE_AA,
            )

            # Smooth the cell edges, then blend in the uniform exploration
            # component implied by the live epsilon value
            policy_viz = scipy.ndimage.gaussian_filter(policy_viz, sigma=1)
            policy_viz = np.clip(
                policy_viz * (1 - live_epsilon) + live_epsilon / len(curr_policy),
                0,
                1,
            )

            print(
                f"Episode: {ep_str(episode)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (frame time: {1 / live_render_fps:.2f}s)"
            )

            # Pace the loop to the live-adjustable render FPS
            time.sleep(1 / live_render_fps)

            # Live-update the agent's epsilon value for demonstration purposes
            agent.epsilon = live_epsilon
            yield agent_type, env_name, rgb_array, policy_viz, ep_str(episode), ep_str(
                episodes_solved
            ), step_str(step), state, action, reward, "Running..."

    yield agent_type, env_name, rgb_array, policy_viz, ep_str(episode), ep_str(
        episodes_solved
    ), step_str(step), state, action, reward, "Done!"


with gr.Blocks(title="CS581 Demo") as demo:
    gr.components.HTML(
        "