Andrei Cozma committed
Commit 8ceccef · 1 Parent(s): b8a5776
MonteCarloAgent.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 import gymnasium as gym
 from tqdm import tqdm
@@ -8,6 +9,7 @@ import wandb

 class MonteCarloAgent:
     def __init__(self, env_name="CliffWalking-v0", gamma=0.99, epsilon=0.1, **kwargs):
+        print("=" * 80)
         print(f"# MonteCarloAgent - {env_name}")
         print(f"- epsilon: {epsilon}")
         print(f"- gamma: {gamma}")
@@ -45,25 +47,68 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])

-    def run_episode(self, max_steps=500, **kwargs):
+    # def run_episode(self, max_steps=500, render=False, **kwargs):
+    #     state, _ = self.env.reset()
+    #     episode_hist, solved, rgb_array = [], False, None
+
+    #     # Generate an episode following the current policy
+    #     for _ in range(max_steps):
+    #         rgb_array = self.env.render() if render else None
+    #         # Sample an action from the policy
+    #         action = self.choose_action(state)
+    #         # Take the action and observe the reward and next state
+    #         next_state, reward, done, truncated, _ = self.env.step(action)
+    #         # Keeping track of the trajectory
+    #         episode_hist.append((state, action, reward))
+    #         state = next_state
+
+    #         # This is where the agent got to the goal.
+    #         # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
+    #         if done:
+    #             solved = True
+    #             break
+    #         if truncated:
+    #             break
+
+    #     rgb_array = self.env.render() if render else None
+
+    #     return episode_hist, solved, rgb_array
+
+    def generate_episode(self, max_steps=500, render=False, **kwargs):
         state, _ = self.env.reset()
-        episode_hist = []
-        finished = False
+        episode_hist, solved, rgb_array = [], False, None
+
         # Generate an episode following the current policy
         for _ in range(max_steps):
+            rgb_array = self.env.render() if render else None
             # Sample an action from the policy
             action = self.choose_action(state)
             # Take the action and observe the reward and next state
-            next_state, reward, finished, truncated, _ = self.env.step(action)
+            next_state, reward, done, truncated, _ = self.env.step(action)
             # Keeping track of the trajectory
             episode_hist.append((state, action, reward))
             state = next_state
+
+            yield episode_hist, solved, rgb_array
+
             # This is where the agent got to the goal.
             # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
-            if finished or truncated:
+            if done or truncated:
+                solved = True
                 break

-        return episode_hist, finished
+        rgb_array = self.env.render() if render else None
+
+        yield episode_hist, solved, rgb_array
+
+    def run_episode(self, max_steps=500, render=False, **kwargs):
+        # Run the generator until the end
+        episode_hist, solved, rgb_array = None, False, None
+        for episode_hist, solved, rgb_array in self.generate_episode(
+            max_steps, render, **kwargs
+        ):
+            pass
+        return episode_hist, solved, rgb_array

     def update_first_visit(self, episode_hist):
         G = 0
@@ -127,11 +172,11 @@ class MonteCarloAgent:
         self.wandb_log_img(episode=None)

         for e in tqrange:
-            episode_hist, finished = self.run_episode(**kwargs)
+            episode_hist, solved, _ = self.run_episode(**kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
             train_running_success_rate = (
-                0.99 * train_running_success_rate + 0.01 * finished
+                0.99 * train_running_success_rate + 0.01 * solved
             )
             update_func(episode_hist)

@@ -159,11 +204,11 @@ class MonteCarloAgent:
         print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, finished = self.run_episode(**kwargs)
-            num_successes += finished
+            _, solved, _ = self.run_episode(**kwargs)
+            num_successes += solved
             if verbose:
-                word = "reached" if finished else "did not reach"
-                emoji = "🏁" if finished else "🚫"
+                word = "reached" if solved else "did not reach"
+                emoji = "🏁" if solved else "🚫"
                 print(
                     f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
                 )
@@ -175,15 +220,18 @@ class MonteCarloAgent:
         )
         return success_rate

-    def save_policy(self, fname="policy.npy"):
-        print(f"Saving policy to {fname}")
+    def save_policy(self, fname="policy.npy", save_dir=None):
+        if save_dir is not None:
+            os.makedirs(save_dir, exist_ok=True)
+            fname = os.path.join(save_dir, fname)
+        print(f"Saving policy to: {fname}")
         np.save(fname, self.Pi)

     def load_policy(self, fname="policy.npy"):
-        print(f"Loading policy from {fname}")
+        print(f"Loading policy from: {fname}")
         self.Pi = np.load(fname)

-    def wandb_log_img(self, episode=None, mask=None):
+    def wandb_log_img(self, episode=None):
         caption_suffix = "Initial" if episode is None else f"After Episode {episode}"
         wandb.log(
             {
@@ -248,6 +296,13 @@ def main():
         help="The type of update to use. (default: first_visit)",
     )

+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        default="policies",
+        help="The directory to save the policy to. (default: policies)",
+    )
+
     parser.add_argument(
         "--no_save",
         action="store_true",
@@ -264,7 +319,7 @@ def main():
     parser.add_argument(
         "--epsilon",
         type=float,
-        default=0.7,
+        default=0.5,
         help="The value for the epsilon-greedy policy to use. (default: 0.1)",
     )

@@ -308,14 +363,14 @@ def main():

     args = parser.parse_args()

-    mca = MonteCarloAgent(
+    agent = MonteCarloAgent(
         args.env,
         gamma=args.gamma,
         epsilon=args.epsilon,
         render_mode=args.render_mode,
     )

-    run_name = f"mc_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
+    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
     if args.wandb_run_name_suffix is not None:
         run_name += f"+{args.wandb_run_name_suffix}"

@@ -331,7 +386,7 @@ def main():
         config=dict(args._get_kwargs()),
     )

-    mca.train(
+    agent.train(
         n_train_episodes=args.n_train_episodes,
         test_every=args.test_every,
         n_test_episodes=args.n_test_episodes,
@@ -340,12 +395,15 @@ def main():
         log_wandb=args.wandb_project is not None,
     )
     if not args.no_save:
-        mca.save_policy(fname=f"policy_{run_name}.npy")
+        agent.save_policy(
+            fname=f"{run_name}.npy",
+            save_dir=args.save_dir,
+        )
     elif args.test is not None:
         if not args.test.endswith(".npy"):
             args.test += ".npy"
-        mca.load_policy(args.test)
-        mca.test(
+        agent.load_policy(args.test)
+        agent.test(
             n_test_episodes=args.n_test_episodes,
             max_steps=args.max_steps,
         )
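
Note: the main change above splits the old run_episode() into a generate_episode() generator, which yields the trajectory so far, a solved flag, and an optional rendered frame after every step, while run_episode() now simply drains that generator. As a hedged illustration only (not part of this commit), the two entry points would be consumed roughly like this, using only the constructor defaults and return values shown in the diff:

from MonteCarloAgent import MonteCarloAgent

agent = MonteCarloAgent("CliffWalking-v0", render_mode="rgb_array")

# Batch use, e.g. inside train()/test(): exhaust the episode in one call.
episode_hist, solved, rgb_array = agent.run_episode(max_steps=500)

# Streaming use, e.g. in the new demo.py: step through the same episode frame by frame.
for episode_hist, solved, rgb_array in agent.generate_episode(max_steps=500, render=True):
    state, action, reward = episode_hist[-1]  # most recent (state, action, reward) triple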
demo.py ADDED
@@ -0,0 +1,162 @@
+import os
+import time
+import numpy as np
+import gradio as gr
+from MonteCarloAgent import MonteCarloAgent
+
+
+# For the dropdown list of policies
+policies_folder = "policies"
+all_policies = [file for file in os.listdir(policies_folder) if file.endswith(".npy")]
+
+# All supported agents
+agent_map = {
+    "MonteCarloAgent": MonteCarloAgent,
+    # TODO: Add DP Agent
+}
+
+# Global variables for the agent and the render fps (to allow changing it on the fly)
+agent = None
+render_fps_val = 5
+
+
+def load_policy(policy_fname):
+    print("Loading...")
+    print(f"- policy_fname: {policy_fname}")
+    global agent
+    policy_path = os.path.join(policies_folder, policy_fname)
+    props = policy_fname.split("_")
+    agent_type, env_name = props[0], props[1]
+
+    agent = agent_map[agent_type](env_name, render_mode="rgb_array")
+    agent.load_policy(policy_path)
+
+    return agent.env.spec.id, agent.__class__.__name__
+
+
+def change_render_fps(x):
+    print("Changing render fps:", x)
+    global render_fps_val
+    render_fps_val = x
+
+
+def run(n_test_episodes, max_steps, render_fps):
+    global agent, render_fps_val
+    render_fps_val = render_fps
+    print("Running...")
+    print(f"- n_test_episodes: {n_test_episodes}")
+    print(f"- max_steps: {max_steps}")
+    print(f"- render_fps: {render_fps_val}")
+
+    while agent is None:
+        print("Waiting for agent to be loaded...")
+        time.sleep(1)
+        yield None, None, None, None, None, None, None, "🚫 ERROR: Please load a policy first!"
+
+    rgb_array = np.random.random((25, 100, 3))
+    episode, step = 0, 0
+    state, action, reward = 0, 0, 0
+    episodes_solved = 0
+
+    def ep_str(episode):
+        return f"{episode + 1} / {n_test_episodes} ({(episode + 1) / n_test_episodes * 100:.2f}%)"
+
+    def step_str(step):
+        return f"{step + 1}"
+
+    for episode in range(n_test_episodes):
+        for step, (episode_hist, solved, rgb_array) in enumerate(
+            agent.generate_episode(max_steps=max_steps, render=True)
+        ):
+            if solved:
+                episodes_solved += 1
+            state, action, reward = episode_hist[-1]
+
+            print(
+                f"Episode: {ep_str(episode)} - step: {step_str} - state: {state} - action: {action} - reward: {reward} (frame time: {1 / render_fps:.2f}s)"
+            )
+
+            time.sleep(1 / render_fps_val)
+            yield rgb_array, ep_str(episode), step_str(step), state, action, reward, ep_str(episodes_solved), "Running..."
+
+    yield rgb_array, ep_str(episode), step_str(step), state, action, reward, ep_str(episodes_solved), "Done!"
+
+
+with gr.Blocks() as demo:
+    # TODO: Add title and description
+
+    with gr.Row():
+        with gr.Column():
+            input_policy = gr.components.Dropdown(
+                label="Policy", choices=all_policies, value=all_policies[0]
+            )
+
+            with gr.Row():
+                out_environment = gr.components.Textbox(label="Environment")
+                out_agent = gr.components.Textbox(label="Agent")
+
+            btn_load = gr.components.Button("📁 Load")
+            btn_load.click(
+                fn=load_policy,
+                inputs=[input_policy],
+                outputs=[out_environment, out_agent],
+            )
+
+        with gr.Column():
+            input_n_test_episodes = gr.components.Slider(
+                minimum=1,
+                maximum=100,
+                value=5,
+                label="Number of episodes",
+            )
+            input_max_steps = gr.components.Slider(
+                minimum=1,
+                maximum=500,
+                value=500,
+                label="Max steps per episode",
+            )
+
+            input_render_fps = gr.components.Slider(
+                minimum=1,
+                maximum=60,
+                value=5,
+                label="Render FPS",
+            )
+            input_render_fps.change(change_render_fps, inputs=[input_render_fps])
+
+            btn_run = gr.components.Button("▶️ Run")
+
+    out_msg = gr.components.Textbox(label="Message")
+
+    with gr.Row():
+        out_episode = gr.components.Textbox(label="Current Episode")
+        out_step = gr.components.Textbox(label="Current Step")
+        out_state = gr.components.Textbox(label="Current State")
+        out_action = gr.components.Textbox(label="Chosen Action")
+        out_reward = gr.components.Textbox(label="Reward Received")
+        out_eps_solved = gr.components.Textbox(label="Episodes Solved")
+
+    out_image = gr.components.Image(label="Environment", type="numpy", image_mode="RGB")
+
+    btn_run.click(
+        fn=run,
+        inputs=[
+            input_n_test_episodes,
+            input_max_steps,
+            input_render_fps,
+        ],
+        outputs=[
+            out_image,
+            out_episode,
+            out_step,
+            out_state,
+            out_action,
+            out_reward,
+            out_eps_solved,
+            out_msg,
+        ],
+    )
+
+
+demo.queue(concurrency_count=2)
+demo.launch()
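
Note: demo.py derives the agent class and environment from the selected policy filename, so it relies on the run_name format that MonteCarloAgent.py now writes ("{AgentClass}_{env}_e{episodes}_s{steps}_g{gamma}_e{epsilon}.npy"); only the first two underscore-separated fields are used. A small illustration (not part of the commit), using the policy file added below:

policy_fname = "MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.5.npy"
props = policy_fname.split("_")
agent_type, env_name = props[0], props[1]
print(agent_type, env_name)  # MonteCarloAgent CliffWalking-v0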
policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.5.npy ADDED
Binary file (1.66 kB)
 
run_tests.py CHANGED
@@ -2,7 +2,7 @@ import os
 import multiprocessing
 import random

-num_tests = 10
+num_tests = 5

 update_types = ["first_visit", "every_visit"]
 vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]
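
Note: only the header of run_tests.py appears in this diff; the sweep logic itself is not shown. A hypothetical sketch follows, assuming the script shells out to MonteCarloAgent.py over the listed hyperparameters via the --update_type and --epsilon flags defined in its argparse setup (the actual implementation may differ):

import os
import itertools
from multiprocessing import Pool

num_tests = 5
update_types = ["first_visit", "every_visit"]
vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]

def launch(cfg):
    # Run one training job for a given (update_type, epsilon) pair.
    update_type, eps = cfg
    os.system(f"python3 MonteCarloAgent.py --update_type {update_type} --epsilon {eps}")

if __name__ == "__main__":
    # Repeat each hyperparameter combination num_tests times.
    configs = [c for c in itertools.product(update_types, vals_eps) for _ in range(num_tests)]
    with Pool(processes=4) as pool:
        pool.map(launch, configs)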