Andrei Cozma committed
Commit 1663f39 · 1 Parent(s): 3c4eba9

Updates

Files changed:

- MonteCarloAgent.py (+90 -25)
- README.md (+23 -61)
- demo.py (+59 -5)
- policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy (+0 -0)
- policies/MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy (+0 -0)
- run_tests_MC_CliffWalking-v0.py (+34 -0)
- run_tests.py → run_tests_MC_FrozenLake-v1.py (+13 -7)
MonteCarloAgent.py
CHANGED

```diff
@@ -3,18 +3,36 @@ import numpy as np
 import gymnasium as gym
 from tqdm import tqdm
 import argparse
+from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 import wandb
 
 
 class MonteCarloAgent:
-    def __init__(
+    def __init__(
+        self,
+        env_name="CliffWalking-v0",
+        gamma=0.99,
+        epsilon=0.1,
+        run_name=None,
+        **kwargs,
+    ):
         print("=" * 80)
         print(f"# MonteCarloAgent - {env_name}")
         print(f"- epsilon: {epsilon}")
         print(f"- gamma: {gamma}")
+        print(f"- run_name: {run_name}")
+        self.run_name = run_name
+        self.env_name = env_name
         self.epsilon, self.gamma = epsilon, gamma
+
+        self.env_kwargs = kwargs
+        if self.env_name == "FrozenLake-v1":
+            self.env_kwargs["desc"] = None
+            self.env_kwargs["map_name"] = "4x4"
+            self.env_kwargs["is_slippery"] = "False"
+
+        self.env = gym.make(self.env_name, **self.env_kwargs)
+
         self.n_states, self.n_actions = (
             self.env.observation_space.n,
             self.env.action_space.n,
@@ -43,17 +61,21 @@ class MonteCarloAgent:
         print(self.Pi)
         print("=" * 80)
 
-    def choose_action(self, state,
+    def choose_action(self, state, epsilon_override=None, greedy=False, **kwargs):
         # Sample an action from the policy.
         # The override_epsilon argument allows forcing the use of a possibly new self.epsilon value than the one used during training.
         # The ability to override was mostly added for testing purposes and for the demo.
+        greedy_action = np.argmax(self.Pi[state])
 
-        if
+        if greedy:
+            return greedy_action
+
+        if epsilon_override is None:
             return np.random.choice(self.n_actions, p=self.Pi[state])
 
         return np.random.choice(
-            [
-            p=[1 -
+            [greedy_action, np.random.randint(self.n_actions)],
+            p=[1 - epsilon_override, epsilon_override],
         )
 
     def generate_episode(self, max_steps=500, render=False, **kwargs):
@@ -61,24 +83,36 @@ class MonteCarloAgent:
         episode_hist, solved, rgb_array = [], False, None
 
         # Generate an episode following the current policy
+        while len(episode_hist) < max_steps:
             rgb_array = self.env.render() if render else None
+
             # Sample an action from the policy
             action = self.choose_action(state, **kwargs)
             # Take the action and observe the reward and next state
             next_state, reward, done, truncated, _ = self.env.step(action)
+
             # Keeping track of the trajectory
             episode_hist.append((state, action, reward))
-            state = next_state
             yield episode_hist, solved, rgb_array
 
+            # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
+            if done and (
+                self.env_name == "CliffWalking-v0" or self.env_name == "Taxi-v3"
+            ):
                 solved = True
                 break
 
+            # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
+            # We consider the episode solved when the agent reaches the goal (done == True and reward == 1)
+            if done and self.env_name == "FrozenLake-v1" and reward == 1:
+                solved = True
+                break
+
+            if done or truncated:
+                break
+
+            state = next_state
+
         rgb_array = self.env.render() if render else None
 
         yield episode_hist, solved, rgb_array
@@ -135,14 +169,24 @@ class MonteCarloAgent:
         test_every=100,
         update_type="first_visit",
         log_wandb=False,
+        save_best=True,
+        save_best_dir=None,
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
 
+        (
+            train_running_success_rate,
+            test_success_rate,
+            test_running_success_rate,
+            avg_ep_len,
+        ) = (0.0, 0.0, 0.0, 0.0)
+
         stats = {
             "train_running_success_rate": train_running_success_rate,
+            "test_running_success_rate": test_running_success_rate,
             "test_success_rate": test_success_rate,
+            "avg_ep_len": avg_ep_len,
         }
 
         update_func = getattr(self, f"update_{update_type}")
@@ -157,36 +201,52 @@ class MonteCarloAgent:
             episode_hist, solved, _ = self.run_episode(**kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
+
             train_running_success_rate = (
                 0.99 * train_running_success_rate + 0.01 * solved
             )
+            avg_ep_len = 0.99 * avg_ep_len + 0.01 * len(episode_hist)
+
             update_func(episode_hist)
 
             stats = {
                 "train_running_success_rate": train_running_success_rate,
+                "test_running_success_rate": test_running_success_rate,
                 "test_success_rate": test_success_rate,
+                "avg_ep_len": avg_ep_len,
                 "total_reward": total_reward,
                 "avg_reward": avg_reward,
             }
             tqrange.set_postfix(stats)
 
+            # Test the agent every test_every episodes with the greedy policy (by default)
             if e % test_every == 0:
                 test_success_rate = self.test(verbose=False, **kwargs)
+                if save_best and test_success_rate > 0.9:
+                    if self.run_name is None:
+                        print(f"Warning: run_name is None, not saving best policy")
+                    else:
+                        self.save_policy(self.run_name, save_best_dir)
+
                 if log_wandb:
                     self.wandb_log_img(episode=e)
 
+                test_running_success_rate = (
+                    0.99 * test_running_success_rate + 0.01 * test_success_rate
+                )
+                stats["test_running_success_rate"] = test_running_success_rate
                 stats["test_success_rate"] = test_success_rate
                 tqrange.set_postfix(stats)
 
             if log_wandb:
                 wandb.log(stats)
 
-    def test(self, n_test_episodes=100, verbose=True, **kwargs):
+    def test(self, n_test_episodes=100, verbose=True, greedy=True, **kwargs):
         if verbose:
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(**kwargs)
+            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"
@@ -247,8 +307,8 @@ def main():
     parser.add_argument(
         "--n_train_episodes",
         type=int,
-        default=
-        help="The number of episodes to train for. (default:
+        default=2500,
+        help="The number of episodes to train for. (default: 2500)",
     )
     parser.add_argument(
         "--n_test_episodes",
@@ -266,8 +326,8 @@ def main():
     parser.add_argument(
         "--max_steps",
         type=int,
-        default=
-        help="The maximum number of steps per episode before the episode is forced to end. (default:
+        default=200,
+        help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
    )
 
     parser.add_argument(
@@ -295,14 +355,14 @@ def main():
     parser.add_argument(
         "--gamma",
         type=float,
-        default=0
-        help="The value for the discount factor to use. (default: 0
+        default=1.0,
+        help="The value for the discount factor to use. (default: 1.0)",
     )
     parser.add_argument(
         "--epsilon",
         type=float,
-        default=0.
-        help="The value for the epsilon-greedy policy to use. (default: 0.
+        default=0.4,
+        help="The value for the epsilon-greedy policy to use. (default: 0.4)",
     )
 
     ### Environment parameters
@@ -310,6 +370,7 @@ def main():
         "--env",
         type=str,
         default="CliffWalking-v0",
+        choices=["CliffWalking-v0", "FrozenLake-v1", "Taxi-v3"],
         help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
     parser.add_argument(
@@ -352,10 +413,12 @@ def main():
         render_mode=args.render_mode,
     )
 
-    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
+    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}_{args.update_type}"
     if args.wandb_run_name_suffix is not None:
         run_name += f"+{args.wandb_run_name_suffix}"
 
+    agent.run_name = run_name
+
     try:
         if args.train:
             # Log to WandB
@@ -375,6 +438,8 @@ def main():
                 max_steps=args.max_steps,
                 update_type=args.update_type,
                 log_wandb=args.wandb_project is not None,
+                save_best=True,
+                save_best_dir=args.save_dir,
             )
             if not args.no_save:
                 agent.save_policy(
```
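For reference, here is a minimal standalone sketch of the action-selection behaviour that the new `choose_action` arguments introduce (`greedy` for evaluation, `epsilon_override` for the live demo). The names below are illustrative rather than the repository's exact code; the only assumption is a stochastic policy table `Pi` of shape `(n_states, n_actions)`.

```python
import numpy as np

def choose_action_sketch(Pi, state, epsilon_override=None, greedy=False):
    # Illustrative only: mirrors the three paths of the updated choose_action.
    n_actions = Pi.shape[1]
    greedy_action = int(np.argmax(Pi[state]))
    if greedy:
        # Evaluation path: always take the policy's highest-probability action.
        return greedy_action
    if epsilon_override is None:
        # Training path: sample directly from the stored stochastic policy.
        return int(np.random.choice(n_actions, p=Pi[state]))
    # Demo path: with probability epsilon_override take a uniformly random action
    # (which may coincide with the greedy one), otherwise act greedily.
    if np.random.random() < epsilon_override:
        return int(np.random.randint(n_actions))
    return greedy_action
```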
README.md
CHANGED

````diff
@@ -8,28 +8,43 @@ Evolution of Reinforcement Learning methods from pure Dynamic Programming-based
 
 - Python 3
 - Gymnasium: <https://pypi.org/project/gymnasium/>
-- WandB: <https://pypi.org/project/wandb/>
-- Gradio: <https://pypi.org/project/gradio/>
+- WandB: <https://pypi.org/project/wandb/> (for logging)
+- Gradio: <https://pypi.org/project/gradio/> (for demo web app)
 
 ## Interactive Demo
 
 TODO
 
-##
+## 2. Agents
+
+### Dynamic-Programming Agent
 
 TODO
 
+**DP Usage:**
 
 ```bash
 TODO
 ```
 
+### Monte-Carlo Agent
+
+This is the implementation of an On-Policy Monte-Carlo agent to solve several toy problems from the OpenAI Gymnasium.
+
+The agent starts with a randomly initialized epsilon-greedy policy and uses either the first-visit or every-visit Monte-Carlo update method to learn the optimal policy. Training is performed using a soft (epsilon-greedy) policy and testing uses the resulting greedy policy.
+
+Off-policy methods using importance sampling are not implemented for this project.
 
+Parameter testing results:
 
+- `run_tests_MC_CliffWalking-v0.sh` (n_train_episodes=2500 and max_steps=200)
+  - Best Update Type: first_visit
+  - Best Gamma: 1.0
+  - Best Epsilon: 0.4
+- `run_tests_MC_FrozenLake-v1.sh` (n_train_episodes=10000 and max_steps=200)
+  - Best Update Type: first_visit
+  - Best Gamma: 1.0
+  - Best Epsilon: 0.5 (testing) and 0.2 (training)
 
 ```bash
 # Training: Policy will be saved as a `.npy` file.
@@ -39,7 +54,7 @@ python3 MonteCarloAgent.py --train
 python3 MonteCarloAgent.py --test policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy --render_mode human
 ```
 
+**MC Usage**
 
 ```bash
 usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--update_type {first_visit,every_visit}]
@@ -76,56 +91,3 @@ options:
   --wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX
                         WandB run name suffix for logging. (default: None)
 ```
-
-## Presentation Guide
-
-1. Title Slide: list the title of your talk along with your name
-
-2. Test Questions Slide: provide three questions relevant to your subject
-
-   - short answers should suffice
-   - somewhere during your talk provide the answers, but do not emphasize them
-
-3. Presenter’s Slides: let others get to know you
-
-   - provide a little information about yourself, your degree program and your advisor
-   - describe your interests and goals; show a map and picture(s) of your hometown
-   - as examples, students frequently like to mention their pets, their travels, their interests in music and food, even their favorite movies, you name it
-
-4. Outline Slide: provide a bulleted outline of the rest of your talk
-
-5. Overview Slide: list important definitions and provide a brief mention of applications
-
-6. History Slide: discuss major contributors, interesting stories and main developments
-
-7. Algorithms Slides: describe basic procedures and methodological comparisons
-
-   - this should be the main part of your talk
-   - discuss techniques from the most basic to the state-of-the-art
-   - use examples and figures whenever possible
-
-8. Applications Slides: educate the class about amenable problems of interest to you
-
-   - don’t get bogged down in too much minutiae
-   - once again use examples and figures whenever possible
-
-9. Implementations Slides: discuss the results of your coding work (if any)
-
-   - compare and contrast the algorithms you implemented
-   - make effective use of table and charts
-
-10. Open Issues Slide: enumerate and discuss a few open questions
-
-11. References Slide: provide a handful of key citations
-
-12. Discussion Slide: solicit questions from the class
-
-    - this slide may have only a few bullets – it may even be left blank
-    - this is a good opportunity for other students to add to the discussion
-    - be ready to prompt some questions if there is silence
-    - remember not to repeat answers to your test questions
-
-13. Test Questions Slide Revisited: show again your original test questions slide
-
-    - students may now complete their answer sheets and hand them to you
-    - Ashley will supervise as we applaud your excellent presentation!
````
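The README text above describes the first-visit and every-visit Monte-Carlo updates only in prose. As a reference point, here is a minimal first-visit Monte-Carlo control update sketch; the `Q`, return-sum/count tables, and policy table `Pi` are hypothetical names, not the repository's `update_first_visit` implementation.

```python
import numpy as np

def first_visit_update(episode_hist, Q, R_sum, R_cnt, Pi, gamma, epsilon):
    """Sketch of one first-visit Monte-Carlo control update.

    episode_hist is a list of (state, action, reward) tuples, as produced by an
    episode generator. Q, R_sum, R_cnt and Pi are (n_states, n_actions) arrays.
    """
    n_actions = Q.shape[1]
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return G.
    for t in reversed(range(len(episode_hist))):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # First-visit rule: update only if (state, action) does not occur earlier.
        if any((s, a) == (state, action) for s, a, _ in episode_hist[:t]):
            continue
        R_sum[state, action] += G
        R_cnt[state, action] += 1
        Q[state, action] = R_sum[state, action] / R_cnt[state, action]
        # Re-derive the epsilon-greedy policy for this state from Q.
        best = int(np.argmax(Q[state]))
        Pi[state] = epsilon / n_actions
        Pi[state, best] += 1.0 - epsilon
```

The every-visit variant simply drops the first-visit check and averages returns over every occurrence of a state-action pair.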
demo.py
CHANGED

```diff
@@ -31,6 +31,12 @@ action_map = {
         2: "down",
         3: "left",
     },
+    "FrozenLake-v1": {
+        0: "left",
+        1: "down",
+        2: "right",
+        3: "up",
+    },
 }
 
 
@@ -45,6 +51,12 @@ live_render_fps = 5
 live_epsilon = 0.0
 live_paused = True
 live_steps_forward = None
+should_reset = False
+
+
+# def reset():
+#     global should_reset
+#     should_reset = True
 
 
 def change_render_fps(x):
@@ -77,7 +89,7 @@ def onclick_btn_forward():
 
 
 def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
-    global live_render_fps, live_epsilon, live_paused, live_steps_forward
+    global live_render_fps, live_epsilon, live_paused, live_steps_forward, should_reset
     live_render_fps = render_fps
     live_epsilon = epsilon
     live_steps_forward = None
@@ -124,7 +136,7 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
     for episode in range(n_test_episodes):
         for step, (episode_hist, solved, rgb_array) in enumerate(
             agent.generate_episode(
-                max_steps=max_steps, render=True,
+                max_steps=max_steps, render=True, epsilon_override=live_epsilon
             )
         ):
             _, _, last_reward = (
@@ -133,6 +145,30 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
             state, action, reward = episode_hist[-1]
             curr_policy = agent.Pi[state]
 
+            rgb_array_height, rgb_array_width = 128, 512
+            rgb_array = cv2.resize(
+                rgb_array,
+                (
+                    int(rgb_array.shape[1] / rgb_array.shape[0] * rgb_array_height),
+                    rgb_array_height,
+                ),
+                interpolation=cv2.INTER_AREA,
+            )
+            rgb_array_new = np.pad(
+                rgb_array,
+                (
+                    (0, 0),
+                    (
+                        (rgb_array_width - rgb_array.shape[1]) // 2,
+                        (rgb_array_width - rgb_array.shape[1]) // 2,
+                    ),
+                    (0, 0),
+                ),
+                "constant",
+            )
+
+            rgb_array = np.uint8(rgb_array_new)
+
             viz_w = 512
             viz_h = viz_w // len(curr_policy)
             policy_viz = np.zeros((viz_h, viz_w))
@@ -189,8 +225,6 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
                 f"Episode: {ep_str(episode + 1)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (epsilon: {live_epsilon:.2f}) (frame time: {1 / render_fps:.2f}s)"
             )
 
-            # Live-update the agent's epsilon value for demonstration purposes
-            agent.epsilon = live_epsilon
             yield agent_type, env_name, rgb_array, policy_viz, ep_str(
                 episode + 1
             ), ep_str(episodes_solved), step_str(
@@ -214,6 +248,24 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
                     step
                 ), state, action, last_reward, "Paused..."
                 time.sleep(1 / live_render_fps)
+                # if should_reset is True:
+                #     break
+
+            # if should_reset is True:
+            #     should_reset = False
+            #     return (
+            #         agent_type,
+            #         env_name,
+            #         rgb_array,
+            #         policy_viz,
+            #         ep_str(episode + 1),
+            #         ep_str(episodes_solved),
+            #         step_str(step),
+            #         state,
+            #         action,
+            #         last_reward,
+            #         "Resetting...",
+            #     )
 
             if solved:
                 episodes_solved += 1
@@ -318,6 +370,8 @@ with gr.Blocks(title="CS581 Demo") as demo:
         label="Status Message",
     )
 
+    # input_policy.change(fn=reset)
+
     btn_run.click(
         fn=run,
         inputs=[
@@ -342,5 +396,5 @@ with gr.Blocks(title="CS581 Demo") as demo:
         ],
     )
 
-demo.queue(concurrency_count=
+demo.queue(concurrency_count=3)
 demo.launch()
```
policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy
DELETED
Binary file (1.66 kB)

policies/MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy
ADDED
Binary file (1.66 kB)
run_tests_MC_CliffWalking-v0.py
ADDED

```diff
@@ -0,0 +1,34 @@
+import os
+import multiprocessing
+import random
+
+wandb_project = "cs581"
+
+env = "CliffWalking-v0"
+n_train_episodes = 2500
+max_steps = 200
+
+num_tests = 10
+
+vals_update_type = [
+    "first_visit"
+]  # Every visit takes too long due to this environment's reward structure
+vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+vals_gamma = [1.0, 0.98, 0.96, 0.94]
+
+
+def run_test(args):
+    os.system(
+        f"python3 MonteCarloAgent.py --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
+    )
+
+
+with multiprocessing.Pool(16) as p:
+    tests = []
+    for update_type in vals_update_type:
+        for gamma in vals_gamma:
+            for eps in vals_epsilon:
+                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+    random.shuffle(tests)
+
+    p.map(run_test, tests)
```
run_tests.py → run_tests_MC_FrozenLake-v1.py
RENAMED

```diff
@@ -2,24 +2,30 @@ import os
 import multiprocessing
 import random
 
+wandb_project = "cs581"
+
+env = "FrozenLake-v1"
+n_train_episodes = 5000
+max_steps = 200
+
 num_tests = 10
 
-vals_gamma = [1.0, 0.
+vals_update_type = ["first_visit", "every_visit"]
+vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+vals_gamma = [1.0, 0.98, 0.96, 0.94]
 
 
 def run_test(args):
     os.system(
-        f"python3 MonteCarloAgent.py --train
+        f"python3 MonteCarloAgent.py --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
     )
 
 
-with multiprocessing.Pool(
+with multiprocessing.Pool(16) as p:
     tests = []
-    for update_type in
+    for update_type in vals_update_type:
         for gamma in vals_gamma:
-            for eps in
+            for eps in vals_epsilon:
                 tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
     random.shuffle(tests)
 
```
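Both test scripts build the same kind of hyperparameter grid with nested loops before shuffling it and dispatching the runs to a process pool. An equivalent construction with itertools.product, shown here purely for illustration and not part of this commit, looks like:

```python
import itertools
import random

num_tests = 10
vals_update_type = ["first_visit", "every_visit"]
vals_gamma = [1.0, 0.98, 0.96, 0.94]
vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]

# One (gamma, epsilon, update_type, repeat_index) tuple per training run.
tests = [
    (gamma, eps, update_type, i)
    for update_type, gamma, eps, i in itertools.product(
        vals_update_type, vals_gamma, vals_epsilon, range(num_tests)
    )
]
random.shuffle(tests)  # spread repeated configurations across the pool's workers
```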