Andrei Cozma committed
Commit 46b0409 · 1 Parent(s): ebd7110
Shared.py → AgentBase.py RENAMED
@@ -4,7 +4,7 @@ import gymnasium as gym
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 
 
-class Shared:
+class AgentBase:
     def __init__(
        self,
        /,
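
The rename makes the shared base class explicit: both concrete agents inherit from `AgentBase` and forward their keyword arguments to it. A minimal sketch of the resulting pattern, assuming only what the subclass call sites in this commit show (the real `AgentBase` constructor is not part of this diff):

```python
# Sketch only: AgentBase's actual constructor is not shown in this diff; this mirrors
# how the subclasses call it in DPAgent.py and MCAgent.py below.
class AgentBase:
    def __init__(self, /, run_name=None, **kwargs):
        self.run_name = run_name or self.__class__.__name__
        self.kwargs = kwargs  # e.g. env name, gamma, epsilon


class DPAgent(AgentBase):
    def __init__(self, /, **kwargs):
        super().__init__(run_name=self.__class__.__name__, **kwargs)
```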
DPAgent.py CHANGED
@@ -3,11 +3,11 @@ import numpy as np
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
 from tqdm import trange
-from Shared import Shared
+from AgentBase import AgentBase
 import warnings
 
 
-class DPAgent(Shared):
+class DPAgent(AgentBase):
     def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.theta = kwargs.get("theta", 1e-10)
@@ -36,7 +36,10 @@ class DPAgent(Shared):
                 for probability, next_state, reward, done in self.env.P[state][
                     action
                 ]:
-                    if self.env_name == "CliffWalking-v0" and state == self.env.observation_space.n-1:
+                    if (
+                        self.env_name == "CliffWalking-v0"
+                        and state == self.env.observation_space.n - 1
+                    ):
                         reward = 1
                     expected_value += probability * (
                         reward + self.gamma * self.V[next_state]
@@ -53,14 +56,17 @@ class DPAgent(Shared):
             # if i % 5 == 0 and i != 0:
             # self.test(verbose=False)
             print(f"Iteration {i}: delta={delta}")
-
+
         self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
         for s in range(self.env.observation_space.n):
             for a in range(self.env.action_space.n):
                 expected_value = 0
                 for probability, next_state, reward, done in self.env.P[s][a]:
-                    if self.env_name == "CliffWalking-v0" and state == self.env.observation_space.n-1:
-                        reward = 1
+                    if (
+                        self.env_name == "CliffWalking-v0"
+                        and state == self.env.observation_space.n - 1
+                    ):
+                        reward = 1
                     expected_value += probability * (
                         reward + self.gamma * self.V[next_state]
                     )
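
The reformatted condition above is part of a one-step expected-value backup over the environment's tabular model: when the environment is CliffWalking-v0 and the backed-up state is the last state, the reward is overridden to 1 before being discounted. A minimal, self-contained sketch of that backup (illustrative names, not the repository's exact DPAgent code):

```python
import numpy as np


def one_step_backup(P, V, state, action, gamma, env_name, n_states):
    """Expected value of taking `action` in `state` under value estimates V.

    Mirrors the loop in the diff: P[state][action] yields
    (probability, next_state, reward, done) tuples, and CliffWalking-v0's
    last state gets its reward overridden to 1. `done` is unused, as in the diff.
    """
    expected_value = 0.0
    for probability, next_state, reward, done in P[state][action]:
        if env_name == "CliffWalking-v0" and state == n_states - 1:
            reward = 1
        expected_value += probability * (reward + gamma * V[next_state])
    return expected_value


# Tiny illustrative model: two states, one action, deterministic transitions.
P = {0: {0: [(1.0, 1, 0.0, True)]}, 1: {0: [(1.0, 1, 0.0, True)]}}
V = np.zeros(2)
print(one_step_backup(P, V, 0, 0, gamma=0.99, env_name="Toy-v0", n_states=2))  # 0.0
```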
MCAgent.py CHANGED
@@ -1,11 +1,10 @@
 import numpy as np
 from tqdm import tqdm
-from Shared import Shared
 import wandb
-from Shared import Shared
+from AgentBase import AgentBase
 
 
-class MCAgent(Shared):
+class MCAgent(AgentBase):
     def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.reset()
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Reinforcement Learning - From Dynamic Programming to Monte-Carlo
+title: CS581 Final Project Demo - Dynamic Programming & Monte-Carlo RL Methods
 emoji: 🧠
 colorFrom: yellow
 colorTo: orange
@@ -9,7 +9,7 @@ fullWidth: true
 pinned: true
 ---
 
-# CS581 Project - Reinforcement Learning: From Dynamic Programming to Monte-Carlo
+# CS581 Final Project - Dynamic Programming & Monte-Carlo RL Methods
 
 [Google Slides](https://docs.google.com/presentation/d/1v4WwBQKoPnGiyCMXgUs-pCCJ8IwZqM3thUf-Ky00eTQ/edit?usp=sharing)
 
@@ -48,45 +48,51 @@ Running on local URL: http://127.0.0.1:7860
 
 TODO
 
-**DP Usage:**
-
-```bash
-TODO
-```
-
 ## 2.2. Monte-Carlo Agent
 
 This is the implementation of an On-Policy Monte-Carlo agent to solve several toy problems from the OpenAI Gymnasium.
 
 The agent starts with a randomly initialized epsilon-greedy policy and uses either the first-visit or every-visit Monte-Carlo update method to learn the optimal policy. Training is performed using a soft (epsilon-greedy) policy and testing uses the resulting greedy policy.
 
-Off-policy methods using importance sampling are not implemented for this project.
+### Parameter testing results
 
-Parameter testing results:
+**CliffWalking-v0**
 
-- `run_tests_MC_CliffWalking-v0.sh` (n_train_episodes=2500 and max_steps=200)
-  - Best Update Type: first_visit
-  - Best Gamma: 1.0
-  - Best Epsilon: 0.4
-- `run_tests_MC_FrozenLake-v1.sh` (n_train_episodes=10000 and max_steps=200)
-  - Best Update Type: first_visit
-  - Best Gamma: 1.0
-  - Best Epsilon: 0.4
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_CliffWalking-v0_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_CliffWalking-v0_epsilons.png"/></td>
+</tr>
+</table>
+
+**FrozenLake-v1**
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_FrozenLake-v1_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_FrozenLake-v1_epsilons.png"/></td>
+</tr>
+</table>
+
+**Taxi-v3**
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_Taxi-v3_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_Taxi-v3_epsilons.png"/></td>
+</tr>
+</table>
+
+# 3. Run Script Usage
 
 ```bash
 # Training: Policy will be saved as a `.npy` file.
-python3 run.py --agent "MCAgent" --train
+python3 run.py --agent "MCAgent" --train
 
 # Testing: Use the `--test` flag with the path to the policy file.
-python3 run.py --agent "MCAgent" --test "policies/MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy" --render_mode human
-```
+python3 run.py --agent "MCAgent" --test "./policies/[saved_policy_file].npy" --render_mode human
 
-**MC Usage**
-
-```bash
-usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--update_type {first_visit,every_visit}] [--save_dir SAVE_DIR] [--no_save]
-              [--gamma GAMMA] [--epsilon EPSILON] [--env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}] [--render_mode RENDER_MODE] [--wandb_project WANDB_PROJECT] [--wandb_group WANDB_GROUP] [--wandb_job_type WANDB_JOB_TYPE]
-              [--wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX]
+python3 run.py --help
+usage: run.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] --agent {MCAgent,DPAgent} [--gamma GAMMA] [--epsilon EPSILON] [--update_type {first_visit,every_visit}]
+              [--env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}] [--seed SEED] [--size SIZE] [--render_mode RENDER_MODE] [--save_dir SAVE_DIR] [--no_save] [--run_name_suffix RUN_NAME_SUFFIX] [--wandb_project WANDB_PROJECT] [--wandb_job_type WANDB_JOB_TYPE]
 
 options:
   -h, --help            show this help message and exit
@@ -100,22 +106,24 @@ options:
                         During training, test the agent every n episodes. (default: 100)
   --max_steps MAX_STEPS
                         The maximum number of steps per episode before the episode is forced to end. (default: 200)
-  --update_type {first_visit,every_visit}
-                        The type of update to use. (default: first_visit)
-  --save_dir SAVE_DIR   The directory to save the policy to. (default: policies)
-  --no_save             Use this flag to disable saving the policy.
-  --gamma GAMMA         The value for the discount factor to use. (default: 1.0)
+  --agent {MCAgent,DPAgent}
+                        The agent to use. Currently supports one of: ['MCAgent', 'DPAgent']
+  --gamma GAMMA         The value for the discount factor to use. (default: 0.99)
   --epsilon EPSILON     The value for the epsilon-greedy policy to use. (default: 0.4)
+  --update_type {first_visit,every_visit}
+                        The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)
   --env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}
                         The Gymnasium environment to use. (default: CliffWalking-v0)
+  --seed SEED           The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)
+  --size SIZE           The size to use when generating the FrozenLake environment. (default: 8)
   --render_mode RENDER_MODE
                         Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)
+  --save_dir SAVE_DIR   The directory to save the policy to. (default: policies)
+  --no_save             Use this flag to disable saving the policy.
+  --run_name_suffix RUN_NAME_SUFFIX
+                        Run name suffix for logging and policy checkpointing. (default: None)
   --wandb_project WANDB_PROJECT
                         WandB project name for logging. If not provided, no logging is done. (default: None)
-  --wandb_group WANDB_GROUP
-                        WandB group name for logging. (default: monte-carlo)
   --wandb_job_type WANDB_JOB_TYPE
                         WandB job type for logging. (default: train)
-  --wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX
-                        WandB run name suffix for logging. (default: None)
 ```
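
For context on the README's description of the Monte-Carlo agent (epsilon-greedy behaviour policy during training, first-visit or every-visit returns), here is a minimal first-visit update sketch; the variable names and the tabular `Q`/count layout are illustrative assumptions, not the repository's actual MCAgent internals:

```python
import numpy as np


def epsilon_greedy(Q, state, epsilon, rng):
    """Soft behaviour policy used during training; testing uses the greedy policy."""
    if rng.random() < epsilon:
        return int(rng.integers(Q.shape[1]))
    return int(np.argmax(Q[state]))


def first_visit_update(episode, Q, returns_count, gamma):
    """First-visit Monte-Carlo update over one episode of (state, action, reward) tuples."""
    first_visit = {}
    for t, (s, a, _) in enumerate(episode):
        first_visit.setdefault((s, a), t)
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return.
    for t in reversed(range(len(episode))):
        s, a, r = episode[t]
        G = gamma * G + r
        if first_visit[(s, a)] == t:  # only the first occurrence of (s, a) contributes
            returns_count[s, a] += 1
            Q[s, a] += (G - Q[s, a]) / returns_count[s, a]  # incremental mean of returns
    return Q


# Tiny illustrative run on a 2-state, 2-action problem.
rng = np.random.default_rng(0)
Q = np.zeros((2, 2))
counts = np.zeros((2, 2))
episode = [(0, epsilon_greedy(Q, 0, 0.4, rng), 0.0), (1, 1, 1.0)]
first_visit_update(episode, Q, counts, gamma=1.0)
print(Q)
```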
demo.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import scipy.ndimage
 import cv2
 
-from agents import load_agent
+from utils import load_agent
 
 default_n_test_episodes = 10
 default_max_steps = 500
run.py CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import wandb
 
-from agents import AGENTS_MAP, load_agent
+from utils import AGENTS_MAP, load_agent
 
 
 def main():
@@ -36,7 +36,6 @@ def main():
         default=100,
         help="During training, test the agent every n episodes. (default: 100)",
     )
-
     parser.add_argument(
         "--max_steps",
         type=int,
@@ -44,41 +43,20 @@ def main():
         help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
     )
 
-    parser.add_argument(
-        "--update_type",
-        type=str,
-        choices=["first_visit", "every_visit"],
-        default="first_visit",
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
-    )
-
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        default="policies",
-        help="The directory to save the policy to. (default: policies)",
-    )
-
-    parser.add_argument(
-        "--no_save",
-        action="store_true",
-        help="Use this flag to disable saving the policy.",
-    )
-
     ### Agent parameters
     parser.add_argument(
         "--agent",
         type=str,
         required=True,
         choices=AGENTS_MAP.keys(),
-        help=f"The agent to use. One of: {AGENTS_MAP.keys()}",
+        help=f"The agent to use. Currently supports one of: {list(AGENTS_MAP.keys())}",
     )
 
     parser.add_argument(
         "--gamma",
         type=float,
         default=0.99,
-        help="The value for the discount factor to use. (default: 1.0)",
+        help="The value for the discount factor to use. (default: 0.99)",
     )
     parser.add_argument(
         "--epsilon",
@@ -87,6 +65,14 @@ def main():
         help="The value for the epsilon-greedy policy to use. (default: 0.4)",
     )
 
+    parser.add_argument(
+        "--update_type",
+        type=str,
+        choices=["first_visit", "every_visit"],
+        default="first_visit",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
+    )
+
     ### Environment parameters
     parser.add_argument(
         "--env",
@@ -95,27 +81,43 @@ def main():
         choices=["CliffWalking-v0", "FrozenLake-v1", "Taxi-v3"],
         help="The Gymnasium environment to use. (default: CliffWalking-v0)",
    )
-
     parser.add_argument(
         "--seed",
         type=int,
         default=None,
         help="The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)",
     )
-
     parser.add_argument(
         "--size",
         type=int,
         default=8,
         help="The size to use when generating the FrozenLake environment. (default: 8)",
     )
-
     parser.add_argument(
         "--render_mode",
         type=str,
         default=None,
         help="Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)",
     )
+
+    # Logging and saving parameters
+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        default="policies",
+        help="The directory to save the policy to. (default: policies)",
+    )
+    parser.add_argument(
+        "--no_save",
+        action="store_true",
+        help="Use this flag to disable saving the policy.",
+    )
+    parser.add_argument(
+        "--run_name_suffix",
+        type=str,
+        default=None,
+        help="Run name suffix for logging and policy checkpointing. (default: None)",
+    )
     parser.add_argument(
         "--wandb_project",
         type=str,
@@ -128,12 +130,6 @@ def main():
         default="train",
         help="WandB job type for logging. (default: train)",
     )
-    parser.add_argument(
-        "--wandb_run_name_suffix",
-        type=str,
-        default=None,
-        help="WandB run name suffix for logging. (default: None)",
-    )
 
     args = parser.parse_args()
     print(vars(args))
@@ -143,8 +139,8 @@ def main():
     )
 
     agent.run_name += f"_e{args.n_train_episodes}_s{args.max_steps}"
-    if args.wandb_run_name_suffix is not None:
-        agent.run_name += f"+{args.wandb_run_name_suffix}"
+    if args.run_name_suffix is not None:
+        agent.run_name += f"+{args.run_name_suffix}"
 
     try:
         if args.train:
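
The `--wandb_run_name_suffix` flag is renamed to `--run_name_suffix` because it now affects both logging and the policy checkpoint name. A small illustrative sketch of the run-name assembly visible in the last hunk (the base name comes from the agent class via `super().__init__(run_name=self.__class__.__name__, ...)`; any further checkpoint filename formatting is not shown in this diff):

```python
# Illustrative only: reproduces the run-name assembly shown in the diff above.
n_train_episodes, max_steps, run_name_suffix = 2500, 200, "trial1"

run_name = "MCAgent"  # set from the agent class name in AgentBase
run_name += f"_e{n_train_episodes}_s{max_steps}"
if run_name_suffix is not None:
    run_name += f"+{run_name_suffix}"
print(run_name)  # MCAgent_e2500_s200+trial1
```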
run_tests_MC_CliffWalking-v0.py DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "CliffWalking-v0"
-n_train_episodes = 2500
-max_steps = 200
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
run_tests_MC_FrozenLake-v1.py DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "FrozenLake-v1"
-n_train_episodes = 5000
-max_steps = 200
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
run_tests_MC_Taxi-v3.py DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "Taxi-v3"
-n_train_episodes = 10000
-max_steps = 500
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
test_params.py ADDED
@@ -0,0 +1,76 @@
+import argparse
+import os
+import multiprocessing
+import random
+
+
+def run(args):
+    env, num_tests, wandb_project = args.env, args.num_tests, args.wandb_project
+    agent = "MCAgent"
+
+    vals_update_type = [
+        "first_visit"
+    ]  # Note: Every visit takes too long due to these environments' reward structure
+    vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+    vals_gamma = [1.0, 0.98, 0.96, 0.94]
+
+    if env == "Taxi-v3":
+        n_train_episodes = 10000
+        max_steps = 500
+    elif env == "FrozenLake-v1":
+        n_train_episodes = 5000
+        max_steps = 200
+    elif env == "CliffWalking-v0":
+        n_train_episodes = 2500
+        max_steps = 200
+    else:
+        raise ValueError(f"Unsupported environment: {env}")
+
+
+    def run_test(args):
+        command = f"python3 run.py --train --agent {agent} --env {env}"
+        command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
+        command += f" --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]}"
+        command += f" --run_name_suffix {args[3]}"
+        if wandb_project is not None:
+            command += f" --wandb_project {wandb_project}"
+        command += " --no_save"
+        os.system(command)
+
+    with multiprocessing.Pool(8) as p:
+        tests = []
+        for update_type in vals_update_type:
+            for gamma in vals_gamma:
+                for eps in vals_epsilon:
+                    tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+        random.shuffle(tests)
+
+        p.map(run_test, tests)
+
+
+def main():
+    # argument parsing
+    parser = argparse.ArgumentParser(description="Run parameter tests for MC agent")
+    parser.add_argument(
+        "--env",
+        type=str,
+        default="Taxi-v3",
+        help="environment to run",
+    )
+    parser.add_argument(
+        "--num_tests",
+        type=int,
+        default=10,
+        help="number of tests to run for each parameter combination",
+    )
+    parser.add_argument(
+        "--wandb_project",
+        type=str,
+        default=None,
+        help="wandb project name to log to",
+    )
+
+    args = parser.parse_args()
+
+    run(args)
+
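
The three per-environment sweep scripts deleted above are consolidated into this single parameterised script: the environment, the number of repetitions per parameter combination, and an optional WandB project are now CLI arguments, while the per-environment episode budgets are selected inside `run()`. Based on the argparse definitions shown, a sweep would be launched with something like `python3 test_params.py --env CliffWalking-v0 --num_tests 10 --wandb_project cs581` (the project name here is only an example).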
agents.py → utils.py RENAMED
@@ -8,36 +8,47 @@ AGENTS_MAP = {"MCAgent": MCAgent, "DPAgent": DPAgent}
 
 
 def load_agent(agent_key, **kwargs):
+    """
+    Loads an agent from a file or from the AGENTS_MAP.
+    :param agent_key: Which agent to load. Can be a key in AGENTS_MAP or a path to a policy file ending with ".npy".
+    If a policy file is provided, the agent name, environment name, and other parameters will be parsed from the file name.
+    :param kwargs: Additional arguments to pass to the agent constructor. If loading from a policy file, any conflicting arguments will be overwritten.
+    """
     agent_policy_file = agent_key if agent_key.endswith(".npy") else None
+    # if loading from a policy file, parse the agent key, environment key, and other parameters from the file name
     if agent_policy_file is not None:
         props = os.path.basename(agent_key).split("_")
         try:
+            # Parsing arguments from file name
             agent_key, env_key = props[0], props[1]
-            agent_args = {}
+            parsed_args = {}
             for prop in props[2:]:
                 props_split = prop.split(":")
                 if len(props_split) == 2:
-                    agent_args[props_split[0]] = props_split[1]
+                    parsed_args[props_split[0]] = props_split[1]
                 else:
                     warnings.warn(
                         f"Skipping property {prop} as it does not have the format 'key:value'.",
                         UserWarning,
                     )
-
-            agent_args["env"] = env_key
-            kwargs.update(agent_args)
+            # Overwrite any conflicting arguments with those from the file name
+            parsed_args["env"] = env_key
+            kwargs |= parsed_args
             print("agent_args:", kwargs)
-        except IndexError:
+        except IndexError as e:
             raise ValueError(
-                f"ERROR: Could not parse agent properties. Must be of the format 'AgentName_EnvName_key:value_key:value...'."
-            )
+                "ERROR: Could not parse agent properties. Must be of the format 'AgentName_EnvName_key:value_key:value...'."
+            ) from e
 
+    # Check if agent key is valid
     if agent_key not in AGENTS_MAP:
         raise ValueError(
             f"ERROR: Agent '{agent_key}' not valid. Must be one of: {AGENTS_MAP.keys()}"
         )
 
+    # Load agent based on key and arguments
     agent = AGENTS_MAP[agent_key](**kwargs)
+    # If loading from a policy file, load the policy into the agent
    if agent_policy_file is not None:
        agent.load_policy(agent_policy_file)
 
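
The renamed module keeps the same two entry points, `AGENTS_MAP` and `load_agent`. A hedged usage sketch of the two call paths documented by the new docstring (the policy filename and the keyword arguments below are hypothetical; which constructor parameters each agent actually accepts is an assumption based on the CLI flags in run.py):

```python
from utils import AGENTS_MAP, load_agent

# 1) Construct an agent directly by key; kwargs are passed to its constructor.
agent = load_agent("DPAgent", env="FrozenLake-v1", gamma=0.99)

# 2) Load from a saved policy file (filename shown here is hypothetical). The basename
#    must follow 'AgentName_EnvName_key:value_...'; parsed values overwrite conflicting
#    kwargs before the saved policy is restored via load_policy().
agent = load_agent("policies/MCAgent_CliffWalking-v0_gamma:0.99.npy")
```

Note that `kwargs |= parsed_args` relies on the in-place dict merge operator introduced in Python 3.9, so this module now assumes at least that Python version.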