Andrei Cozma committed
Commit 46b0409 · 1 Parent(s): ebd7110
Shared.py → AgentBase.py RENAMED
@@ -4,7 +4,7 @@ import gymnasium as gym
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 
 
-class Shared:
+class AgentBase:
     def __init__(
        self,
        /,
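
The rename makes the shared base class explicit: both concrete agents inherit from `AgentBase` and forward their keyword arguments to it. A minimal sketch of the resulting pattern, assuming only what the subclass call sites in this commit show (the real `AgentBase` constructor is not part of this diff):

```python
# Sketch only: AgentBase's actual constructor is not shown in this diff; this mirrors
# how the subclasses call it in DPAgent.py and MCAgent.py below.
class AgentBase:
    def __init__(self, /, run_name=None, **kwargs):
        self.run_name = run_name or self.__class__.__name__
        self.kwargs = kwargs  # e.g. env name, gamma, epsilon


class DPAgent(AgentBase):
    def __init__(self, /, **kwargs):
        super().__init__(run_name=self.__class__.__name__, **kwargs)
```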
DPAgent.py CHANGED
@@ -3,11 +3,11 @@ import numpy as np
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
 from tqdm import trange
-from Shared import Shared
+from AgentBase import AgentBase
 import warnings
 
 
-class DPAgent(Shared):
+class DPAgent(AgentBase):
     def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.theta = kwargs.get("theta", 1e-10)
@@ -36,7 +36,10 @@ class DPAgent(Shared):
                 for probability, next_state, reward, done in self.env.P[state][
                     action
                 ]:
-                    if self.env_name == "CliffWalking-v0" and state == self.env.observation_space.n-1:
+                    if (
+                        self.env_name == "CliffWalking-v0"
+                        and state == self.env.observation_space.n - 1
+                    ):
                         reward = 1
                     expected_value += probability * (
                         reward + self.gamma * self.V[next_state]
@@ -53,14 +56,17 @@ class DPAgent(Shared):
             # if i % 5 == 0 and i != 0:
             # self.test(verbose=False)
             print(f"Iteration {i}: delta={delta}")
-
+
         self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
         for s in range(self.env.observation_space.n):
             for a in range(self.env.action_space.n):
                 expected_value = 0
                 for probability, next_state, reward, done in self.env.P[s][a]:
-                    if self.env_name == "CliffWalking-v0" and state == self.env.observation_space.n-1:
-                        reward = 1
+                    if (
+                        self.env_name == "CliffWalking-v0"
+                        and state == self.env.observation_space.n - 1
+                    ):
+                        reward = 1
                     expected_value += probability * (
                         reward + self.gamma * self.V[next_state]
                     )
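
The reformatted condition above is part of a one-step expected-value backup over the environment's tabular model: when the environment is CliffWalking-v0 and the backed-up state is the last state, the reward is overridden to 1 before being discounted. A minimal, self-contained sketch of that backup (illustrative names, not the repository's exact DPAgent code):

```python
import numpy as np


def one_step_backup(P, V, state, action, gamma, env_name, n_states):
    """Expected value of taking `action` in `state` under value estimates V.

    Mirrors the loop in the diff: P[state][action] yields
    (probability, next_state, reward, done) tuples, and CliffWalking-v0's
    last state gets its reward overridden to 1. `done` is unused, as in the diff.
    """
    expected_value = 0.0
    for probability, next_state, reward, done in P[state][action]:
        if env_name == "CliffWalking-v0" and state == n_states - 1:
            reward = 1
        expected_value += probability * (reward + gamma * V[next_state])
    return expected_value


# Tiny illustrative model: two states, one action, deterministic transitions.
P = {0: {0: [(1.0, 1, 0.0, True)]}, 1: {0: [(1.0, 1, 0.0, True)]}}
V = np.zeros(2)
print(one_step_backup(P, V, 0, 0, gamma=0.99, env_name="Toy-v0", n_states=2))  # 0.0
```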
MCAgent.py CHANGED
@@ -1,11 +1,10 @@
 import numpy as np
 from tqdm import tqdm
-from Shared import Shared
 import wandb
-from Shared import Shared
+from AgentBase import AgentBase
 
 
-class MCAgent(Shared):
+class MCAgent(AgentBase):
     def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.reset()
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Reinforcement Learning - From Dynamic Programming to Monte-Carlo
+title: CS581 Final Project Demo - Dynamic Programming & Monte-Carlo RL Methods
 emoji: 🧠
 colorFrom: yellow
 colorTo: orange
@@ -9,7 +9,7 @@ fullWidth: true
 pinned: true
 ---
 
-# CS581 Project - Reinforcement Learning: From Dynamic Programming to Monte-Carlo
+# CS581 Final Project - Dynamic Programming & Monte-Carlo RL Methods
 
 [Google Slides](https://docs.google.com/presentation/d/1v4WwBQKoPnGiyCMXgUs-pCCJ8IwZqM3thUf-Ky00eTQ/edit?usp=sharing)
 
@@ -48,45 +48,51 @@ Running on local URL: http://127.0.0.1:7860
 
 TODO
 
-**DP Usage:**
-
-```bash
-TODO
-```
-
 ## 2.2. Monte-Carlo Agent
 
 This is the implementation of an On-Policy Monte-Carlo agent to solve several toy problems from the OpenAI Gymnasium.
 
 The agent starts with a randomly initialized epsilon-greedy policy and uses either the first-visit or every-visit Monte-Carlo update method to learn the optimal policy. Training is performed using a soft (epsilon-greedy) policy and testing uses the resulting greedy policy.
 
-Off-policy methods using importance sampling are not implemented for this project.
+### Parameter testing results
 
-Parameter testing results:
+**CliffWalking-v0**
 
-- `run_tests_MC_CliffWalking-v0.sh` (n_train_episodes=2500 and max_steps=200)
-  - Best Update Type: first_visit
-  - Best Gamma: 1.0
-  - Best Epsilon: 0.4
-- `run_tests_MC_FrozenLake-v1.sh` (n_train_episodes=10000 and max_steps=200)
-  - Best Update Type: first_visit
-  - Best Gamma: 1.0
-  - Best Epsilon: 0.4
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_CliffWalking-v0_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_CliffWalking-v0_epsilons.png"/></td>
+</tr>
+</table>
+
+**FrozenLake-v1**
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_FrozenLake-v1_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_FrozenLake-v1_epsilons.png"/></td>
+</tr>
+</table>
+
+**Taxi-v3**
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_Taxi-v3_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_Taxi-v3_epsilons.png"/></td>
+</tr>
+</table>
+
+# 3. Run Script Usage
 
 ```bash
 # Training: Policy will be saved as a `.npy` file.
-python3 run.py --agent "MCAgent" --train
+python3 run.py --agent "MCAgent" --train
 
 # Testing: Use the `--test` flag with the path to the policy file.
-python3 run.py --agent "MCAgent" --test "policies/MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy" --render_mode human
-```
+python3 run.py --agent "MCAgent" --test "./policies/[saved_policy_file].npy" --render_mode human
 
-**MC Usage**
-
-```bash
-usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--update_type {first_visit,every_visit}] [--save_dir SAVE_DIR] [--no_save]
-              [--gamma GAMMA] [--epsilon EPSILON] [--env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}] [--render_mode RENDER_MODE] [--wandb_project WANDB_PROJECT] [--wandb_group WANDB_GROUP] [--wandb_job_type WANDB_JOB_TYPE]
-              [--wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX]
+python3 run.py --help
+usage: run.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] --agent {MCAgent,DPAgent} [--gamma GAMMA] [--epsilon EPSILON] [--update_type {first_visit,every_visit}]
+              [--env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}] [--seed SEED] [--size SIZE] [--render_mode RENDER_MODE] [--save_dir SAVE_DIR] [--no_save] [--run_name_suffix RUN_NAME_SUFFIX] [--wandb_project WANDB_PROJECT] [--wandb_job_type WANDB_JOB_TYPE]
 
 options:
   -h, --help            show this help message and exit
@@ -100,22 +106,24 @@ options:
                         During training, test the agent every n episodes. (default: 100)
   --max_steps MAX_STEPS
                         The maximum number of steps per episode before the episode is forced to end. (default: 200)
-  --update_type {first_visit,every_visit}
-                        The type of update to use. (default: first_visit)
-  --save_dir SAVE_DIR   The directory to save the policy to. (default: policies)
-  --no_save             Use this flag to disable saving the policy.
-  --gamma GAMMA         The value for the discount factor to use. (default: 1.0)
+  --agent {MCAgent,DPAgent}
+                        The agent to use. Currently supports one of: ['MCAgent', 'DPAgent']
+  --gamma GAMMA         The value for the discount factor to use. (default: 0.99)
   --epsilon EPSILON     The value for the epsilon-greedy policy to use. (default: 0.4)
+  --update_type {first_visit,every_visit}
+                        The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)
   --env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}
                         The Gymnasium environment to use. (default: CliffWalking-v0)
+  --seed SEED           The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)
+  --size SIZE           The size to use when generating the FrozenLake environment. (default: 8)
   --render_mode RENDER_MODE
                         Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)
+  --save_dir SAVE_DIR   The directory to save the policy to. (default: policies)
+  --no_save             Use this flag to disable saving the policy.
+  --run_name_suffix RUN_NAME_SUFFIX
+                        Run name suffix for logging and policy checkpointing. (default: None)
   --wandb_project WANDB_PROJECT
                         WandB project name for logging. If not provided, no logging is done. (default: None)
-  --wandb_group WANDB_GROUP
-                        WandB group name for logging. (default: monte-carlo)
   --wandb_job_type WANDB_JOB_TYPE
                         WandB job type for logging. (default: train)
-  --wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX
-                        WandB run name suffix for logging. (default: None)
 ```
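
For context on the README's description of the Monte-Carlo agent (epsilon-greedy behaviour policy during training, first-visit or every-visit returns), here is a minimal first-visit update sketch; the variable names and the tabular `Q`/count layout are illustrative assumptions, not the repository's actual MCAgent internals:

```python
import numpy as np


def epsilon_greedy(Q, state, epsilon, rng):
    """Soft behaviour policy used during training; testing uses the greedy policy."""
    if rng.random() < epsilon:
        return int(rng.integers(Q.shape[1]))
    return int(np.argmax(Q[state]))


def first_visit_update(episode, Q, returns_count, gamma):
    """First-visit Monte-Carlo update over one episode of (state, action, reward) tuples."""
    first_visit = {}
    for t, (s, a, _) in enumerate(episode):
        first_visit.setdefault((s, a), t)
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return.
    for t in reversed(range(len(episode))):
        s, a, r = episode[t]
        G = gamma * G + r
        if first_visit[(s, a)] == t:  # only the first occurrence of (s, a) contributes
            returns_count[s, a] += 1
            Q[s, a] += (G - Q[s, a]) / returns_count[s, a]  # incremental mean of returns
    return Q


# Tiny illustrative run on a 2-state, 2-action problem.
rng = np.random.default_rng(0)
Q = np.zeros((2, 2))
counts = np.zeros((2, 2))
episode = [(0, epsilon_greedy(Q, 0, 0.4, rng), 0.0), (1, 1, 1.0)]
first_visit_update(episode, Q, counts, gamma=1.0)
print(Q)
```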
demo.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import scipy.ndimage
 import cv2
 
-from agents import load_agent
+from utils import load_agent
 
 default_n_test_episodes = 10
 default_max_steps = 500
run.py CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import wandb
 
-from agents import AGENTS_MAP, load_agent
+from utils import AGENTS_MAP, load_agent
 
 
 def main():
@@ -36,7 +36,6 @@ def main():
         default=100,
         help="During training, test the agent every n episodes. (default: 100)",
     )
-
     parser.add_argument(
         "--max_steps",
         type=int,
@@ -44,41 +43,20 @@ def main():
         help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
     )
 
-    parser.add_argument(
-        "--update_type",
-        type=str,
-        choices=["first_visit", "every_visit"],
-        default="first_visit",
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
-    )
-
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        default="policies",
-        help="The directory to save the policy to. (default: policies)",
-    )
-
-    parser.add_argument(
-        "--no_save",
-        action="store_true",
-        help="Use this flag to disable saving the policy.",
-    )
-
     ### Agent parameters
     parser.add_argument(
         "--agent",
         type=str,
         required=True,
         choices=AGENTS_MAP.keys(),
-        help=f"The agent to use. One of: {AGENTS_MAP.keys()}",
+        help=f"The agent to use. Currently supports one of: {list(AGENTS_MAP.keys())}",
     )
 
     parser.add_argument(
         "--gamma",
         type=float,
         default=0.99,
-        help="The value for the discount factor to use. (default: 1.0)",
+        help="The value for the discount factor to use. (default: 0.99)",
     )
     parser.add_argument(
         "--epsilon",
@@ -87,6 +65,14 @@ def main():
         help="The value for the epsilon-greedy policy to use. (default: 0.4)",
     )
 
+    parser.add_argument(
+        "--update_type",
+        type=str,
+        choices=["first_visit", "every_visit"],
+        default="first_visit",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
+    )
+
     ### Environment parameters
     parser.add_argument(
         "--env",
@@ -95,27 +81,43 @@ def main():
         choices=["CliffWalking-v0", "FrozenLake-v1", "Taxi-v3"],
         help="The Gymnasium environment to use. (default: CliffWalking-v0)",
    )
-
     parser.add_argument(
         "--seed",
         type=int,
         default=None,
         help="The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)",
     )
-
     parser.add_argument(
         "--size",
         type=int,
         default=8,
         help="The size to use when generating the FrozenLake environment. (default: 8)",
     )
-
     parser.add_argument(
         "--render_mode",
         type=str,
         default=None,
         help="Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)",
     )
+
+    # Logging and saving parameters
+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        default="policies",
+        help="The directory to save the policy to. (default: policies)",
+    )
+    parser.add_argument(
+        "--no_save",
+        action="store_true",
+        help="Use this flag to disable saving the policy.",
+    )
+    parser.add_argument(
+        "--run_name_suffix",
+        type=str,
+        default=None,
+        help="Run name suffix for logging and policy checkpointing. (default: None)",
+    )
     parser.add_argument(
         "--wandb_project",
         type=str,
@@ -128,12 +130,6 @@ def main():
         default="train",
         help="WandB job type for logging. (default: train)",
     )
-    parser.add_argument(
-        "--wandb_run_name_suffix",
-        type=str,
-        default=None,
-        help="WandB run name suffix for logging. (default: None)",
-    )
 
     args = parser.parse_args()
     print(vars(args))
@@ -143,8 +139,8 @@ def main():
     )
 
     agent.run_name += f"_e{args.n_train_episodes}_s{args.max_steps}"
-    if args.wandb_run_name_suffix is not None:
-        agent.run_name += f"+{args.wandb_run_name_suffix}"
+    if args.run_name_suffix is not None:
+        agent.run_name += f"+{args.run_name_suffix}"
 
     try:
         if args.train:
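
The `--wandb_run_name_suffix` flag is renamed to `--run_name_suffix` because it now affects both logging and the policy checkpoint name. A small illustrative sketch of the run-name assembly visible in the last hunk (the base name comes from the agent class via `super().__init__(run_name=self.__class__.__name__, ...)`; any further checkpoint filename formatting is not shown in this diff):

```python
# Illustrative only: reproduces the run-name assembly shown in the diff above.
n_train_episodes, max_steps, run_name_suffix = 2500, 200, "trial1"

run_name = "MCAgent"  # set from the agent class name in AgentBase
run_name += f"_e{n_train_episodes}_s{max_steps}"
if run_name_suffix is not None:
    run_name += f"+{run_name_suffix}"
print(run_name)  # MCAgent_e2500_s200+trial1
```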
run_tests_MC_CliffWalking-v0.py DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "CliffWalking-v0"
-n_train_episodes = 2500
-max_steps = 200
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
run_tests_MC_FrozenLake-v1.py DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "FrozenLake-v1"
-n_train_episodes = 5000
-max_steps = 200
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
run_tests_MC_Taxi-v3.py DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "Taxi-v3"
-n_train_episodes = 10000
-max_steps = 500
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
test_params.py ADDED
@@ -0,0 +1,76 @@
+import argparse
+import os
+import multiprocessing
+import random
+
+
+def run(args):
+    env, num_tests, wandb_project = args.env, args.num_tests, args.wandb_project
+    agent = "MCAgent"
+
+    vals_update_type = [
+        "first_visit"
+    ]  # Note: Every visit takes too long due to these environments' reward structure
+    vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+    vals_gamma = [1.0, 0.98, 0.96, 0.94]
+
+    if env == "Taxi-v3":
+        n_train_episodes = 10000
+        max_steps = 500
+    elif env == "FrozenLake-v1":
+        n_train_episodes = 5000
+        max_steps = 200
+    elif env == "CliffWalking-v0":
+        n_train_episodes = 2500
+        max_steps = 200
+    else:
+        raise ValueError(f"Unsupported environment: {env}")
+
+
+    def run_test(args):
+        command = f"python3 run.py --train --agent {agent} --env {env}"
+        command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
+        command += f" --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]}"
+        command += f" --run_name_suffix {args[3]}"
+        if wandb_project is not None:
+            command += f" --wandb_project {wandb_project}"
+        command += " --no_save"
+        os.system(command)
+
+    with multiprocessing.Pool(8) as p:
+        tests = []
+        for update_type in vals_update_type:
+            for gamma in vals_gamma:
+                for eps in vals_epsilon:
+                    tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+        random.shuffle(tests)
+
+        p.map(run_test, tests)
+
+
+def main():
+    # argument parsing
+    parser = argparse.ArgumentParser(description="Run parameter tests for MC agent")
+    parser.add_argument(
+        "--env",
+        type=str,
+        default="Taxi-v3",
+        help="environment to run",
+    )
+    parser.add_argument(
+        "--num_tests",
+        type=int,
+        default=10,
+        help="number of tests to run for each parameter combination",
+    )
+    parser.add_argument(
+        "--wandb_project",
+        type=str,
+        default=None,
+        help="wandb project name to log to",
+    )
+
+    args = parser.parse_args()
+
+    run(args)
+
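
The three per-environment sweep scripts deleted above are consolidated into this single parameterised script: the environment, the number of repetitions per parameter combination, and an optional WandB project are now CLI arguments, while the per-environment episode budgets are selected inside `run()`. Based on the argparse definitions shown, a sweep would be launched with something like `python3 test_params.py --env CliffWalking-v0 --num_tests 10 --wandb_project cs581` (the project name here is only an example).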
agents.py → utils.py RENAMED
@@ -8,36 +8,47 @@ AGENTS_MAP = {"MCAgent": MCAgent, "DPAgent": DPAgent}
 
 
 def load_agent(agent_key, **kwargs):
+    """
+    Loads an agent from a file or from the AGENTS_MAP.
+    :param agent_key: Which agent to load. Can be a key in AGENTS_MAP or a path to a policy file ending with ".npy".
+    If a policy file is provided, the agent name, environment name, and other parameters will be parsed from the file name.
+    :param kwargs: Additional arguments to pass to the agent constructor. If loading from a policy file, any conflicting arguments will be overwritten.
+    """
     agent_policy_file = agent_key if agent_key.endswith(".npy") else None
+    # if loading from a policy file, parse the agent key, environment key, and other parameters from the file name
     if agent_policy_file is not None:
         props = os.path.basename(agent_key).split("_")
         try:
+            # Parsing arguments from file name
             agent_key, env_key = props[0], props[1]
-            agent_args = {}
+            parsed_args = {}
             for prop in props[2:]:
                 props_split = prop.split(":")
                 if len(props_split) == 2:
-                    agent_args[props_split[0]] = props_split[1]
+                    parsed_args[props_split[0]] = props_split[1]
                 else:
                     warnings.warn(
                         f"Skipping property {prop} as it does not have the format 'key:value'.",
                         UserWarning,
                     )
-
-            agent_args["env"] = env_key
-            kwargs.update(agent_args)
+            # Overwrite any conflicting arguments with those from the file name
+            parsed_args["env"] = env_key
+            kwargs |= parsed_args
             print("agent_args:", kwargs)
-        except IndexError:
+        except IndexError as e:
             raise ValueError(
-                f"ERROR: Could not parse agent properties. Must be of the format 'AgentName_EnvName_key:value_key:value...'."
-            )
+                "ERROR: Could not parse agent properties. Must be of the format 'AgentName_EnvName_key:value_key:value...'."
+            ) from e
 
+    # Check if agent key is valid
     if agent_key not in AGENTS_MAP:
         raise ValueError(
             f"ERROR: Agent '{agent_key}' not valid. Must be one of: {AGENTS_MAP.keys()}"
         )
 
+    # Load agent based on key and arguments
     agent = AGENTS_MAP[agent_key](**kwargs)
+    # If loading from a policy file, load the policy into the agent
    if agent_policy_file is not None:
        agent.load_policy(agent_policy_file)
 
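
The renamed module keeps the same two entry points, `AGENTS_MAP` and `load_agent`. A hedged usage sketch of the two call paths documented by the new docstring (the policy filename and the keyword arguments below are hypothetical; which constructor parameters each agent actually accepts is an assumption based on the CLI flags in run.py):

```python
from utils import AGENTS_MAP, load_agent

# 1) Construct an agent directly by key; kwargs are passed to its constructor.
agent = load_agent("DPAgent", env="FrozenLake-v1", gamma=0.99)

# 2) Load from a saved policy file (filename shown here is hypothetical). The basename
#    must follow 'AgentName_EnvName_key:value_...'; parsed values overwrite conflicting
#    kwargs before the saved policy is restored via load_policy().
agent = load_agent("policies/MCAgent_CliffWalking-v0_gamma:0.99.npy")
```

Note that `kwargs |= parsed_args` relies on the in-place dict merge operator introduced in Python 3.9, so this module now assumes at least that Python version.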