Andrei Cozma committed
Commit cb18290 · 1 Parent(s): fe4182f
MonteCarloAgent.py CHANGED
```diff
@@ -178,7 +178,7 @@ def main():
     parser.add_argument(
         "--train",
         action="store_true",
-        help="Use this flag to train the agent. (default: False)",
+        help="Use this flag to train the agent.",
     )
     parser.add_argument(
         "--test",
@@ -190,26 +190,26 @@ def main():
         "--n_train_episodes",
         type=int,
         default=2000,
-        help="The number of episodes to train for.",
+        help="The number of episodes to train for. (default: 2000)",
     )
     parser.add_argument(
         "--n_test_episodes",
         type=int,
         default=100,
-        help="The number of episodes to test for.",
+        help="The number of episodes to test for. (default: 100)",
     )
     parser.add_argument(
         "--test_every",
         type=int,
         default=100,
-        help="During training, test the agent every n episodes.",
+        help="During training, test the agent every n episodes. (default: 100)",
     )
 
     parser.add_argument(
         "--max_steps",
         type=int,
         default=500,
-        help="The maximum number of steps per episode before the episode is forced to end.",
+        help="The maximum number of steps per episode before the episode is forced to end. (default: 500)",
     )
 
     ### Agent parameters
@@ -217,13 +217,13 @@ def main():
         "--gamma",
         type=float,
         default=0.99,
-        help="The value for the discount factor to use.",
+        help="The value for the discount factor to use. (default: 0.99)",
     )
     parser.add_argument(
         "--epsilon",
         type=float,
         default=0.1,
-        help="The value for the epsilon-greedy policy to use.",
+        help="The value for the epsilon-greedy policy to use. (default: 0.1)",
     )
 
     ### Environment parameters
@@ -231,19 +231,19 @@ def main():
         "--env",
         type=str,
         default="CliffWalking-v0",
-        help="The Gymnasium environment to use.",
+        help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
     parser.add_argument(
         "--render_mode",
         type=str,
         default=None,
-        help="The render mode to use. By default, no rendering is done. To render the environment, set this to 'human'.",
+        help="Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)",
     )
     parser.add_argument(
         "--wandb_project",
         type=str,
         default=None,
-        help="WandB project name for logging. If not provided, no logging is done.",
+        help="WandB project name for logging. If not provided, no logging is done. (default: None)",
     )
     parser.add_argument(
         "--wandb_group",
```
README.md CHANGED
````diff
@@ -4,6 +4,63 @@
 
 Evolution of Reinforcement Learning methods from pure Dynamic Programming-based methods to Monte Carlo methods + Bellman Optimization Comparison
 
+## Monte-Carlo Agent
+
+An implementation of an epsilon-greedy Monte-Carlo agent for the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment.
+
+### Training
+
+```bash
+python3 MonteCarloAgent.py --train
+```
+
+The final policy will be saved to a `.npy` file.
+
+### Testing
+
+Provide the path to the policy file as an argument to the `--test` flag.
+
+```bash
+python3 MonteCarloAgent.py --test policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy
+```
+
+### Visualization
+
+```bash
+python3 MonteCarloAgent.py --test policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy --render_mode human
+```
+
+### Default Parameters
+
+```text
+usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--gamma GAMMA] [--epsilon EPSILON] [--env ENV]
+                          [--render_mode RENDER_MODE] [--wandb_project WANDB_PROJECT] [--wandb_group WANDB_GROUP] [--wandb_job_type WANDB_JOB_TYPE]
+
+options:
+  -h, --help            show this help message and exit
+  --train               Use this flag to train the agent. (default: False)
+  --test TEST           Use this flag to test the agent. Provide the path to the policy file.
+  --n_train_episodes N_TRAIN_EPISODES
+                        The number of episodes to train for.
+  --n_test_episodes N_TEST_EPISODES
+                        The number of episodes to test for.
+  --test_every TEST_EVERY
+                        During training, test the agent every n episodes.
+  --max_steps MAX_STEPS
+                        The maximum number of steps per episode before the episode is forced to end.
+  --gamma GAMMA         The value for the discount factor to use.
+  --epsilon EPSILON     The value for the epsilon-greedy policy to use.
+  --env ENV             The Gymnasium environment to use.
+  --render_mode RENDER_MODE
+                        The render mode to use. By default, no rendering is done. To render the environment, set this to 'human'.
+  --wandb_project WANDB_PROJECT
+                        WandB project name for logging. If not provided, no logging is done.
+  --wandb_group WANDB_GROUP
+                        WandB group name for logging. (default: monte-carlo)
+  --wandb_job_type WANDB_JOB_TYPE
+                        WandB job type for logging. (default: train)
+```
+
 ## Presentation Guide (Text Version)
 
 1. Title Slide: list the title of your talk along with your name
````
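The new README section describes the agent as epsilon-greedy, but the selection rule itself is not visible in this diff. As a hedged illustration only, epsilon-greedy action selection over a tabular policy typically looks like the sketch below; the `Pi` table, its shape, and the helper name are assumptions, not code from MonteCarloAgent.py:

```python
import numpy as np

rng = np.random.default_rng(0)

def epsilon_greedy_action(Pi, state, epsilon=0.1):
    """Explore with probability epsilon, otherwise act greedily.

    Assumes Pi is an (n_states, n_actions) table of action scores or
    probabilities; the exact layout used by MonteCarloAgent.py is not
    visible in this commit.
    """
    n_actions = Pi.shape[1]
    if rng.random() < epsilon:
        return int(rng.integers(n_actions))  # random exploratory action
    return int(np.argmax(Pi[state]))         # greedy action

# Example: 48 states x 4 actions matches CliffWalking-v0's tabular layout.
Pi = np.full((48, 4), 0.25)
print(epsilon_greedy_action(Pi, state=36))
```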
policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy ADDED
Binary file (1.66 kB)
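At 1.66 kB, the committed file is consistent with a plain NumPy array (a 48 × 4 float64 table is about 1.6 kB). As a sketch under that assumption, here is how such a policy file could be loaded and rolled out greedily with Gymnasium; the `(n_states, n_actions)` layout and greedy readout are guesses, and `--test` in MonteCarloAgent.py remains the supported path:

```python
import gymnasium as gym
import numpy as np

# Assumption: the .npy file stores an (n_states, n_actions) policy table.
Pi = np.load("policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy")

# render_mode="human" mirrors the README's visualization example.
env = gym.make("CliffWalking-v0", render_mode="human")
state, _ = env.reset()

done = False
while not done:
    action = int(np.argmax(Pi[state]))  # follow the learned policy greedily
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

env.close()
```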