Commit cb18290
Andrei Cozma committed · 1 Parent(s): fe4182f

Updates
Files changed:
- MonteCarloAgent.py (+10 -10)
- README.md (+57 -0)
- policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy (+0 -0)
MonteCarloAgent.py CHANGED

@@ -178,7 +178,7 @@ def main():
     parser.add_argument(
         "--train",
         action="store_true",
-        help="Use this flag to train the agent.
+        help="Use this flag to train the agent.",
     )
     parser.add_argument(
         "--test",
@@ -190,26 +190,26 @@ def main():
         "--n_train_episodes",
         type=int,
         default=2000,
-        help="The number of episodes to train for.",
+        help="The number of episodes to train for. (default: 2000)",
     )
     parser.add_argument(
         "--n_test_episodes",
         type=int,
         default=100,
-        help="The number of episodes to test for.",
+        help="The number of episodes to test for. (default: 100)",
     )
     parser.add_argument(
         "--test_every",
         type=int,
         default=100,
-        help="During training, test the agent every n episodes.",
+        help="During training, test the agent every n episodes. (default: 100)",
     )
 
     parser.add_argument(
         "--max_steps",
         type=int,
         default=500,
-        help="The maximum number of steps per episode before the episode is forced to end.",
+        help="The maximum number of steps per episode before the episode is forced to end. (default: 500)",
     )
 
     ### Agent parameters
@@ -217,13 +217,13 @@ def main():
         "--gamma",
         type=float,
         default=0.99,
-        help="The value for the discount factor to use.",
+        help="The value for the discount factor to use. (default: 0.99)",
     )
     parser.add_argument(
         "--epsilon",
         type=float,
         default=0.1,
-        help="The value for the epsilon-greedy policy to use.",
+        help="The value for the epsilon-greedy policy to use. (default: 0.1)",
     )
 
     ### Environment parameters
@@ -231,19 +231,19 @@ def main():
         "--env",
         type=str,
         default="CliffWalking-v0",
-        help="The Gymnasium environment to use.",
+        help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
     parser.add_argument(
         "--render_mode",
         type=str,
         default=None,
-        help="
+        help="Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)",
     )
     parser.add_argument(
         "--wandb_project",
         type=str,
         default=None,
-        help="WandB project name for logging. If not provided, no logging is done.",
+        help="WandB project name for logging. If not provided, no logging is done. (default: None)",
     )
     parser.add_argument(
         "--wandb_group",
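The commit writes the "(default: ...)" suffix into each help string by hand. For reference, argparse can render defaults automatically via `argparse.ArgumentDefaultsHelpFormatter`; the sketch below is a minimal illustration of that alternative, not code from this repository.

```python
# Minimal sketch (not this repository's code): argparse can append the
# "(default: ...)" suffix automatically via ArgumentDefaultsHelpFormatter,
# instead of hand-editing every help string as this commit does.
import argparse

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
    "--n_train_episodes", type=int, default=2000,
    help="The number of episodes to train for.",
)
parser.add_argument(
    "--gamma", type=float, default=0.99,
    help="The value for the discount factor to use.",
)
args = parser.parse_args()
# `--help` now shows e.g. "The number of episodes to train for. (default: 2000)"
```

The trade-off is that the formatter appends defaults to every option uniformly, whereas the hand-written suffixes in this commit let individual options omit or word the default differently.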
README.md CHANGED

@@ -4,6 +4,63 @@
 
 Evolution of Reinforcement Learning methods from pure Dynamic Programming-based methods to Monte Carlo methods + Bellman Optimization Comparison
 
+## Monte-Carlo Agent
+
+The implementation of the epsilon-greedy Monte-Carlo agent for the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment.
+
+### Training
+
+```bash
+python3 MonteCarloAgent.py --train
+```
+
+The final policy will be saved to a `.npy` file.
+
+### Testing
+
+Provide the path to the policy file as an argument to the `--test` flag.
+
+```bash
+python3 MonteCarloAgent.py --test policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy
+```
+
+### Visualization
+
+```bash
+python3 MonteCarloAgent.py --test policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy --render_mode human
+```
+
+### Default Parameters
+
+```python
+usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--gamma GAMMA] [--epsilon EPSILON] [--env ENV]
+                          [--render_mode RENDER_MODE] [--wandb_project WANDB_PROJECT] [--wandb_group WANDB_GROUP] [--wandb_job_type WANDB_JOB_TYPE]
+
+options:
+  -h, --help            show this help message and exit
+  --train               Use this flag to train the agent. (default: False)
+  --test TEST           Use this flag to test the agent. Provide the path to the policy file.
+  --n_train_episodes N_TRAIN_EPISODES
+                        The number of episodes to train for.
+  --n_test_episodes N_TEST_EPISODES
+                        The number of episodes to test for.
+  --test_every TEST_EVERY
+                        During training, test the agent every n episodes.
+  --max_steps MAX_STEPS
+                        The maximum number of steps per episode before the episode is forced to end.
+  --gamma GAMMA         The value for the discount factor to use.
+  --epsilon EPSILON     The value for the epsilon-greedy policy to use.
+  --env ENV             The Gymnasium environment to use.
+  --render_mode RENDER_MODE
+                        The render mode to use. By default, no rendering is done. To render the environment, set this to 'human'.
+  --wandb_project WANDB_PROJECT
+                        WandB project name for logging. If not provided, no logging is done.
+  --wandb_group WANDB_GROUP
+                        WandB group name for logging. (default: monte-carlo)
+  --wandb_job_type WANDB_JOB_TYPE
+                        WandB job type for logging. (default: train)
+```
+
 ## Presentation Guide (Text Version)
 
 1. Title Slide: list the title of your talk along with your name
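The README section above names an epsilon-greedy Monte-Carlo agent. As a rough illustration of what that combination means (not the repository's actual `MonteCarloAgent.py`; all identifiers and the every-visit update rule here are assumptions), a minimal sketch:

```python
# Illustrative sketch of the "epsilon-greedy Monte-Carlo" idea the README
# names. Everything here is assumed for illustration; this is not the
# repository's MonteCarloAgent.py.
import numpy as np

n_states, n_actions = 48, 4  # CliffWalking-v0 has a 4 x 12 grid of states
epsilon, gamma = 0.1, 0.99   # the script's default hyperparameters

rng = np.random.default_rng(0)
Q = np.zeros((n_states, n_actions))       # action-value estimates
visits = np.zeros((n_states, n_actions))  # per-pair visit counts

def choose_action(state):
    """Epsilon-greedy: explore with probability epsilon, otherwise exploit."""
    if rng.random() < epsilon:
        return int(rng.integers(n_actions))
    return int(Q[state].argmax())

def update_from_episode(episode):
    """Every-visit Monte-Carlo update from one list of (state, action, reward)."""
    G = 0.0
    for state, action, reward in reversed(episode):
        G = gamma * G + reward  # discounted return following this step
        visits[state, action] += 1
        # incremental mean of the observed returns for this pair
        Q[state, action] += (G - Q[state, action]) / visits[state, action]
```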
policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy ADDED

Binary file (1.66 kB)
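The committed `.npy` policy can presumably be inspected with NumPy. A minimal sketch, assuming the file stores one row of action preferences per state; the shape and semantics are assumptions, and only the filename comes from the commit:

```python
# Minimal sketch for inspecting the committed policy file. Only the filename
# comes from the commit; the assumed layout (48 states x 4 actions, one row
# of action preferences per state) is a guess based on CliffWalking-v0.
import numpy as np

policy = np.load("policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy")
print(policy.shape)

# If the guess holds, the greedy action per state renders as a 4 x 12 grid.
greedy = policy.argmax(axis=-1)
print(greedy.reshape(4, 12))
```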