Andrei Cozma committed
Commit 1663f39 · 1 Parent(s): 3c4eba9

Updates

Files changed:

- MonteCarloAgent.py (+90 -25)
- README.md (+23 -61)
- demo.py (+59 -5)
- policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy (+0 -0)
- policies/MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy (+0 -0)
- run_tests_MC_CliffWalking-v0.py (+34 -0)
- run_tests.py → run_tests_MC_FrozenLake-v1.py (+13 -7)
MonteCarloAgent.py
CHANGED

```diff
@@ -3,18 +3,36 @@ import numpy as np
 import gymnasium as gym
 from tqdm import tqdm
 import argparse
+from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 import wandb
 
 
 class MonteCarloAgent:
-    def __init__(
+    def __init__(
+        self,
+        env_name="CliffWalking-v0",
+        gamma=0.99,
+        epsilon=0.1,
+        run_name=None,
+        **kwargs,
+    ):
         print("=" * 80)
         print(f"# MonteCarloAgent - {env_name}")
         print(f"- epsilon: {epsilon}")
         print(f"- gamma: {gamma}")
+        print(f"- run_name: {run_name}")
+        self.run_name = run_name
+        self.env_name = env_name
         self.epsilon, self.gamma = epsilon, gamma
+
+        self.env_kwargs = kwargs
+        if self.env_name == "FrozenLake-v1":
+            self.env_kwargs["desc"] = None
+            self.env_kwargs["map_name"] = "4x4"
+            self.env_kwargs["is_slippery"] = "False"
+
+        self.env = gym.make(self.env_name, **self.env_kwargs)
+
         self.n_states, self.n_actions = (
             self.env.observation_space.n,
             self.env.action_space.n,
@@ -43,17 +61,21 @@ class MonteCarloAgent:
         print(self.Pi)
         print("=" * 80)
 
-    def choose_action(self, state,
+    def choose_action(self, state, epsilon_override=None, greedy=False, **kwargs):
         # Sample an action from the policy.
         # The override_epsilon argument allows forcing the use of a possibly new self.epsilon value than the one used during training.
         # The ability to override was mostly added for testing purposes and for the demo.
+        greedy_action = np.argmax(self.Pi[state])
 
-        if
+        if greedy:
+            return greedy_action
+
+        if epsilon_override is None:
             return np.random.choice(self.n_actions, p=self.Pi[state])
 
         return np.random.choice(
-            [
-            p=[1 -
+            [greedy_action, np.random.randint(self.n_actions)],
+            p=[1 - epsilon_override, epsilon_override],
         )
 
     def generate_episode(self, max_steps=500, render=False, **kwargs):
@@ -61,24 +83,36 @@ class MonteCarloAgent:
         episode_hist, solved, rgb_array = [], False, None
 
         # Generate an episode following the current policy
+        while len(episode_hist) < max_steps:
             rgb_array = self.env.render() if render else None
+
             # Sample an action from the policy
             action = self.choose_action(state, **kwargs)
             # Take the action and observe the reward and next state
             next_state, reward, done, truncated, _ = self.env.step(action)
+
             # Keeping track of the trajectory
             episode_hist.append((state, action, reward))
-            state = next_state
             yield episode_hist, solved, rgb_array
 
+            # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
+            if done and (
+                self.env_name == "CliffWalking-v0" or self.env_name == "Taxi-v3"
+            ):
                 solved = True
                 break
 
+            # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
+            # We consider the episode solved when the agent reaches the goal (done == True and reward == 1)
+            if done and self.env_name == "FrozenLake-v1" and reward == 1:
+                solved = True
+                break
+
+            if done or truncated:
+                break
+
+            state = next_state
+
         rgb_array = self.env.render() if render else None
 
         yield episode_hist, solved, rgb_array
@@ -135,14 +169,24 @@ class MonteCarloAgent:
         test_every=100,
         update_type="first_visit",
         log_wandb=False,
+        save_best=True,
+        save_best_dir=None,
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
 
+        (
+            train_running_success_rate,
+            test_success_rate,
+            test_running_success_rate,
+            avg_ep_len,
+        ) = (0.0, 0.0, 0.0, 0.0)
+
         stats = {
             "train_running_success_rate": train_running_success_rate,
+            "test_running_success_rate": test_running_success_rate,
             "test_success_rate": test_success_rate,
+            "avg_ep_len": avg_ep_len,
         }
 
         update_func = getattr(self, f"update_{update_type}")
@@ -157,36 +201,52 @@ class MonteCarloAgent:
             episode_hist, solved, _ = self.run_episode(**kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
+
             train_running_success_rate = (
                 0.99 * train_running_success_rate + 0.01 * solved
             )
+            avg_ep_len = 0.99 * avg_ep_len + 0.01 * len(episode_hist)
+
             update_func(episode_hist)
 
             stats = {
                 "train_running_success_rate": train_running_success_rate,
+                "test_running_success_rate": test_running_success_rate,
                 "test_success_rate": test_success_rate,
+                "avg_ep_len": avg_ep_len,
                 "total_reward": total_reward,
                 "avg_reward": avg_reward,
             }
             tqrange.set_postfix(stats)
 
+            # Test the agent every test_every episodes with the greedy policy (by default)
             if e % test_every == 0:
                 test_success_rate = self.test(verbose=False, **kwargs)
+                if save_best and test_success_rate > 0.9:
+                    if self.run_name is None:
+                        print(f"Warning: run_name is None, not saving best policy")
+                    else:
+                        self.save_policy(self.run_name, save_best_dir)
+
                 if log_wandb:
                     self.wandb_log_img(episode=e)
 
+                test_running_success_rate = (
+                    0.99 * test_running_success_rate + 0.01 * test_success_rate
+                )
+                stats["test_running_success_rate"] = test_running_success_rate
                 stats["test_success_rate"] = test_success_rate
                 tqrange.set_postfix(stats)
 
             if log_wandb:
                 wandb.log(stats)
 
-    def test(self, n_test_episodes=100, verbose=True, **kwargs):
+    def test(self, n_test_episodes=100, verbose=True, greedy=True, **kwargs):
         if verbose:
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(**kwargs)
+            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"
@@ -247,8 +307,8 @@ def main():
     parser.add_argument(
         "--n_train_episodes",
         type=int,
-        default=
-        help="The number of episodes to train for. (default:
+        default=2500,
+        help="The number of episodes to train for. (default: 2500)",
     )
     parser.add_argument(
         "--n_test_episodes",
@@ -266,8 +326,8 @@ def main():
     parser.add_argument(
         "--max_steps",
         type=int,
-        default=
-        help="The maximum number of steps per episode before the episode is forced to end. (default:
+        default=200,
+        help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
    )
 
     parser.add_argument(
@@ -295,14 +355,14 @@ def main():
     parser.add_argument(
         "--gamma",
         type=float,
-        default=0
-        help="The value for the discount factor to use. (default: 0
+        default=1.0,
+        help="The value for the discount factor to use. (default: 1.0)",
     )
     parser.add_argument(
         "--epsilon",
         type=float,
-        default=0.
-        help="The value for the epsilon-greedy policy to use. (default: 0.
+        default=0.4,
+        help="The value for the epsilon-greedy policy to use. (default: 0.4)",
     )
 
     ### Environment parameters
@@ -310,6 +370,7 @@ def main():
         "--env",
         type=str,
         default="CliffWalking-v0",
+        choices=["CliffWalking-v0", "FrozenLake-v1", "Taxi-v3"],
         help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
     parser.add_argument(
@@ -352,10 +413,12 @@ def main():
         render_mode=args.render_mode,
     )
 
-    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
+    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}_{args.update_type}"
     if args.wandb_run_name_suffix is not None:
         run_name += f"+{args.wandb_run_name_suffix}"
 
+    agent.run_name = run_name
+
     try:
         if args.train:
             # Log to WandB
@@ -375,6 +438,8 @@ def main():
                 max_steps=args.max_steps,
                 update_type=args.update_type,
                 log_wandb=args.wandb_project is not None,
+                save_best=True,
+                save_best_dir=args.save_dir,
             )
             if not args.no_save:
                 agent.save_policy(
```
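For reference, here is a minimal standalone sketch of the action-selection behaviour that the new `choose_action` arguments introduce (`greedy` for evaluation, `epsilon_override` for the live demo). The names below are illustrative rather than the repository's exact code; the only assumption is a stochastic policy table `Pi` of shape `(n_states, n_actions)`.

```python
import numpy as np

def choose_action_sketch(Pi, state, epsilon_override=None, greedy=False):
    # Illustrative only: mirrors the three paths of the updated choose_action.
    n_actions = Pi.shape[1]
    greedy_action = int(np.argmax(Pi[state]))
    if greedy:
        # Evaluation path: always take the policy's highest-probability action.
        return greedy_action
    if epsilon_override is None:
        # Training path: sample directly from the stored stochastic policy.
        return int(np.random.choice(n_actions, p=Pi[state]))
    # Demo path: with probability epsilon_override take a uniformly random action
    # (which may coincide with the greedy one), otherwise act greedily.
    if np.random.random() < epsilon_override:
        return int(np.random.randint(n_actions))
    return greedy_action
```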
README.md
CHANGED

````diff
@@ -8,28 +8,43 @@ Evolution of Reinforcement Learning methods from pure Dynamic Programming-based
 
 - Python 3
 - Gymnasium: <https://pypi.org/project/gymnasium/>
-- WandB: <https://pypi.org/project/wandb/>
-- Gradio: <https://pypi.org/project/gradio/>
+- WandB: <https://pypi.org/project/wandb/> (for logging)
+- Gradio: <https://pypi.org/project/gradio/> (for demo web app)
 
 ## Interactive Demo
 
 TODO
 
-##
+## 2. Agents
+
+### Dynamic-Programming Agent
 
 TODO
 
+**DP Usage:**
 
 ```bash
 TODO
 ```
 
+### Monte-Carlo Agent
+
+This is the implementation of an On-Policy Monte-Carlo agent to solve several toy problems from the OpenAI Gymnasium.
+
+The agent starts with a randomly initialized epsilon-greedy policy and uses either the first-visit or every-visit Monte-Carlo update method to learn the optimal policy. Training is performed using a soft (epsilon-greedy) policy and testing uses the resulting greedy policy.
+
+Off-policy methods using importance sampling are not implemented for this project.
 
+Parameter testing results:
 
+- `run_tests_MC_CliffWalking-v0.sh` (n_train_episodes=2500 and max_steps=200)
+  - Best Update Type: first_visit
+  - Best Gamma: 1.0
+  - Best Epsilon: 0.4
+- `run_tests_MC_FrozenLake-v1.sh` (n_train_episodes=10000 and max_steps=200)
+  - Best Update Type: first_visit
+  - Best Gamma: 1.0
+  - Best Epsilon: 0.5 (testing) and 0.2 (training)
 
 ```bash
 # Training: Policy will be saved as a `.npy` file.
@@ -39,7 +54,7 @@ python3 MonteCarloAgent.py --train
 python3 MonteCarloAgent.py --test policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy --render_mode human
 ```
 
+**MC Usage**
 
 ```bash
 usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--update_type {first_visit,every_visit}]
@@ -76,56 +91,3 @@ options:
   --wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX
                         WandB run name suffix for logging. (default: None)
 ```
-
-## Presentation Guide
-
-1. Title Slide: list the title of your talk along with your name
-
-2. Test Questions Slide: provide three questions relevant to your subject
-
-   - short answers should suffice
-   - somewhere during your talk provide the answers, but do not emphasize them
-
-3. Presenter’s Slides: let others get to know you
-
-   - provide a little information about yourself, your degree program and your advisor
-   - describe your interests and goals; show a map and picture(s) of your hometown
-   - as examples, students frequently like to mention their pets, their travels, their interests in music and food, even their favorite movies, you name it
-
-4. Outline Slide: provide a bulleted outline of the rest of your talk
-
-5. Overview Slide: list important definitions and provide a brief mention of applications
-
-6. History Slide: discuss major contributors, interesting stories and main developments
-
-7. Algorithms Slides: describe basic procedures and methodological comparisons
-
-   - this should be the main part of your talk
-   - discuss techniques from the most basic to the state-of-the-art
-   - use examples and figures whenever possible
-
-8. Applications Slides: educate the class about amenable problems of interest to you
-
-   - don’t get bogged down in too much minutiae
-   - once again use examples and figures whenever possible
-
-9. Implementations Slides: discuss the results of your coding work (if any)
-
-   - compare and contrast the algorithms you implemented
-   - make effective use of table and charts
-
-10. Open Issues Slide: enumerate and discuss a few open questions
-
-11. References Slide: provide a handful of key citations
-
-12. Discussion Slide: solicit questions from the class
-
-    - this slide may have only a few bullets – it may even be left blank
-    - this is a good opportunity for other students to add to the discussion
-    - be ready to prompt some questions if there is silence
-    - remember not to repeat answers to your test questions
-
-13. Test Questions Slide Revisited: show again your original test questions slide
-
-    - students may now complete their answer sheets and hand them to you
-    - Ashley will supervise as we applaud your excellent presentation!
````
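The README text above describes the first-visit and every-visit Monte-Carlo updates only in prose. As a reference point, here is a minimal first-visit Monte-Carlo control update sketch; the `Q`, return-sum/count tables, and policy table `Pi` are hypothetical names, not the repository's `update_first_visit` implementation.

```python
import numpy as np

def first_visit_update(episode_hist, Q, R_sum, R_cnt, Pi, gamma, epsilon):
    """Sketch of one first-visit Monte-Carlo control update.

    episode_hist is a list of (state, action, reward) tuples, as produced by an
    episode generator. Q, R_sum, R_cnt and Pi are (n_states, n_actions) arrays.
    """
    n_actions = Q.shape[1]
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return G.
    for t in reversed(range(len(episode_hist))):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # First-visit rule: update only if (state, action) does not occur earlier.
        if any((s, a) == (state, action) for s, a, _ in episode_hist[:t]):
            continue
        R_sum[state, action] += G
        R_cnt[state, action] += 1
        Q[state, action] = R_sum[state, action] / R_cnt[state, action]
        # Re-derive the epsilon-greedy policy for this state from Q.
        best = int(np.argmax(Q[state]))
        Pi[state] = epsilon / n_actions
        Pi[state, best] += 1.0 - epsilon
```

The every-visit variant simply drops the first-visit check and averages returns over every occurrence of a state-action pair.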
demo.py
CHANGED

```diff
@@ -31,6 +31,12 @@ action_map = {
         2: "down",
         3: "left",
     },
+    "FrozenLake-v1": {
+        0: "left",
+        1: "down",
+        2: "right",
+        3: "up",
+    },
 }
 
 
@@ -45,6 +51,12 @@ live_render_fps = 5
 live_epsilon = 0.0
 live_paused = True
 live_steps_forward = None
+should_reset = False
+
+
+# def reset():
+#     global should_reset
+#     should_reset = True
 
 
 def change_render_fps(x):
@@ -77,7 +89,7 @@ def onclick_btn_forward():
 
 
 def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
-    global live_render_fps, live_epsilon, live_paused, live_steps_forward
+    global live_render_fps, live_epsilon, live_paused, live_steps_forward, should_reset
     live_render_fps = render_fps
     live_epsilon = epsilon
     live_steps_forward = None
@@ -124,7 +136,7 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
     for episode in range(n_test_episodes):
         for step, (episode_hist, solved, rgb_array) in enumerate(
             agent.generate_episode(
-                max_steps=max_steps, render=True,
+                max_steps=max_steps, render=True, epsilon_override=live_epsilon
             )
         ):
             _, _, last_reward = (
@@ -133,6 +145,30 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
             state, action, reward = episode_hist[-1]
             curr_policy = agent.Pi[state]
 
+            rgb_array_height, rgb_array_width = 128, 512
+            rgb_array = cv2.resize(
+                rgb_array,
+                (
+                    int(rgb_array.shape[1] / rgb_array.shape[0] * rgb_array_height),
+                    rgb_array_height,
+                ),
+                interpolation=cv2.INTER_AREA,
+            )
+            rgb_array_new = np.pad(
+                rgb_array,
+                (
+                    (0, 0),
+                    (
+                        (rgb_array_width - rgb_array.shape[1]) // 2,
+                        (rgb_array_width - rgb_array.shape[1]) // 2,
+                    ),
+                    (0, 0),
+                ),
+                "constant",
+            )
+
+            rgb_array = np.uint8(rgb_array_new)
+
             viz_w = 512
             viz_h = viz_w // len(curr_policy)
             policy_viz = np.zeros((viz_h, viz_w))
@@ -189,8 +225,6 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
                 f"Episode: {ep_str(episode + 1)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (epsilon: {live_epsilon:.2f}) (frame time: {1 / render_fps:.2f}s)"
             )
 
-            # Live-update the agent's epsilon value for demonstration purposes
-            agent.epsilon = live_epsilon
             yield agent_type, env_name, rgb_array, policy_viz, ep_str(
                 episode + 1
             ), ep_str(episodes_solved), step_str(
@@ -214,6 +248,24 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
                     step
                 ), state, action, last_reward, "Paused..."
                 time.sleep(1 / live_render_fps)
+                # if should_reset is True:
+                #     break
+
+            # if should_reset is True:
+            #     should_reset = False
+            #     return (
+            #         agent_type,
+            #         env_name,
+            #         rgb_array,
+            #         policy_viz,
+            #         ep_str(episode + 1),
+            #         ep_str(episodes_solved),
+            #         step_str(step),
+            #         state,
+            #         action,
+            #         last_reward,
+            #         "Resetting...",
+            #     )
 
             if solved:
                 episodes_solved += 1
@@ -318,6 +370,8 @@ with gr.Blocks(title="CS581 Demo") as demo:
         label="Status Message",
     )
 
+    # input_policy.change(fn=reset)
+
     btn_run.click(
         fn=run,
         inputs=[
@@ -342,5 +396,5 @@ with gr.Blocks(title="CS581 Demo") as demo:
         ],
     )
 
-demo.queue(concurrency_count=
+demo.queue(concurrency_count=3)
 demo.launch()
```
policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy
DELETED
Binary file (1.66 kB)

policies/MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy
ADDED
Binary file (1.66 kB)
run_tests_MC_CliffWalking-v0.py
ADDED

```diff
@@ -0,0 +1,34 @@
+import os
+import multiprocessing
+import random
+
+wandb_project = "cs581"
+
+env = "CliffWalking-v0"
+n_train_episodes = 2500
+max_steps = 200
+
+num_tests = 10
+
+vals_update_type = [
+    "first_visit"
+]  # Every visit takes too long due to this environment's reward structure
+vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+vals_gamma = [1.0, 0.98, 0.96, 0.94]
+
+
+def run_test(args):
+    os.system(
+        f"python3 MonteCarloAgent.py --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
+    )
+
+
+with multiprocessing.Pool(16) as p:
+    tests = []
+    for update_type in vals_update_type:
+        for gamma in vals_gamma:
+            for eps in vals_epsilon:
+                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+    random.shuffle(tests)
+
+    p.map(run_test, tests)
```
run_tests.py → run_tests_MC_FrozenLake-v1.py
RENAMED

```diff
@@ -2,24 +2,30 @@ import os
 import multiprocessing
 import random
 
+wandb_project = "cs581"
+
+env = "FrozenLake-v1"
+n_train_episodes = 5000
+max_steps = 200
+
 num_tests = 10
 
-vals_gamma = [1.0, 0.
+vals_update_type = ["first_visit", "every_visit"]
+vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+vals_gamma = [1.0, 0.98, 0.96, 0.94]
 
 
 def run_test(args):
     os.system(
-        f"python3 MonteCarloAgent.py --train
+        f"python3 MonteCarloAgent.py --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
     )
 
 
-with multiprocessing.Pool(
+with multiprocessing.Pool(16) as p:
     tests = []
-    for update_type in
+    for update_type in vals_update_type:
         for gamma in vals_gamma:
-            for eps in
+            for eps in vals_epsilon:
                 tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
     random.shuffle(tests)
 
```
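Both test scripts build the same kind of hyperparameter grid with nested loops before shuffling it and dispatching the runs to a process pool. An equivalent construction with itertools.product, shown here purely for illustration and not part of this commit, looks like:

```python
import itertools
import random

num_tests = 10
vals_update_type = ["first_visit", "every_visit"]
vals_gamma = [1.0, 0.98, 0.96, 0.94]
vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]

# One (gamma, epsilon, update_type, repeat_index) tuple per training run.
tests = [
    (gamma, eps, update_type, i)
    for update_type, gamma, eps, i in itertools.product(
        vals_update_type, vals_gamma, vals_epsilon, range(num_tests)
    )
]
random.shuffle(tests)  # spread repeated configurations across the pool's workers
```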