Andrei Cozma committed
Commit 1663f39 · 1 Parent(s): 3c4eba9
MonteCarloAgent.py CHANGED
@@ -3,18 +3,36 @@ import numpy as np
 import gymnasium as gym
 from tqdm import tqdm
 import argparse
-
+from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 import wandb
 
 
 class MonteCarloAgent:
-    def __init__(self, env_name="CliffWalking-v0", gamma=0.99, epsilon=0.1, **kwargs):
+    def __init__(
+        self,
+        env_name="CliffWalking-v0",
+        gamma=0.99,
+        epsilon=0.1,
+        run_name=None,
+        **kwargs,
+    ):
         print("=" * 80)
         print(f"# MonteCarloAgent - {env_name}")
         print(f"- epsilon: {epsilon}")
         print(f"- gamma: {gamma}")
-        self.env = gym.make(env_name, **kwargs)
+        print(f"- run_name: {run_name}")
+        self.run_name = run_name
+        self.env_name = env_name
         self.epsilon, self.gamma = epsilon, gamma
+
+        self.env_kwargs = kwargs
+        if self.env_name == "FrozenLake-v1":
+            self.env_kwargs["desc"] = None
+            self.env_kwargs["map_name"] = "4x4"
+            self.env_kwargs["is_slippery"] = "False"
+
+        self.env = gym.make(self.env_name, **self.env_kwargs)
+
         self.n_states, self.n_actions = (
             self.env.observation_space.n,
             self.env.action_space.n,
@@ -43,17 +61,21 @@ class MonteCarloAgent:
         print(self.Pi)
         print("=" * 80)
 
-    def choose_action(self, state, override_epsilon=False, **kwargs):
+    def choose_action(self, state, epsilon_override=None, greedy=False, **kwargs):
         # Sample an action from the policy.
         # The override_epsilon argument allows forcing the use of a possibly new self.epsilon value than the one used during training.
         # The ability to override was mostly added for testing purposes and for the demo.
+        greedy_action = np.argmax(self.Pi[state])
 
-        if override_epsilon is False:
+        if greedy:
+            return greedy_action
+
+        if epsilon_override is None:
             return np.random.choice(self.n_actions, p=self.Pi[state])
 
         return np.random.choice(
-            [np.argmax(self.Pi[state]), np.random.randint(self.n_actions)],
-            p=[1 - self.epsilon, self.epsilon],
+            [greedy_action, np.random.randint(self.n_actions)],
+            p=[1 - epsilon_override, epsilon_override],
         )
 
     def generate_episode(self, max_steps=500, render=False, **kwargs):
@@ -61,24 +83,36 @@ class MonteCarloAgent:
         episode_hist, solved, rgb_array = [], False, None
 
         # Generate an episode following the current policy
-        for _ in range(max_steps):
+        while len(episode_hist) < max_steps:
             rgb_array = self.env.render() if render else None
+
             # Sample an action from the policy
             action = self.choose_action(state, **kwargs)
             # Take the action and observe the reward and next state
             next_state, reward, done, truncated, _ = self.env.step(action)
+
             # Keeping track of the trajectory
             episode_hist.append((state, action, reward))
-            state = next_state
-
             yield episode_hist, solved, rgb_array
 
-            # This is where the agent got to the goal.
-            # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
-            if done or truncated:
+            # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
+            if done and (
+                self.env_name == "CliffWalking-v0" or self.env_name == "Taxi-v3"
+            ):
                 solved = True
                 break
 
+            # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
+            # We consider the episode solved when the agent reaches the goal (done == True and reward == 1)
+            if done and self.env_name == "FrozenLake-v1" and reward == 1:
+                solved = True
+                break
+
+            if done or truncated:
+                break
+
+            state = next_state
+
         rgb_array = self.env.render() if render else None
 
         yield episode_hist, solved, rgb_array
@@ -135,14 +169,24 @@ class MonteCarloAgent:
         test_every=100,
         update_type="first_visit",
         log_wandb=False,
+        save_best=True,
+        save_best_dir=None,
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
 
-        train_running_success_rate, test_success_rate = 0.0, 0.0
+        (
+            train_running_success_rate,
+            test_success_rate,
+            test_running_success_rate,
+            avg_ep_len,
+        ) = (0.0, 0.0, 0.0, 0.0)
+
         stats = {
             "train_running_success_rate": train_running_success_rate,
+            "test_running_success_rate": test_running_success_rate,
             "test_success_rate": test_success_rate,
+            "avg_ep_len": avg_ep_len,
         }
 
         update_func = getattr(self, f"update_{update_type}")
@@ -157,36 +201,52 @@ class MonteCarloAgent:
             episode_hist, solved, _ = self.run_episode(**kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
+
             train_running_success_rate = (
                 0.99 * train_running_success_rate + 0.01 * solved
             )
+            avg_ep_len = 0.99 * avg_ep_len + 0.01 * len(episode_hist)
+
             update_func(episode_hist)
 
             stats = {
                 "train_running_success_rate": train_running_success_rate,
+                "test_running_success_rate": test_running_success_rate,
                 "test_success_rate": test_success_rate,
+                "avg_ep_len": avg_ep_len,
                 "total_reward": total_reward,
                 "avg_reward": avg_reward,
             }
             tqrange.set_postfix(stats)
 
+            # Test the agent every test_every episodes with the greedy policy (by default)
             if e % test_every == 0:
                 test_success_rate = self.test(verbose=False, **kwargs)
+                if save_best and test_success_rate > 0.9:
+                    if self.run_name is None:
+                        print(f"Warning: run_name is None, not saving best policy")
+                    else:
+                        self.save_policy(self.run_name, save_best_dir)
+
                 if log_wandb:
                     self.wandb_log_img(episode=e)
 
+                test_running_success_rate = (
+                    0.99 * test_running_success_rate + 0.01 * test_success_rate
+                )
+                stats["test_running_success_rate"] = test_running_success_rate
                 stats["test_success_rate"] = test_success_rate
                 tqrange.set_postfix(stats)
 
             if log_wandb:
                 wandb.log(stats)
 
-    def test(self, n_test_episodes=100, verbose=True, **kwargs):
+    def test(self, n_test_episodes=100, verbose=True, greedy=True, **kwargs):
         if verbose:
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(**kwargs)
+            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"
@@ -247,8 +307,8 @@ def main():
     parser.add_argument(
         "--n_train_episodes",
         type=int,
-        default=2000,
-        help="The number of episodes to train for. (default: 2000)",
+        default=2500,
+        help="The number of episodes to train for. (default: 2500)",
     )
     parser.add_argument(
         "--n_test_episodes",
@@ -266,8 +326,8 @@ def main():
     parser.add_argument(
         "--max_steps",
        type=int,
-        default=500,
-        help="The maximum number of steps per episode before the episode is forced to end. (default: 500)",
+        default=200,
+        help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
     )
 
     parser.add_argument(
@@ -295,14 +355,14 @@ def main():
     parser.add_argument(
         "--gamma",
         type=float,
-        default=0.99,
-        help="The value for the discount factor to use. (default: 0.99)",
+        default=1.0,
+        help="The value for the discount factor to use. (default: 1.0)",
     )
     parser.add_argument(
         "--epsilon",
         type=float,
-        default=0.1,
-        help="The value for the epsilon-greedy policy to use. (default: 0.1)",
+        default=0.4,
+        help="The value for the epsilon-greedy policy to use. (default: 0.4)",
     )
 
     ### Environment parameters
@@ -310,6 +370,7 @@ def main():
         "--env",
         type=str,
         default="CliffWalking-v0",
+        choices=["CliffWalking-v0", "FrozenLake-v1", "Taxi-v3"],
        help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
     parser.add_argument(
@@ -352,10 +413,12 @@ def main():
         render_mode=args.render_mode,
     )
 
-    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
+    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}_{args.update_type}"
     if args.wandb_run_name_suffix is not None:
         run_name += f"+{args.wandb_run_name_suffix}"
 
+    agent.run_name = run_name
+
     try:
         if args.train:
             # Log to WandB
@@ -375,6 +438,8 @@ def main():
             max_steps=args.max_steps,
             update_type=args.update_type,
             log_wandb=args.wandb_project is not None,
+            save_best=True,
+            save_best_dir=args.save_dir,
         )
         if not args.no_save:
             agent.save_policy(
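For readers skimming the diff, the reworked action selection can be summarized in a small standalone sketch. It mirrors the three paths of `choose_action` above (greedy for testing, policy sampling for training, and a live `epsilon_override` for the demo), but the function name and the `Pi` argument are illustrative and not part of the repository API:

```python
import numpy as np


def choose_action_sketch(Pi, state, epsilon_override=None, greedy=False):
    # Illustrative sketch only: Pi is assumed to be a (n_states, n_actions)
    # stochastic policy matrix, as in MonteCarloAgent. Not the repository code.
    n_actions = Pi.shape[1]
    greedy_action = np.argmax(Pi[state])
    if greedy:
        # Testing path: always take the current best action.
        return greedy_action
    if epsilon_override is None:
        # Training path: sample directly from the soft policy distribution.
        return np.random.choice(n_actions, p=Pi[state])
    # Demo path: choose between the greedy and a random action with a
    # live-adjustable epsilon.
    return np.random.choice(
        [greedy_action, np.random.randint(n_actions)],
        p=[1 - epsilon_override, epsilon_override],
    )
```

These are the same three paths exercised by `test(greedy=True)` during training and by the demo's `epsilon_override=live_epsilon` call.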
README.md CHANGED
@@ -8,28 +8,43 @@ Evolution of Reinforcement Learning methods from pure Dynamic Programming-based
 
 - Python 3
 - Gymnasium: <https://pypi.org/project/gymnasium/>
-- WandB: <https://pypi.org/project/wandb/>
-- Gradio: <https://pypi.org/project/gradio/>
+- WandB: <https://pypi.org/project/wandb/> (for logging)
+- Gradio: <https://pypi.org/project/gradio/> (for demo web app)
 
 ## Interactive Demo
 
 TODO
 
-## Dynamic-Programming Agent
+## 2. Agents
+
+### Dynamic-Programming Agent
 
 TODO
 
-### Usage
+**DP Usage:**
 
 ```bash
 TODO
 ```
 
-## Monte-Carlo Agent
+### Monte-Carlo Agent
+
+This is the implementation of an On-Policy Monte-Carlo agent to solve several toy problems from the OpenAI Gymnasium.
+
+The agent starts with a randomly initialized epsilon-greedy policy and uses either the first-visit or every-visit Monte-Carlo update method to learn the optimal policy. Training is performed using a soft (epsilon-greedy) policy and testing uses the resulting greedy policy.
+
+Off-policy methods using importance sampling are not implemented for this project.
 
-The agent starts with a randomly initialized epsilon-greedy policy and uses either the first-visit or every-visit Monte-Carlo update method to learn the optimal policy.
+Parameter testing results:
 
-Primarily tested on the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment.
+- `run_tests_MC_CliffWalking-v0.sh` (n_train_episodes=2500 and max_steps=200)
+  - Best Update Type: first_visit
+  - Best Gamma: 1.0
+  - Best Epsilon: 0.4
+- `run_tests_MC_FrozenLake-v1.sh` (n_train_episodes=10000 and max_steps=200)
+  - Best Update Type: first_visit
+  - Best Gamma: 1.0
+  - Best Epsilon: 0.5 (testing) and 0.2 (training)
 
 ```bash
 # Training: Policy will be saved as a `.npy` file.
@@ -39,7 +54,7 @@ python3 MonteCarloAgent.py --train
 python3 MonteCarloAgent.py --test policy_mc_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy --render_mode human
 ```
 
-### Usage
+**MC Usage**
 
 ```bash
 usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--update_type {first_visit,every_visit}]
@@ -76,56 +91,3 @@ options:
   --wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX
                         WandB run name suffix for logging. (default: None)
 ```
-
-## Presentation Guide
-
-1. Title Slide: list the title of your talk along with your name
-
-2. Test Questions Slide: provide three questions relevant to your subject
-
-   - short answers should suffice
-   - somewhere during your talk provide the answers, but do not emphasize them
-
-3. Presenter’s Slides: let others get to know you
-
-   - provide a little information about yourself, your degree program and your advisor
-   - describe your interests and goals; show a map and picture(s) of your hometown
-   - as examples, students frequently like to mention their pets, their travels, their interests in music and food, even their favorite movies, you name it
-
-4. Outline Slide: provide a bulleted outline of the rest of your talk
-
-5. Overview Slide: list important definitions and provide a brief mention of applications
-
-6. History Slide: discuss major contributors, interesting stories and main developments
-
-7. Algorithms Slides: describe basic procedures and methodological comparisons
-
-   - this should be the main part of your talk
-   - discuss techniques from the most basic to the state-of-the-art
-   - use examples and figures whenever possible
-
-8. Applications Slides: educate the class about amenable problems of interest to you
-
-   - don’t get bogged down in too much minutiae
-   - once again use examples and figures whenever possible
-
-9. Implementations Slides: discuss the results of your coding work (if any)
-
-   - compare and contrast the algorithms you implemented
-   - make effective use of table and charts
-
-10. Open Issues Slide: enumerate and discuss a few open questions
-
-11. References Slide: provide a handful of key citations
-
-12. Discussion Slide: solicit questions from the class
-
-    - this slide may have only a few bullets – it may even be left blank
-    - this is a good opportunity for other students to add to the discussion
-    - be ready to prompt some questions if there is silence
-    - remember not to repeat answers to your test questions
-
-13. Test Questions Slide Revisited: show again your original test questions slide
-
-    - students may now complete their answer sheets and hand them to you
-    - Ashley will supervise as we applaud your excellent presentation!
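The first-visit/every-visit updates mentioned in the README are the standard on-policy Monte-Carlo control updates. The `update_first_visit` method itself is not part of this diff, so the following is only a minimal sketch of the idea, with an assumed `Q` table, constant-step averaging, and an epsilon-greedy policy table `Pi`; the repository's implementation may differ in detail:

```python
import numpy as np


def update_first_visit_sketch(episode_hist, Q, Pi, gamma=1.0, epsilon=0.4, alpha=0.1):
    # Illustrative first-visit Monte-Carlo control update.
    # episode_hist is a list of (state, action, reward) tuples, as produced by
    # generate_episode; Q, Pi, alpha and the averaging scheme are assumptions.
    n_actions = Q.shape[1]
    G = 0.0
    # Walk the trajectory backwards, accumulating the discounted return.
    for t in range(len(episode_hist) - 1, -1, -1):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # First-visit: only update on the earliest occurrence of (state, action).
        if any((s, a) == (state, action) for s, a, _ in episode_hist[:t]):
            continue
        # Move the action value toward the observed return.
        Q[state, action] += alpha * (G - Q[state, action])
        # Keep the policy epsilon-greedy with respect to the updated Q values.
        best = np.argmax(Q[state])
        Pi[state] = epsilon / n_actions
        Pi[state, best] += 1.0 - epsilon
```

An every-visit variant simply drops the first-visit check and updates on every occurrence of a state-action pair.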
demo.py CHANGED
@@ -31,6 +31,12 @@ action_map = {
         2: "down",
         3: "left",
     },
+    "FrozenLake-v1": {
+        0: "left",
+        1: "down",
+        2: "right",
+        3: "up",
+    },
 }
 
 
@@ -45,6 +51,12 @@ live_render_fps = 5
 live_epsilon = 0.0
 live_paused = True
 live_steps_forward = None
+should_reset = False
+
+
+# def reset():
+#     global should_reset
+#     should_reset = True
 
 
 def change_render_fps(x):
@@ -77,7 +89,7 @@ def onclick_btn_forward():
 
 
 def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
-    global live_render_fps, live_epsilon, live_paused, live_steps_forward
+    global live_render_fps, live_epsilon, live_paused, live_steps_forward, should_reset
     live_render_fps = render_fps
     live_epsilon = epsilon
     live_steps_forward = None
@@ -124,7 +136,7 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
     for episode in range(n_test_episodes):
         for step, (episode_hist, solved, rgb_array) in enumerate(
             agent.generate_episode(
-                max_steps=max_steps, render=True, override_epsilon=True
+                max_steps=max_steps, render=True, epsilon_override=live_epsilon
             )
         ):
             _, _, last_reward = (
@@ -133,6 +145,30 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
             state, action, reward = episode_hist[-1]
             curr_policy = agent.Pi[state]
 
+            rgb_array_height, rgb_array_width = 128, 512
+            rgb_array = cv2.resize(
+                rgb_array,
+                (
+                    int(rgb_array.shape[1] / rgb_array.shape[0] * rgb_array_height),
+                    rgb_array_height,
+                ),
+                interpolation=cv2.INTER_AREA,
+            )
+            rgb_array_new = np.pad(
+                rgb_array,
+                (
+                    (0, 0),
+                    (
+                        (rgb_array_width - rgb_array.shape[1]) // 2,
+                        (rgb_array_width - rgb_array.shape[1]) // 2,
+                    ),
+                    (0, 0),
+                ),
+                "constant",
+            )
+
+            rgb_array = np.uint8(rgb_array_new)
+
             viz_w = 512
             viz_h = viz_w // len(curr_policy)
             policy_viz = np.zeros((viz_h, viz_w))
@@ -189,8 +225,6 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
                 f"Episode: {ep_str(episode + 1)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (epsilon: {live_epsilon:.2f}) (frame time: {1 / render_fps:.2f}s)"
             )
 
-            # Live-update the agent's epsilon value for demonstration purposes
-            agent.epsilon = live_epsilon
             yield agent_type, env_name, rgb_array, policy_viz, ep_str(
                 episode + 1
             ), ep_str(episodes_solved), step_str(
@@ -214,6 +248,24 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
                     step
                 ), state, action, last_reward, "Paused..."
                 time.sleep(1 / live_render_fps)
+                # if should_reset is True:
+                #     break
+
+            # if should_reset is True:
+            #     should_reset = False
+            #     return (
+            #         agent_type,
+            #         env_name,
+            #         rgb_array,
+            #         policy_viz,
+            #         ep_str(episode + 1),
+            #         ep_str(episodes_solved),
+            #         step_str(step),
+            #         state,
+            #         action,
+            #         last_reward,
+            #         "Resetting...",
+            #     )
 
         if solved:
             episodes_solved += 1
@@ -318,6 +370,8 @@ with gr.Blocks(title="CS581 Demo") as demo:
         label="Status Message",
     )
 
+    # input_policy.change(fn=reset)
+
     btn_run.click(
         fn=run,
         inputs=[
@@ -342,5 +396,5 @@ with gr.Blocks(title="CS581 Demo") as demo:
         ],
     )
 
-    demo.queue(concurrency_count=2)
+    demo.queue(concurrency_count=3)
     demo.launch()
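The frame-normalization block added above exists so that every environment's render fits the same display slot in the Gradio UI. A self-contained sketch of that letterboxing step is shown below; the helper name and defaults are illustrative, not part of demo.py:

```python
import cv2
import numpy as np


def letterbox_frame(rgb_array, target_height=128, target_width=512):
    # Resize a rendered frame to a fixed height and pad its width, mirroring
    # the frame-normalization block added to demo.py in this commit.
    # Scale to the target height while preserving the aspect ratio.
    new_width = int(rgb_array.shape[1] / rgb_array.shape[0] * target_height)
    resized = cv2.resize(
        rgb_array, (new_width, target_height), interpolation=cv2.INTER_AREA
    )
    # Pad both sides equally so every environment renders at roughly the same width.
    pad = (target_width - resized.shape[1]) // 2
    padded = np.pad(resized, ((0, 0), (pad, pad), (0, 0)), "constant")
    return np.uint8(padded)
```

In the demo loop itself, the live epsilon slider is now passed straight into `agent.generate_episode(..., epsilon_override=live_epsilon)` instead of mutating `agent.epsilon` between frames.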
policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.1.npy DELETED
Binary file (1.66 kB)
 
policies/MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy ADDED
Binary file (1.66 kB)
 
run_tests_MC_CliffWalking-v0.py ADDED
@@ -0,0 +1,34 @@
+import os
+import multiprocessing
+import random
+
+wandb_project = "cs581"
+
+env = "CliffWalking-v0"
+n_train_episodes = 2500
+max_steps = 200
+
+num_tests = 10
+
+vals_update_type = [
+    "first_visit"
+]  # Every visit takes too long due to this environment's reward structure
+vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+vals_gamma = [1.0, 0.98, 0.96, 0.94]
+
+
+def run_test(args):
+    os.system(
+        f"python3 MonteCarloAgent.py --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
+    )
+
+
+with multiprocessing.Pool(16) as p:
+    tests = []
+    for update_type in vals_update_type:
+        for gamma in vals_gamma:
+            for eps in vals_epsilon:
+                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+    random.shuffle(tests)
+
+    p.map(run_test, tests)
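For reference, this sweep enqueues 1 update type x 4 gamma values x 5 epsilon values x 10 repeats = 200 runs and executes them 16 at a time. A single expanded command looks like the following sketch, with values substituted from the constants above:

```python
# Example of one command the sweep generates (parameters chosen for illustration).
gamma, epsilon, update_type, i = 1.0, 0.4, "first_visit", 0
cmd = (
    f"python3 MonteCarloAgent.py --train --n_train_episodes 2500 --max_steps 200 "
    f"--env CliffWalking-v0 --gamma {gamma} --epsilon {epsilon} "
    f"--update_type {update_type} --wandb_project cs581 "
    f"--wandb_run_name_suffix {i} --no_save"
)
print(cmd)
```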
run_tests.py → run_tests_MC_FrozenLake-v1.py RENAMED
@@ -2,24 +2,30 @@ import os
 import multiprocessing
 import random
 
+wandb_project = "cs581"
+
+env = "FrozenLake-v1"
+n_train_episodes = 5000
+max_steps = 200
+
 num_tests = 10
 
-update_types = ["first_visit", "every_visit"]
-vals_eps = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.99, 0.98, 0.97, 0.96, 0.95]
+vals_update_type = ["first_visit", "every_visit"]
+vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+vals_gamma = [1.0, 0.98, 0.96, 0.94]
 
 
 def run_test(args):
     os.system(
-        f"python3 MonteCarloAgent.py --train --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project cs581 --wandb_job_type params --wandb_run_name_suffix {args[3]} --no_save"
+        f"python3 MonteCarloAgent.py --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
     )
 
 
-with multiprocessing.Pool(12) as p:
+with multiprocessing.Pool(16) as p:
     tests = []
-    for update_type in update_types:
+    for update_type in vals_update_type:
         for gamma in vals_gamma:
-            for eps in vals_eps:
+            for eps in vals_epsilon:
                 tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
     random.shuffle(tests)
 