Andrei Cozma committed
Commit 8ceccef · 1 Parent(s): b8a5776
MonteCarloAgent.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 import gymnasium as gym
 from tqdm import tqdm
@@ -8,6 +9,7 @@ import wandb

 class MonteCarloAgent:
     def __init__(self, env_name="CliffWalking-v0", gamma=0.99, epsilon=0.1, **kwargs):
+        print("=" * 80)
         print(f"# MonteCarloAgent - {env_name}")
         print(f"- epsilon: {epsilon}")
         print(f"- gamma: {gamma}")
@@ -45,25 +47,68 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])

-    def run_episode(self, max_steps=500, **kwargs):
+    # def run_episode(self, max_steps=500, render=False, **kwargs):
+    #     state, _ = self.env.reset()
+    #     episode_hist, solved, rgb_array = [], False, None
+
+    #     # Generate an episode following the current policy
+    #     for _ in range(max_steps):
+    #         rgb_array = self.env.render() if render else None
+    #         # Sample an action from the policy
+    #         action = self.choose_action(state)
+    #         # Take the action and observe the reward and next state
+    #         next_state, reward, done, truncated, _ = self.env.step(action)
+    #         # Keeping track of the trajectory
+    #         episode_hist.append((state, action, reward))
+    #         state = next_state
+
+    #         # This is where the agent got to the goal.
+    #         # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
+    #         if done:
+    #             solved = True
+    #             break
+    #         if truncated:
+    #             break
+
+    #     rgb_array = self.env.render() if render else None
+
+    #     return episode_hist, solved, rgb_array
+
+    def generate_episode(self, max_steps=500, render=False, **kwargs):
         state, _ = self.env.reset()
-        episode_hist = []
-        finished = False
+        episode_hist, solved, rgb_array = [], False, None
+
         # Generate an episode following the current policy
         for _ in range(max_steps):
+            rgb_array = self.env.render() if render else None
             # Sample an action from the policy
             action = self.choose_action(state)
             # Take the action and observe the reward and next state
-            next_state, reward, finished, truncated, _ = self.env.step(action)
+            next_state, reward, done, truncated, _ = self.env.step(action)
             # Keeping track of the trajectory
             episode_hist.append((state, action, reward))
             state = next_state
+
+            yield episode_hist, solved, rgb_array
+
             # This is where the agent got to the goal.
             # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
-            if finished or truncated:
+            if done or truncated:
+                solved = True
                 break

-        return episode_hist, finished
+        rgb_array = self.env.render() if render else None
+
+        yield episode_hist, solved, rgb_array
+
+    def run_episode(self, max_steps=500, render=False, **kwargs):
+        # Run the generator until the end
+        episode_hist, solved, rgb_array = None, False, None
+        for episode_hist, solved, rgb_array in self.generate_episode(
+            max_steps, render, **kwargs
+        ):
+            pass
+        return episode_hist, solved, rgb_array

     def update_first_visit(self, episode_hist):
         G = 0
@@ -127,11 +172,11 @@ class MonteCarloAgent:
         self.wandb_log_img(episode=None)

         for e in tqrange:
-            episode_hist, finished = self.run_episode(**kwargs)
+            episode_hist, solved, _ = self.run_episode(**kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
             train_running_success_rate = (
-                0.99 * train_running_success_rate + 0.01 * finished
+                0.99 * train_running_success_rate + 0.01 * solved
             )
             update_func(episode_hist)

@@ -159,11 +204,11 @@ class MonteCarloAgent:
         print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, finished = self.run_episode(**kwargs)
-            num_successes += finished
+            _, solved, _ = self.run_episode(**kwargs)
+            num_successes += solved
             if verbose:
-                word = "reached" if finished else "did not reach"
-                emoji = "🏁" if finished else "🚫"
+                word = "reached" if solved else "did not reach"
+                emoji = "🏁" if solved else "🚫"
                 print(
                     f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
                 )
@@ -175,15 +220,18 @@ class MonteCarloAgent:
         )
         return success_rate

-    def save_policy(self, fname="policy.npy"):
-        print(f"Saving policy to {fname}")
+    def save_policy(self, fname="policy.npy", save_dir=None):
+        if save_dir is not None:
+            os.makedirs(save_dir, exist_ok=True)
+            fname = os.path.join(save_dir, fname)
+        print(f"Saving policy to: {fname}")
         np.save(fname, self.Pi)

     def load_policy(self, fname="policy.npy"):
-        print(f"Loading policy from {fname}")
+        print(f"Loading policy from: {fname}")
         self.Pi = np.load(fname)

-    def wandb_log_img(self, episode=None, mask=None):
+    def wandb_log_img(self, episode=None):
         caption_suffix = "Initial" if episode is None else f"After Episode {episode}"
         wandb.log(
             {
@@ -248,6 +296,13 @@ def main():
         help="The type of update to use. (default: first_visit)",
     )

+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        default="policies",
+        help="The directory to save the policy to. (default: policies)",
+    )
+
     parser.add_argument(
         "--no_save",
         action="store_true",
@@ -264,7 +319,7 @@ def main():
     parser.add_argument(
         "--epsilon",
         type=float,
-        default=0.7,
+        default=0.5,
         help="The value for the epsilon-greedy policy to use. (default: 0.1)",
     )

@@ -308,14 +363,14 @@ def main():

     args = parser.parse_args()

-    mca = MonteCarloAgent(
+    agent = MonteCarloAgent(
         args.env,
         gamma=args.gamma,
         epsilon=args.epsilon,
         render_mode=args.render_mode,
     )

-    run_name = f"mc_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
+    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
     if args.wandb_run_name_suffix is not None:
         run_name += f"+{args.wandb_run_name_suffix}"

@@ -331,7 +386,7 @@ def main():
         config=dict(args._get_kwargs()),
     )

-    mca.train(
+    agent.train(
         n_train_episodes=args.n_train_episodes,
         test_every=args.test_every,
         n_test_episodes=args.n_test_episodes,
@@ -340,12 +395,15 @@ def main():
         log_wandb=args.wandb_project is not None,
     )
     if not args.no_save:
-        mca.save_policy(fname=f"policy_{run_name}.npy")
+        agent.save_policy(
+            fname=f"{run_name}.npy",
+            save_dir=args.save_dir,
+        )
     elif args.test is not None:
         if not args.test.endswith(".npy"):
             args.test += ".npy"
-        mca.load_policy(args.test)
-        mca.test(
+        agent.load_policy(args.test)
+        agent.test(
             n_test_episodes=args.n_test_episodes,
             max_steps=args.max_steps,
         )
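
Note: the main change above splits the old run_episode() into a generate_episode() generator, which yields the trajectory so far, a solved flag, and an optional rendered frame after every step, while run_episode() now simply drains that generator. As a hedged illustration only (not part of this commit), the two entry points would be consumed roughly like this, using only the constructor defaults and return values shown in the diff:

from MonteCarloAgent import MonteCarloAgent

agent = MonteCarloAgent("CliffWalking-v0", render_mode="rgb_array")

# Batch use, e.g. inside train()/test(): exhaust the episode in one call.
episode_hist, solved, rgb_array = agent.run_episode(max_steps=500)

# Streaming use, e.g. in the new demo.py: step through the same episode frame by frame.
for episode_hist, solved, rgb_array in agent.generate_episode(max_steps=500, render=True):
    state, action, reward = episode_hist[-1]  # most recent (state, action, reward) triple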
demo.py ADDED
@@ -0,0 +1,162 @@
+import os
+import time
+import numpy as np
+import gradio as gr
+from MonteCarloAgent import MonteCarloAgent
+
+
+# For the dropdown list of policies
+policies_folder = "policies"
+all_policies = [file for file in os.listdir(policies_folder) if file.endswith(".npy")]
+
+# All supported agents
+agent_map = {
+    "MonteCarloAgent": MonteCarloAgent,
+    # TODO: Add DP Agent
+}
+
+# Global variables for the agent and the render fps (to allow changing it on the fly)
+agent = None
+render_fps_val = 5
+
+
+def load_policy(policy_fname):
+    print("Loading...")
+    print(f"- policy_fname: {policy_fname}")
+    global agent
+    policy_path = os.path.join(policies_folder, policy_fname)
+    props = policy_fname.split("_")
+    agent_type, env_name = props[0], props[1]
+
+    agent = agent_map[agent_type](env_name, render_mode="rgb_array")
+    agent.load_policy(policy_path)
+
+    return agent.env.spec.id, agent.__class__.__name__
+
+
+def change_render_fps(x):
+    print("Changing render fps:", x)
+    global render_fps_val
+    render_fps_val = x
+
+
+def run(n_test_episodes, max_steps, render_fps):
+    global agent, render_fps_val
+    render_fps_val = render_fps
+    print("Running...")
+    print(f"- n_test_episodes: {n_test_episodes}")
+    print(f"- max_steps: {max_steps}")
+    print(f"- render_fps: {render_fps_val}")
+
+    while agent is None:
+        print("Waiting for agent to be loaded...")
+        time.sleep(1)
+        yield None, None, None, None, None, None, None, "🚫 ERROR: Please load a policy first!"
+
+    rgb_array = np.random.random((25, 100, 3))
+    episode, step = 0, 0
+    state, action, reward = 0, 0, 0
+    episodes_solved = 0
+
+    def ep_str(episode):
+        return f"{episode + 1} / {n_test_episodes} ({(episode + 1) / n_test_episodes * 100:.2f}%)"
+
+    def step_str(step):
+        return f"{step + 1}"
+
+    for episode in range(n_test_episodes):
+        for step, (episode_hist, solved, rgb_array) in enumerate(
+            agent.generate_episode(max_steps=max_steps, render=True)
+        ):
+            if solved:
+                episodes_solved += 1
+            state, action, reward = episode_hist[-1]
+
+            print(
+                f"Episode: {ep_str(episode)} - step: {step_str} - state: {state} - action: {action} - reward: {reward} (frame time: {1 / render_fps:.2f}s)"
+            )
+
+            time.sleep(1 / render_fps_val)
+            yield rgb_array, ep_str(episode), step_str(step), state, action, reward, ep_str(episodes_solved), "Running..."
+
+    yield rgb_array, ep_str(episode), step_str(step), state, action, reward, ep_str(episodes_solved), "Done!"
+
+
+with gr.Blocks() as demo:
+    # TODO: Add title and description
+
+    with gr.Row():
+        with gr.Column():
+            input_policy = gr.components.Dropdown(
+                label="Policy", choices=all_policies, value=all_policies[0]
+            )
+
+            with gr.Row():
+                out_environment = gr.components.Textbox(label="Environment")
+                out_agent = gr.components.Textbox(label="Agent")
+
+            btn_load = gr.components.Button("📁 Load")
+            btn_load.click(
+                fn=load_policy,
+                inputs=[input_policy],
+                outputs=[out_environment, out_agent],
+            )
+
+        with gr.Column():
+            input_n_test_episodes = gr.components.Slider(
+                minimum=1,
+                maximum=100,
+                value=5,
+                label="Number of episodes",
+            )
+            input_max_steps = gr.components.Slider(
+                minimum=1,
+                maximum=500,
+                value=500,
+                label="Max steps per episode",
+            )
+
+            input_render_fps = gr.components.Slider(
+                minimum=1,
+                maximum=60,
+                value=5,
+                label="Render FPS",
+            )
+            input_render_fps.change(change_render_fps, inputs=[input_render_fps])
+
+            btn_run = gr.components.Button("▶️ Run")
+
+    out_msg = gr.components.Textbox(label="Message")
+
+    with gr.Row():
+        out_episode = gr.components.Textbox(label="Current Episode")
+        out_step = gr.components.Textbox(label="Current Step")
+        out_state = gr.components.Textbox(label="Current State")
+        out_action = gr.components.Textbox(label="Chosen Action")
+        out_reward = gr.components.Textbox(label="Reward Received")
+        out_eps_solved = gr.components.Textbox(label="Episodes Solved")
+
+    out_image = gr.components.Image(label="Environment", type="numpy", image_mode="RGB")
+
+    btn_run.click(
+        fn=run,
+        inputs=[
+            input_n_test_episodes,
+            input_max_steps,
+            input_render_fps,
+        ],
+        outputs=[
+            out_image,
+            out_episode,
+            out_step,
+            out_state,
+            out_action,
+            out_reward,
+            out_eps_solved,
+            out_msg,
+        ],
+    )
+
+
+demo.queue(concurrency_count=2)
+demo.launch()
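
Note: demo.py derives the agent class and environment from the selected policy filename, so it relies on the run_name format that MonteCarloAgent.py now writes ("{AgentClass}_{env}_e{episodes}_s{steps}_g{gamma}_e{epsilon}.npy"); only the first two underscore-separated fields are used. A small illustration (not part of the commit), using the policy file added below:

policy_fname = "MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.5.npy"
props = policy_fname.split("_")
agent_type, env_name = props[0], props[1]
print(agent_type, env_name)  # MonteCarloAgent CliffWalking-v0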
policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.5.npy ADDED
Binary file (1.66 kB)
 
run_tests.py CHANGED
@@ -2,7 +2,7 @@ import os
 import multiprocessing
 import random

-num_tests = 10
+num_tests = 5

 update_types = ["first_visit", "every_visit"]
 vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]
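
Note: only the header of run_tests.py appears in this diff; the sweep logic itself is not shown. A hypothetical sketch follows, assuming the script shells out to MonteCarloAgent.py over the listed hyperparameters via the --update_type and --epsilon flags defined in its argparse setup (the actual implementation may differ):

import os
import itertools
from multiprocessing import Pool

num_tests = 5
update_types = ["first_visit", "every_visit"]
vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]

def launch(cfg):
    # Run one training job for a given (update_type, epsilon) pair.
    update_type, eps = cfg
    os.system(f"python3 MonteCarloAgent.py --update_type {update_type} --epsilon {eps}")

if __name__ == "__main__":
    # Repeat each hyperparameter combination num_tests times.
    configs = [c for c in itertools.product(update_types, vals_eps) for _ in range(num_tests)]
    with Pool(processes=4) as pool:
        pool.map(launch, configs)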