Andrei Cozma committed
Commit 6ee82fe · 1 Parent(s): e17747a
MCAgent.py CHANGED
@@ -7,7 +7,7 @@ from Shared import Shared
 
 class MCAgent(Shared):
     def __init__(self, /, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.reset()
 
     def reset(self):
@@ -79,6 +79,7 @@ class MCAgent(Shared):
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
+        self.run_name = f"{self.run_name}_{update_type}"
 
         (
             train_running_success_rate,
@@ -140,7 +141,7 @@ class MCAgent(Shared):
         if log_wandb:
             wandb.log(stats)
 
-        if test_running_success_rate > 0.999:
+        if test_running_success_rate > 0.99999:
             if save_best:
                 if self.run_name is None:
                     print("WARNING: run_name is None, not saving best policy.")
Shared.py CHANGED
@@ -12,19 +12,22 @@ class Shared:
         gamma=0.99,
         epsilon=0.1,
         run_name=None,
-        frozenlake_size=8,
+        seed=None,
         **kwargs,
     ):
         print("=" * 80)
         print(f"# Init Agent - {env}")
-        print(f"- epsilon: {epsilon}")
-        print(f"- gamma: {gamma}")
-        print(f"- run_name: {run_name}")
-        self.run_name = run_name
+
         self.env_name = env
-        self.epsilon, self.gamma = epsilon, gamma
+        self.epsilon, self.gamma = float(epsilon), float(gamma)
+        print(f"- epsilon: {self.epsilon}")
+        print(f"- gamma: {self.gamma}")
         self.epsilon_override = None
 
+        self.run_name = f"{run_name}_" if run_name is not None else ""
+        self.run_name += f"{env}_gamma:{gamma}_epsilon:{epsilon}"
+        print(f"- run_name: {run_name}")
+
         self.env_kwargs = {k: v for k, v in kwargs.items() if k in ["render_mode"]}
         if self.env_name == "FrozenLake-v1":
             # Can use defaults by defining map_name (4x4 or 8x8) or custom map by defining desc
@@ -39,7 +42,15 @@ class Shared:
             # "FHFFHFHF",
             # "FFFHFFFG",
             # ]
-            self.env_kwargs["desc"] = generate_random_map(size=frozenlake_size)
+            size = int(kwargs.get("size", 8))
+            print(f"- size: {size}")
+            self.run_name += f"_size:{size}"
+
+            seed = int(seed) if seed is not None else np.random.randint(0, 100000)
+            print(f"- seed: {seed}")
+            self.run_name += f"_seed:{seed}"
+
+            self.env_kwargs["desc"] = generate_random_map(size=size, seed=seed)
             self.env_kwargs["is_slippery"] = False
 
         self.env = gym.make(self.env_name, **self.env_kwargs)
@@ -150,13 +161,21 @@ class Shared:
         )
         return success_rate
 
-    def save_policy(self, fname="policy.npy", save_dir=None):
+    def save_policy(self, fname=None, save_dir=None):
+        if fname is None and self.run_name is None:
+            raise ValueError("Must provide a filename or a run name to save the policy")
+        elif fname is None:
+            fname = self.run_name
+
         if save_dir is not None:
             os.makedirs(save_dir, exist_ok=True)
             fname = os.path.join(save_dir, fname)
-        print(f"Saving policy to: {fname}")
+
+        print(f"Saving policy to: '{fname}'")
         np.save(fname, self.Pi)
 
     def load_policy(self, fname="policy.npy"):
-        print(f"Loading policy from: {fname}")
+        print(f"Loading policy from: '{fname}'")
+        if not fname.endswith(".npy"):
+            fname += ".npy"
         self.Pi = np.load(fname)
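
Note: `Shared.__init__` now encodes the hyperparameters into `run_name` as `key:value` segments, which the demo later parses back out of policy filenames. A minimal standalone sketch of the naming scheme (the values here are illustrative, not from a real run):

    # Sketch of the run_name composition in Shared.__init__ (illustrative values).
    run_name, env = "MCAgent", "FrozenLake-v1"
    gamma, epsilon, size, seed = 0.99, 0.4, 8, 27843

    name = f"{run_name}_" if run_name is not None else ""
    name += f"{env}_gamma:{gamma}_epsilon:{epsilon}"
    name += f"_size:{size}_seed:{seed}"  # FrozenLake-only segments

    print(name)
    # MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843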
agents.py CHANGED
@@ -2,7 +2,13 @@
 from MCAgent import MCAgent
 from DPAgent import DPAgent
 
-AGENTS_MAP = {
-    "MCAgent": MCAgent,
-    "DPAgent": DPAgent
-}
+AGENTS_MAP = {"MCAgent": MCAgent, "DPAgent": DPAgent}
+
+
+def load_agent(agent_name, **kwargs):
+    if agent_name not in AGENTS_MAP:
+        raise ValueError(
+            f"ERROR: Agent '{agent_name}' not valid. Must be one of: {AGENTS_MAP.keys()}"
+        )
+
+    return AGENTS_MAP[agent_name](**kwargs)
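
A quick usage sketch for the new factory (argument values are illustrative):

    from agents import load_agent

    # Valid names come from AGENTS_MAP; extra kwargs flow into the agent's __init__.
    agent = load_agent("MCAgent", env="FrozenLake-v1", gamma=0.99, epsilon=0.4)

    try:
        load_agent("QAgent")  # hypothetical name, not registered
    except ValueError as e:
        print(e)  # lists the valid keys from AGENTS_MAP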
demo.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import time
+import warnings
 import numpy as np
 import gradio as gr
 
@@ -23,6 +24,7 @@ try:
     all_policies = [
         file for file in os.listdir(policies_folder) if file.endswith(".npy")
     ]
+    all_policies.sort()
 except FileNotFoundError:
     print("ERROR: No policies folder found!")
     all_policies = []
@@ -70,8 +72,10 @@ def reset(state, policy_fname):
     state.live_render_fps = default_render_fps
     state.live_epsilon = default_epsilon
     state.live_steps_forward = None
-    return gr.update(value=pause_val_map_inv[not state.live_paused]), gr.update(
-        interactive=state.live_paused
+    return (
+        state,
+        gr.update(value=pause_val_map_inv[not state.live_paused]),
+        gr.update(interactive=state.live_paused),
     )
 
 
@@ -135,15 +139,32 @@ def run(
     policy_path = os.path.join(policies_folder, policy_fname)
    props = policy_fname.split("_")
 
-    if len(props) < 2:
+    try:
+        agent_key, env_key = props[0], props[1]
+        agent_args = {}
+        for prop in props[2:]:
+            props_split = prop.split(":")
+            if len(props_split) == 2:
+                agent_args[props_split[0]] = props_split[1]
+            else:
+                warnings.warn(
+                    f"Skipping property {prop} as it does not have the format 'key:value'.",
+                    UserWarning,
+                )
+    except IndexError:
         yield localstate, None, None, None, None, None, None, None, None, None, None, "🚫 Please select a valid policy file."
         return
 
-    agent_type, env_name = props[0], props[1]
-
-    agent = AGENTS_MAP[agent_type](env=env_name, render_mode="rgb_array")
+    agent_args.update(
+        {
+            "env": env_key,
+            "render_mode": "rgb_array",
+        }
+    )
+    print("agent_args:", agent_args)
+    agent = AGENTS_MAP[agent_key](**agent_args)
     agent.load_policy(policy_path)
-    env_action_map = action_map.get(env_name)
+    env_action_map = action_map.get(env_key)
 
     solved, frame_env, frame_policy = None, None, None
     episode, step, state, action, reward, last_reward = (
@@ -255,7 +276,7 @@ def run(
                 f"Episode: {ep_str(episode + 1)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (epsilon: {localstate.live_epsilon:.2f}) (frame time: {1 / localstate.live_render_fps:.2f}s)"
             )
 
-            yield localstate, agent_type, env_name, frame_env, frame_policy, ep_str(
+            yield localstate, agent_key, env_key, frame_env, frame_policy, ep_str(
                 episode + 1
             ), ep_str(episodes_solved), step_str(
                 step
@@ -272,7 +293,7 @@ def run(
                 time.sleep(1 / localstate.live_render_fps)
 
                 while localstate.live_paused and localstate.live_steps_forward is None:
-                    yield localstate, agent_type, env_name, frame_env, frame_policy, ep_str(
+                    yield localstate, agent_key, env_key, frame_env, frame_policy, ep_str(
                         episode + 1
                     ), ep_str(episodes_solved), step_str(
                         step
@@ -285,8 +306,8 @@ def run(
             localstate.should_reset = False
             yield (
                 localstate,
-                agent_type,
-                env_name,
+                agent_key,
+                env_key,
                 np.ones((frame_env_h, frame_env_w, 3)),
                 np.ones((frame_policy_h, frame_policy_res)),
                 ep_str(episode + 1),
@@ -305,7 +326,7 @@ def run(
         time.sleep(0.25)
 
     localstate.current_policy = None
-    yield localstate, agent_type, env_name, frame_env, frame_policy, ep_str(
+    yield localstate, agent_key, env_key, frame_env, frame_policy, ep_str(
        episode + 1
    ), ep_str(episodes_solved), step_str(step), state, action, reward, "Done!"
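
The demo now recovers agent construction arguments straight from the policy filename. A standalone sketch of that parsing, run against one of the renamed policies below; suffix segments without a colon (e2500, s200, first, visit) are skipped with a UserWarning:

    import warnings

    fname = "MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843_e2500_s200_first_visit"
    props = fname.split("_")
    agent_key, env_key = props[0], props[1]

    agent_args = {}
    for prop in props[2:]:
        key_value = prop.split(":")
        if len(key_value) == 2:
            agent_args[key_value[0]] = key_value[1]
        else:
            warnings.warn(f"Skipping property {prop}.", UserWarning)

    print(agent_key, env_key, agent_args)
    # MCAgent FrozenLake-v1 {'gamma': '0.99', 'epsilon': '0.4', 'size': '8', 'seed': '27843'}

The parsed values arrive as strings, which is presumably why Shared.__init__ now casts epsilon/gamma with float() and size/seed with int().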
 
policies/{MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843_e2500_s200_first_visit.npy} RENAMED
Binary files a/policies/MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy and b/policies/MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843_e2500_s200_first_visit.npy differ
 
policies/{MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:84740_e2500_s200_first_visit.npy} RENAMED
Binary files a/policies/MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.4_first_visit.npy and b/policies/MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:84740_e2500_s200_first_visit.npy differ
 
requirements.txt CHANGED
@@ -1,8 +1,7 @@
 gradio==3.27.0
 gymnasium>=0.28.1
-numpy==1.21.5
+numpy>=1.23
 opencv_python_headless==4.6.0.66
-pip==22.0.2
 scipy==1.8.0
 tabulate==0.9.0
 tqdm==4.64.1
run.py CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import wandb
 
-from agents import AGENTS_MAP
+from agents import AGENTS_MAP, load_agent
 
 
 def main():
@@ -96,6 +96,20 @@ def main():
         help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
 
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)",
+    )
+
+    parser.add_argument(
+        "--size",
+        type=int,
+        default=8,
+        help="The size to use when generating the FrozenLake environment. (default: 8)",
+    )
+
     parser.add_argument(
         "--render_mode",
         type=str,
@@ -123,13 +137,12 @@ def main():
 
     args = parser.parse_args()
     print(vars(args))
-    agent = AGENTS_MAP[args.agent](**dict(args._get_kwargs()))
 
-    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}_{args.update_type}"
-    if args.wandb_run_name_suffix is not None:
-        run_name += f"+{args.wandb_run_name_suffix}"
+    agent = load_agent(args.agent, **dict(args._get_kwargs()))
 
-    agent.run_name = run_name
+    agent.run_name += f"_e{args.n_train_episodes}_s{args.max_steps}"
+    if args.wandb_run_name_suffix is not None:
+        agent.run_name += f"+{args.wandb_run_name_suffix}"
 
     try:
         if args.train:
@@ -137,7 +150,7 @@ def main():
             if args.wandb_project is not None:
                 wandb.init(
                     project=args.wandb_project,
-                    name=run_name,
+                    name=agent.run_name,
                     group=args.agent,
                     job_type=args.wandb_job_type,
                     config=dict(args._get_kwargs()),
@@ -154,13 +167,8 @@ def main():
                 save_best_dir=args.save_dir,
             )
         if not args.no_save:
-            agent.save_policy(
-                fname=f"{run_name}.npy",
-                save_dir=args.save_dir,
-            )
+            agent.save_policy(save_dir=args.save_dir)
     elif args.test is not None:
-        if not args.test.endswith(".npy"):
-            args.test += ".npy"
         agent.load_policy(args.test)
         agent.test(
             n_test_episodes=args.n_test_episodes,
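
Round trip under the new save/load behavior, as a hedged sketch (assumes reset() initializes self.Pi; paths and values illustrative):

    from agents import load_agent

    agent = load_agent("MCAgent", env="FrozenLake-v1", size=8, seed=27843)
    agent.save_policy(save_dir="policies")           # fname falls back to agent.run_name; np.save adds ".npy"
    agent.load_policy(f"policies/{agent.run_name}")  # load_policy appends ".npy" if missing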