Andrei Cozma committed · Commit 6ee82fe · 1 Parent(s): e17747a

Updates
Files changed:
- MCAgent.py +3 -2
- Shared.py +29 -10
- agents.py +10 -4
- demo.py +33 -12
- policies/{MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843_e2500_s200_first_visit.npy} +0 -0
- policies/{MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:84740_e2500_s200_first_visit.npy} +0 -0
- requirements.txt +1 -2
- run.py +21 -13
MCAgent.py  CHANGED
@@ -7,7 +7,7 @@ from Shared import Shared
 
 class MCAgent(Shared):
     def __init__(self, /, **kwargs):
-        super().__init__(**kwargs)
+        super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.reset()
 
     def reset(self):
@@ -79,6 +79,7 @@ class MCAgent(Shared):
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
+        self.run_name = f"{self.run_name}_{update_type}"
 
         (
             train_running_success_rate,
@@ -140,7 +141,7 @@ class MCAgent(Shared):
         if log_wandb:
             wandb.log(stats)
 
-        if test_running_success_rate > 0.
+        if test_running_success_rate > 0.99999:
             if save_best:
                 if self.run_name is None:
                     print("WARNING: run_name is None, not saving best policy.")
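The run_name=self.__class__.__name__ argument hands the agent's class name to the base constructor, so saved artifacts are prefixed with the agent type. A minimal sketch of that pattern, keeping only the run_name handling from this commit and stripping everything else:

    # Sketch: the subclass passes its own name up to the base constructor,
    # so every saved policy name starts with the agent type (e.g. "MCAgent_...").
    class Shared:
        def __init__(self, run_name=None, **kwargs):
            self.run_name = f"{run_name}_" if run_name is not None else ""

    class MCAgent(Shared):
        def __init__(self, /, **kwargs):
            super().__init__(run_name=self.__class__.__name__, **kwargs)

    print(MCAgent().run_name)  # -> "MCAgent_"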
Shared.py  CHANGED
@@ -12,19 +12,22 @@ class Shared:
         gamma=0.99,
         epsilon=0.1,
         run_name=None,
-
+        seed=None,
         **kwargs,
     ):
         print("=" * 80)
         print(f"# Init Agent - {env}")
-
-        print(f"- gamma: {gamma}")
-        print(f"- run_name: {run_name}")
-        self.run_name = run_name
+
         self.env_name = env
-        self.epsilon, self.gamma = epsilon, gamma
+        self.epsilon, self.gamma = float(epsilon), float(gamma)
+        print(f"- epsilon: {self.epsilon}")
+        print(f"- gamma: {self.gamma}")
         self.epsilon_override = None
 
+        self.run_name = f"{run_name}_" if run_name is not None else ""
+        self.run_name += f"{env}_gamma:{gamma}_epsilon:{epsilon}"
+        print(f"- run_name: {run_name}")
+
         self.env_kwargs = {k: v for k, v in kwargs.items() if k in ["render_mode"]}
         if self.env_name == "FrozenLake-v1":
             # Can use defaults by defining map_name (4x4 or 8x8) or custom map by defining desc
@@ -39,7 +42,15 @@ class Shared:
             # "FHFFHFHF",
             # "FFFHFFFG",
             # ]
-
+            size = int(kwargs.get("size", 8))
+            print(f"- size: {size}")
+            self.run_name += f"_size:{size}"
+
+            seed = int(seed) if seed is not None else np.random.randint(0, 100000)
+            print(f"- seed: {seed}")
+            self.run_name += f"_seed:{seed}"
+
+            self.env_kwargs["desc"] = generate_random_map(size=size, seed=seed)
             self.env_kwargs["is_slippery"] = False
 
         self.env = gym.make(self.env_name, **self.env_kwargs)
@@ -150,13 +161,21 @@ class Shared:
         )
         return success_rate
 
-    def save_policy(self, fname=
+    def save_policy(self, fname=None, save_dir=None):
+        if fname is None and self.run_name is None:
+            raise ValueError("Must provide a filename or a run name to save the policy")
+        elif fname is None:
+            fname = self.run_name
+
         if save_dir is not None:
             os.makedirs(save_dir, exist_ok=True)
             fname = os.path.join(save_dir, fname)
-
+
+        print(f"Saving policy to: '{fname}'")
         np.save(fname, self.Pi)
 
     def load_policy(self, fname="policy.npy"):
-        print(f"Loading policy from: {fname}")
+        print(f"Loading policy from: '{fname}'")
+        if not fname.endswith(".npy"):
+            fname += ".npy"
        self.Pi = np.load(fname)
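With these changes the constructor assembles run_name from the environment and hyperparameters, and, for FrozenLake, appends the generated map's size and seed. A rough reconstruction of the string it builds; the concrete values below are examples taken from the renamed policy files, not defaults:

    # Example of the run_name assembled by Shared.__init__ for FrozenLake.
    run_name, env, gamma, epsilon = "MCAgent", "FrozenLake-v1", 0.99, 0.4
    size, seed = 8, 27843  # example values; the seed is random when not provided

    name = f"{run_name}_" if run_name is not None else ""
    name += f"{env}_gamma:{gamma}_epsilon:{epsilon}"
    name += f"_size:{size}"
    name += f"_seed:{seed}"
    print(name)  # MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843

This is the prefix visible in the renamed .npy files above; run.py then appends the episode/step counts and MCAgent.train() appends the update type.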
agents.py  CHANGED
@@ -2,7 +2,13 @@
 from MCAgent import MCAgent
 from DPAgent import DPAgent
 
-AGENTS_MAP = {
-
-
-
+AGENTS_MAP = {"MCAgent": MCAgent, "DPAgent": DPAgent}
+
+
+def load_agent(agent_name, **kwargs):
+    if agent_name not in AGENTS_MAP:
+        raise ValueError(
+            f"ERROR: Agent '{agent_name}' not valid. Must be one of: {AGENTS_MAP.keys()}"
+        )
+
+    return AGENTS_MAP[agent_name](**kwargs)
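The new load_agent helper centralizes agent construction by name and now backs run.py (see below). A minimal usage sketch; the keyword arguments are examples of what the Shared constructor accepts, not required values:

    from agents import load_agent

    # Valid name: returns a constructed agent.
    agent = load_agent("MCAgent", env="FrozenLake-v1", gamma=0.99, epsilon=0.4, size=8, seed=27843)

    # Unknown name: raises ValueError listing the keys of AGENTS_MAP.
    # load_agent("QAgent")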
demo.py  CHANGED
@@ -1,5 +1,6 @@
 import os
 import time
+import warnings
 import numpy as np
 import gradio as gr
 
@@ -23,6 +24,7 @@ try:
     all_policies = [
         file for file in os.listdir(policies_folder) if file.endswith(".npy")
     ]
+    all_policies.sort()
 except FileNotFoundError:
     print("ERROR: No policies folder found!")
     all_policies = []
@@ -70,8 +72,10 @@ def reset(state, policy_fname):
     state.live_render_fps = default_render_fps
     state.live_epsilon = default_epsilon
     state.live_steps_forward = None
-    return
-
+    return (
+        state,
+        gr.update(value=pause_val_map_inv[not state.live_paused]),
+        gr.update(interactive=state.live_paused),
     )
 
 
@@ -135,15 +139,32 @@ def run(
     policy_path = os.path.join(policies_folder, policy_fname)
     props = policy_fname.split("_")
 
-
+    try:
+        agent_key, env_key = props[0], props[1]
+        agent_args = {}
+        for prop in props[2:]:
+            props_split = prop.split(":")
+            if len(props_split) == 2:
+                agent_args[props_split[0]] = props_split[1]
+            else:
+                warnings.warn(
+                    f"Skipping property {prop} as it does not have the format 'key:value'.",
+                    UserWarning,
+                )
+    except IndexError:
         yield localstate, None, None, None, None, None, None, None, None, None, None, "🚫 Please select a valid policy file."
         return
 
-
-
-
+    agent_args.update(
+        {
+            "env": env_key,
+            "render_mode": "rgb_array",
+        }
+    )
+    print("agent_args:", agent_args)
+    agent = AGENTS_MAP[agent_key](**agent_args)
     agent.load_policy(policy_path)
-    env_action_map = action_map.get(
+    env_action_map = action_map.get(env_key)
 
     solved, frame_env, frame_policy = None, None, None
     episode, step, state, action, reward, last_reward = (
@@ -255,7 +276,7 @@ def run(
         f"Episode: {ep_str(episode + 1)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (epsilon: {localstate.live_epsilon:.2f}) (frame time: {1 / localstate.live_render_fps:.2f}s)"
     )
 
-    yield localstate,
+    yield localstate, agent_key, env_key, frame_env, frame_policy, ep_str(
         episode + 1
     ), ep_str(episodes_solved), step_str(
         step
@@ -272,7 +293,7 @@ def run(
     time.sleep(1 / localstate.live_render_fps)
 
     while localstate.live_paused and localstate.live_steps_forward is None:
-        yield localstate,
+        yield localstate, agent_key, env_key, frame_env, frame_policy, ep_str(
             episode + 1
         ), ep_str(episodes_solved), step_str(
             step
@@ -285,8 +306,8 @@ def run(
     localstate.should_reset = False
     yield (
         localstate,
-
-
+        agent_key,
+        env_key,
         np.ones((frame_env_h, frame_env_w, 3)),
         np.ones((frame_policy_h, frame_policy_res)),
         ep_str(episode + 1),
@@ -305,7 +326,7 @@ def run(
     time.sleep(0.25)
 
     localstate.current_policy = None
-    yield localstate,
+    yield localstate, agent_key, env_key, frame_env, frame_policy, ep_str(
         episode + 1
     ), ep_str(episodes_solved), step_str(step), state, action, reward, "Done!"
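The demo now derives the agent configuration from the key:value pairs embedded in the policy filename instead of a fixed naming scheme. A standalone sketch of that parsing applied to one of the renamed policies; trailing pieces such as e2500, s200 and first_visit don't match the key:value format and are skipped with a warning:

    import warnings

    policy_fname = "MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843_e2500_s200_first_visit.npy"
    props = policy_fname.split("_")

    agent_key, env_key = props[0], props[1]  # "MCAgent", "FrozenLake-v1"
    agent_args = {}
    for prop in props[2:]:
        key_value = prop.split(":")
        if len(key_value) == 2:
            agent_args[key_value[0]] = key_value[1]
        else:
            warnings.warn(f"Skipping property {prop} as it does not have the format 'key:value'.", UserWarning)

    print(agent_args)  # {'gamma': '0.99', 'epsilon': '0.4', 'size': '8', 'seed': '27843'}

The parsed values arrive as strings, which is why Shared.__init__ now casts epsilon/gamma with float() and size/seed with int().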
policies/{MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843_e2500_s200_first_visit.npy}  RENAMED
Binary files a/policies/MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy and b/policies/MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:27843_e2500_s200_first_visit.npy differ

policies/{MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:84740_e2500_s200_first_visit.npy}  RENAMED
Binary files a/policies/MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.4_first_visit.npy and b/policies/MCAgent_FrozenLake-v1_gamma:0.99_epsilon:0.4_size:8_seed:84740_e2500_s200_first_visit.npy differ
requirements.txt  CHANGED
@@ -1,8 +1,7 @@
 gradio==3.27.0
 gymnasium>=0.28.1
-numpy
+numpy>=1.23
 opencv_python_headless==4.6.0.66
-pip==22.0.2
 scipy==1.8.0
 tabulate==0.9.0
 tqdm==4.64.1
run.py  CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import wandb
 
-from agents import AGENTS_MAP
+from agents import AGENTS_MAP, load_agent
 
 
 def main():
@@ -96,6 +96,20 @@ def main():
         help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
 
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)",
+    )
+
+    parser.add_argument(
+        "--size",
+        type=int,
+        default=8,
+        help="The size to use when generating the FrozenLake environment. (default: 8)",
+    )
+
     parser.add_argument(
         "--render_mode",
         type=str,
@@ -123,13 +137,12 @@ def main():
 
     args = parser.parse_args()
     print(vars(args))
-    agent = AGENTS_MAP[args.agent](**dict(args._get_kwargs()))
 
-
-    if args.wandb_run_name_suffix is not None:
-        run_name += f"+{args.wandb_run_name_suffix}"
+    agent = load_agent(args.agent, **dict(args._get_kwargs()))
 
-    agent.run_name
+    agent.run_name += f"_e{args.n_train_episodes}_s{args.max_steps}"
+    if args.wandb_run_name_suffix is not None:
+        agent.run_name += f"+{args.wandb_run_name_suffix}"
 
     try:
         if args.train:
@@ -137,7 +150,7 @@ def main():
             if args.wandb_project is not None:
                 wandb.init(
                     project=args.wandb_project,
-                    name=run_name,
+                    name=agent.run_name,
                     group=args.agent,
                     job_type=args.wandb_job_type,
                     config=dict(args._get_kwargs()),
@@ -154,13 +167,8 @@ def main():
                 save_best_dir=args.save_dir,
             )
             if not args.no_save:
-                agent.save_policy(
-                    fname=f"{run_name}.npy",
-                    save_dir=args.save_dir,
-                )
+                agent.save_policy(save_dir=args.save_dir)
         elif args.test is not None:
-            if not args.test.endswith(".npy"):
-                args.test += ".npy"
             agent.load_policy(args.test)
             agent.test(
                 n_test_episodes=args.n_test_episodes,
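End to end, run.py now builds the agent through load_agent, extends agent.run_name with the episode and step counts, and lets save_policy() fall back to that name. A condensed sketch of the flow; the argument values are illustrative and the save call is left commented out:

    from agents import load_agent

    # Illustrative stand-in for the parsed argparse namespace.
    args = {"agent": "MCAgent", "env": "FrozenLake-v1", "gamma": 0.99, "epsilon": 0.4,
            "size": 8, "seed": 27843, "n_train_episodes": 2500, "max_steps": 200}

    agent = load_agent(args["agent"], **args)
    agent.run_name += f"_e{args['n_train_episodes']}_s{args['max_steps']}"

    # After training, no explicit fname is needed: save_policy() falls back to
    # agent.run_name, producing files named like the renamed policies above.
    # agent.save_policy(save_dir="./policies")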