Andrei Cozma committed
Commit: 8ceccef
1 Parent(s): b8a5776
Updates

Files changed:
- MonteCarloAgent.py +81 -23
- demo.py +162 -0
- policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.5.npy +0 -0
- run_tests.py +1 -1
MonteCarloAgent.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 import gymnasium as gym
 from tqdm import tqdm
@@ -8,6 +9,7 @@ import wandb

 class MonteCarloAgent:
     def __init__(self, env_name="CliffWalking-v0", gamma=0.99, epsilon=0.1, **kwargs):
+        print("=" * 80)
         print(f"# MonteCarloAgent - {env_name}")
         print(f"- epsilon: {epsilon}")
         print(f"- gamma: {gamma}")
@@ -45,25 +47,68 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])

-    def run_episode(self, max_steps=500, **kwargs):
+    # def run_episode(self, max_steps=500, render=False, **kwargs):
+    #     state, _ = self.env.reset()
+    #     episode_hist, solved, rgb_array = [], False, None
+
+    #     # Generate an episode following the current policy
+    #     for _ in range(max_steps):
+    #         rgb_array = self.env.render() if render else None
+    #         # Sample an action from the policy
+    #         action = self.choose_action(state)
+    #         # Take the action and observe the reward and next state
+    #         next_state, reward, done, truncated, _ = self.env.step(action)
+    #         # Keeping track of the trajectory
+    #         episode_hist.append((state, action, reward))
+    #         state = next_state
+
+    #         # This is where the agent got to the goal.
+    #         # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
+    #         if done:
+    #             solved = True
+    #             break
+    #         if truncated:
+    #             break
+
+    #     rgb_array = self.env.render() if render else None
+
+    #     return episode_hist, solved, rgb_array
+
+    def generate_episode(self, max_steps=500, render=False, **kwargs):
         state, _ = self.env.reset()
-        episode_hist = []
-
+        episode_hist, solved, rgb_array = [], False, None
+
         # Generate an episode following the current policy
         for _ in range(max_steps):
+            rgb_array = self.env.render() if render else None
             # Sample an action from the policy
             action = self.choose_action(state)
             # Take the action and observe the reward and next state
-            next_state, reward,
+            next_state, reward, done, truncated, _ = self.env.step(action)
             # Keeping track of the trajectory
             episode_hist.append((state, action, reward))
             state = next_state
+
+            yield episode_hist, solved, rgb_array
+
             # This is where the agent got to the goal.
             # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
-            if
+            if done or truncated:
+                solved = True
                 break

-
+        rgb_array = self.env.render() if render else None
+
+        yield episode_hist, solved, rgb_array
+
+    def run_episode(self, max_steps=500, render=False, **kwargs):
+        # Run the generator until the end
+        episode_hist, solved, rgb_array = None, False, None
+        for episode_hist, solved, rgb_array in self.generate_episode(
+            max_steps, render, **kwargs
+        ):
+            pass
+        return episode_hist, solved, rgb_array

     def update_first_visit(self, episode_hist):
         G = 0
@@ -127,11 +172,11 @@ class MonteCarloAgent:
             self.wandb_log_img(episode=None)

         for e in tqrange:
-            episode_hist,
+            episode_hist, solved, _ = self.run_episode(**kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
             train_running_success_rate = (
-                0.99 * train_running_success_rate + 0.01 *
+                0.99 * train_running_success_rate + 0.01 * solved
             )
             update_func(episode_hist)

@@ -159,11 +204,11 @@ class MonteCarloAgent:
         print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _,
-            num_successes +=
+            _, solved, _ = self.run_episode(**kwargs)
+            num_successes += solved
             if verbose:
-                word = "reached" if
-                emoji = "🏁" if
+                word = "reached" if solved else "did not reach"
+                emoji = "🏁" if solved else "🚫"
                 print(
                     f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
                 )
@@ -175,15 +220,18 @@ class MonteCarloAgent:
         )
         return success_rate

-    def save_policy(self, fname="policy.npy"):
-
+    def save_policy(self, fname="policy.npy", save_dir=None):
+        if save_dir is not None:
+            os.makedirs(save_dir, exist_ok=True)
+            fname = os.path.join(save_dir, fname)
+        print(f"Saving policy to: {fname}")
         np.save(fname, self.Pi)

     def load_policy(self, fname="policy.npy"):
-        print(f"Loading policy from {fname}")
+        print(f"Loading policy from: {fname}")
         self.Pi = np.load(fname)

-    def wandb_log_img(self, episode=None
+    def wandb_log_img(self, episode=None):
         caption_suffix = "Initial" if episode is None else f"After Episode {episode}"
         wandb.log(
             {
@@ -248,6 +296,13 @@ def main():
         help="The type of update to use. (default: first_visit)",
     )

+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        default="policies",
+        help="The directory to save the policy to. (default: policies)",
+    )
+
     parser.add_argument(
         "--no_save",
         action="store_true",
@@ -264,7 +319,7 @@ def main():
     parser.add_argument(
         "--epsilon",
         type=float,
-        default=0.
+        default=0.5,
         help="The value for the epsilon-greedy policy to use. (default: 0.1)",
     )

@@ -308,14 +363,14 @@ def main():

     args = parser.parse_args()

-
+    agent = MonteCarloAgent(
         args.env,
         gamma=args.gamma,
         epsilon=args.epsilon,
         render_mode=args.render_mode,
     )

-    run_name = f"
+    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
     if args.wandb_run_name_suffix is not None:
         run_name += f"+{args.wandb_run_name_suffix}"

@@ -331,7 +386,7 @@ def main():
             config=dict(args._get_kwargs()),
         )

-
+        agent.train(
             n_train_episodes=args.n_train_episodes,
             test_every=args.test_every,
             n_test_episodes=args.n_test_episodes,
@@ -340,12 +395,15 @@ def main():
             log_wandb=args.wandb_project is not None,
         )
         if not args.no_save:
-
+            agent.save_policy(
+                fname=f"{run_name}.npy",
+                save_dir=args.save_dir,
+            )
     elif args.test is not None:
         if not args.test.endswith(".npy"):
             args.test += ".npy"
-
-
+        agent.load_policy(args.test)
+        agent.test(
             n_test_episodes=args.n_test_episodes,
             max_steps=args.max_steps,
         )
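The substance of this change: episode rollout is now a generator. generate_episode() yields (episode_hist, solved, rgb_array) after every environment step, while run_episode() simply drains the generator and returns the final tuple, so train() and test() keep their old call pattern and the new demo.py (below) can stream one rendered frame per step. A minimal consumption sketch, not part of the commit, assuming the updated MonteCarloAgent.py is on the import path and gymnasium's CliffWalking-v0 is installed:

# Sketch only: streaming vs. batch use of the new rollout API.
from MonteCarloAgent import MonteCarloAgent

agent = MonteCarloAgent("CliffWalking-v0", render_mode="rgb_array")

# Streaming use (what demo.py does): one yield per environment step.
for episode_hist, solved, rgb_array in agent.generate_episode(max_steps=20, render=True):
    state, action, reward = episode_hist[-1]
    print(f"step {len(episode_hist)}: state={state} action={action} reward={reward} solved={solved}")

# Batch use (what train()/test() do): run_episode() drains the generator
# and returns only the final (episode_hist, solved, rgb_array) tuple.
episode_hist, solved, _ = agent.run_episode(max_steps=20)
print(f"episode length: {len(episode_hist)}, solved: {solved}")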
demo.py
ADDED
@@ -0,0 +1,162 @@
+import os
+import time
+import numpy as np
+import gradio as gr
+from MonteCarloAgent import MonteCarloAgent
+
+
+# For the dropdown list of policies
+policies_folder = "policies"
+all_policies = [file for file in os.listdir(policies_folder) if file.endswith(".npy")]
+
+# All supported agents
+agent_map = {
+    "MonteCarloAgent": MonteCarloAgent,
+    # TODO: Add DP Agent
+}
+
+# Global variables for the agent and the render fps (to allow changing it on the fly)
+agent = None
+render_fps_val = 5
+
+
+def load_policy(policy_fname):
+    print("Loading...")
+    print(f"- policy_fname: {policy_fname}")
+    global agent
+    policy_path = os.path.join(policies_folder, policy_fname)
+    props = policy_fname.split("_")
+    agent_type, env_name = props[0], props[1]
+
+    agent = agent_map[agent_type](env_name, render_mode="rgb_array")
+    agent.load_policy(policy_path)
+
+    return agent.env.spec.id, agent.__class__.__name__
+
+
+def change_render_fps(x):
+    print("Changing render fps:", x)
+    global render_fps_val
+    render_fps_val = x
+
+
+def run(n_test_episodes, max_steps, render_fps):
+    global agent, render_fps_val
+    render_fps_val = render_fps
+    print("Running...")
+    print(f"- n_test_episodes: {n_test_episodes}")
+    print(f"- max_steps: {max_steps}")
+    print(f"- render_fps: {render_fps_val}")
+
+    while agent is None:
+        print("Waiting for agent to be loaded...")
+        time.sleep(1)
+        yield None, None, None, None, None, None, None, "🚫 ERROR: Please load a policy first!"
+
+    rgb_array = np.random.random((25, 100, 3))
+    episode, step = 0, 0
+    state, action, reward = 0, 0, 0
+    episodes_solved = 0
+
+    def ep_str(episode):
+        return f"{episode + 1} / {n_test_episodes} ({(episode + 1) / n_test_episodes * 100:.2f}%)"
+
+    def step_str(step):
+        return f"{step + 1}"
+
+    for episode in range(n_test_episodes):
+        for step, (episode_hist, solved, rgb_array) in enumerate(
+            agent.generate_episode(max_steps=max_steps, render=True)
+        ):
+            if solved:
+                episodes_solved += 1
+            state, action, reward = episode_hist[-1]
+
+            print(
+                f"Episode: {ep_str(episode)} - step: {step_str} - state: {state} - action: {action} - reward: {reward} (frame time: {1 / render_fps:.2f}s)"
+            )
+
+            time.sleep(1 / render_fps_val)
+            yield rgb_array, ep_str(episode), step_str(step), state, action, reward, ep_str(episodes_solved), "Running..."
+
+    yield rgb_array, ep_str(episode), step_str(step), state, action, reward, ep_str(episodes_solved), "Done!"
+
+
+with gr.Blocks() as demo:
+    # TODO: Add title and description
+
+    with gr.Row():
+        with gr.Column():
+            input_policy = gr.components.Dropdown(
+                label="Policy", choices=all_policies, value=all_policies[0]
+            )
+
+            with gr.Row():
+                out_environment = gr.components.Textbox(label="Environment")
+                out_agent = gr.components.Textbox(label="Agent")
+
+            btn_load = gr.components.Button("📁 Load")
+            btn_load.click(
+                fn=load_policy,
+                inputs=[input_policy],
+                outputs=[out_environment, out_agent],
+            )
+
+        with gr.Column():
+            input_n_test_episodes = gr.components.Slider(
+                minimum=1,
+                maximum=100,
+                value=5,
+                label="Number of episodes",
+            )
+            input_max_steps = gr.components.Slider(
+                minimum=1,
+                maximum=500,
+                value=500,
+                label="Max steps per episode",
+            )
+
+            input_render_fps = gr.components.Slider(
+                minimum=1,
+                maximum=60,
+                value=5,
+                label="Render FPS",
+            )
+            input_render_fps.change(change_render_fps, inputs=[input_render_fps])
+
+            btn_run = gr.components.Button("▶️ Run")
+
+    out_msg = gr.components.Textbox(label="Message")
+
+    with gr.Row():
+        out_episode = gr.components.Textbox(label="Current Episode")
+        out_step = gr.components.Textbox(label="Current Step")
+        out_state = gr.components.Textbox(label="Current State")
+        out_action = gr.components.Textbox(label="Chosen Action")
+        out_reward = gr.components.Textbox(label="Reward Received")
+        out_eps_solved = gr.components.Textbox(label="Episodes Solved")
+
+    out_image = gr.components.Image(label="Environment", type="numpy", image_mode="RGB")
+
+    btn_run.click(
+        fn=run,
+        inputs=[
+            input_n_test_episodes,
+            input_max_steps,
+            input_render_fps,
+        ],
+        outputs=[
+            out_image,
+            out_episode,
+            out_step,
+            out_state,
+            out_action,
+            out_reward,
+            out_eps_solved,
+            out_msg,
+        ],
+    )
+
+
+demo.queue(concurrency_count=2)
+demo.launch()
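demo.py recovers the agent class and environment name by splitting the selected policy filename on underscores, which is why main() in MonteCarloAgent.py now builds run_name from the class name, environment, and hyperparameters, and why the committed policy file below follows the same pattern. Launching the app is simply running demo.py; every .npy under policies/ appears in the dropdown. A small illustration, not part of the commit, of that filename convention:

# Naming convention assumed by demo.py's load_policy():
#   <AgentClass>_<EnvName>_e<train episodes>_s<max steps>_g<gamma>_e<epsilon>.npy
fname = "MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.5.npy"
props = fname.split("_")
agent_type, env_name = props[0], props[1]
print(agent_type, env_name)  # MonteCarloAgent CliffWalking-v0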
policies/MonteCarloAgent_CliffWalking-v0_e2000_s500_g0.99_e0.5.npy
ADDED
Binary file (1.66 kB).
run_tests.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import multiprocessing
 import random

-num_tests =
+num_tests = 5

 update_types = ["first_visit", "every_visit"]
 vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]
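Only the sweep size changes here (num_tests is now 5 runs per configuration); the rest of run_tests.py is not shown in this diff. For orientation, a hypothetical sketch of the kind of grid such a script could drive over the listed update types and epsilon values. The --epsilon and --no_save flags appear in MonteCarloAgent.py's argparse setup above, while --update_type is an assumed spelling:

# Hypothetical sketch only: run_tests.py's body is not part of this diff,
# and the --update_type flag spelling is an assumption.
import itertools
import multiprocessing
import os

num_tests = 5
update_types = ["first_visit", "every_visit"]
vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]

def launch(params):
    update_type, eps, run_idx = params
    cmd = f"python3 MonteCarloAgent.py --update_type {update_type} --epsilon {eps} --no_save"
    print(f"[run {run_idx}] {cmd}")
    os.system(cmd)

if __name__ == "__main__":
    # Every (update_type, epsilon) pair is repeated num_tests times,
    # spread across a small process pool.
    jobs = list(itertools.product(update_types, vals_eps, range(num_tests)))
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(launch, jobs)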