Spaces:
Sleeping
Sleeping
File size: 7,845 Bytes
b8a5bf6 99ac186 b8a5bf6 668f525 46b0409 b8a5bf6 668f525 17d4626 b8a5bf6 6ee82fe b8a5bf6 17d4626 6ee82fe 17d4626 6ee82fe 668f525 b8a5bf6 6ee82fe 668f525 b8a5bf6 99ac186 6ee82fe b8a5bf6 6a48762 ec9cd4e 6a48762 ec9cd4e 6a48762 ec9cd4e 6a48762 668f525 b8a5bf6 ec9cd4e 668f525 6a48762 b8a5bf6 ec9cd4e b8a5bf6 668f525 b8a5bf6 668f525 7d3766a b8a5bf6 0a58f79 ec8233c b8a5bf6 ec8233c b8a5bf6 ec8233c 6a48762 ec8233c b8a5bf6 ec8233c b8a5bf6 a33a97c b8a5bf6 ec8233c b8a5bf6 ec8233c b8a5bf6 7d3766a b8a5bf6 668f525 6a48762 668f525 b8a5bf6 6a48762 b8a5bf6 668f525 6ee82fe b8a5bf6 6ee82fe 84b1569 6ee82fe b8a5bf6 6ee82fe b8a5bf6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import os
import numpy as np
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
class AgentBase:
    """Base class for tabular agents on Gymnasium toy-text environments.

    Builds the environment, composes a run name from the hyperparameters, and
    provides epsilon-greedy action sampling, episode generation with
    environment-specific reward shaping, policy evaluation, and persistence of
    the policy table ``self.Pi`` (shape ``(n_states, n_actions)``).
    Subclasses are expected to define ``self.Pi`` before calling ``test`` or
    ``save_policy``.
    """

    def __init__(
        self,
        /,
        env="CliffWalking-v0",
        gamma=0.99,
        epsilon=0.1,
        run_name=None,
        seed=None,
        **kwargs,
    ):
        """Create the environment and record the hyperparameters.

        :param env: Gymnasium environment id ("CliffWalking-v0", "FrozenLake-v1", "Taxi-v3")
        :param gamma: Discount factor
        :param epsilon: Exploration rate used by epsilon-greedy policies
        :param run_name: Optional prefix for the generated run name
        :param seed: Map-generation seed (FrozenLake only); random if None
        :param kwargs: Extra options; only "render_mode" is forwarded to
            gym.make, and "size" sets the FrozenLake map size (default 8)
        """
        print("=" * 80)
        print(f"# Init Agent - {env}")
        self.env_name = env
        self.epsilon, self.gamma = float(epsilon), float(gamma)
        print(f"- epsilon: {self.epsilon}")
        print(f"- gamma: {self.gamma}")
        # When set (see choose_action), overrides the policy's own exploration
        self.epsilon_override = None
        self.run_name = f"{run_name}_" if run_name is not None else ""
        self.run_name += f"{env}_gamma:{gamma}_epsilon:{epsilon}"
        # Fix: log the composed run name, not the raw (possibly None) argument.
        # For FrozenLake, size/seed suffixes are appended below, after this line.
        print(f"- run_name: {self.run_name}")
        # Only forward whitelisted keyword arguments to gym.make
        self.env_kwargs = {k: v for k, v in kwargs.items() if k in ["render_mode"]}
        if self.env_name == "FrozenLake-v1":
            # Alternatively, gym.make accepts map_name ("4x4"/"8x8") or a
            # custom desc (list of "SFHG" row strings) instead of a random map.
            size = int(kwargs.get("size", 8))
            print(f"- size: {size}")
            self.run_name += f"_size:{size}"
            seed = int(seed) if seed is not None else np.random.randint(0, 100000)
            print(f"- seed: {seed}")
            self.run_name += f"_seed:{seed}"
            # Reproducible random (solvable) map; deterministic transitions
            self.env_kwargs["desc"] = generate_random_map(size=size, seed=seed)
            self.env_kwargs["is_slippery"] = False
        self.env = gym.make(self.env_name, **self.env_kwargs)
        self.n_states, self.n_actions = (
            self.env.observation_space.n,
            self.env.action_space.n,
        )
        print(f"- n_states: {self.n_states}")
        print(f"- n_actions: {self.n_actions}")

    def choose_action(self, policy, state, greedy=False, **kwargs):
        """
        Sample an action from the policy.
        Also allows the ability to override the epsilon value (for the purpose of the demo)
        :param policy: The policy to sample from. Must be of shape (n_states, n_actions)
        :param state: The current state
        :param greedy: If True, always return the greedy action (argmax of the policy at the current state)
        :return: The sampled action
        """
        assert policy.shape == (self.n_states, self.n_actions), (
            f"ERROR: Policy must be of shape (n_states, n_actions) = ({self.n_states}, {self.n_actions}). "
            f"Got {policy.shape}."
        )
        # If greedy is True (or the override forces full exploitation),
        # always return the greedy action
        greedy_action = np.argmax(policy[state])
        if greedy or self.epsilon_override == 0.0:
            return greedy_action
        # Otherwise, sample an action from the soft policy (epsilon-greedy)
        if self.epsilon_override is None:
            return np.random.choice(self.n_actions, p=policy[state])
        # If we ever want to manually override the epsilon value, it happens here:
        # take a uniformly random action with probability epsilon_override
        return np.random.choice(
            [greedy_action, np.random.randint(self.n_actions)],
            p=[1.0 - self.epsilon_override, self.epsilon_override],
        )

    def generate_episode(self, policy, max_steps=None, render=False, **kwargs):
        """Generate one episode under ``policy``, yielding after every step.

        Yields ``(episode_hist, solved, rgb_array)`` where ``episode_hist`` is
        a list of ``(state, action, reward)`` triples, ``solved`` flags goal
        completion, and ``rgb_array`` is the rendered frame (None unless
        ``render`` is True).

        :param policy: Policy array of shape (n_states, n_actions)
        :param max_steps: Step cap; defaults to n_states * n_actions
        :param render: If True, capture env.render() output each step
        """
        if max_steps is None:
            # If max_steps is not specified, we use a rough estimate of
            # the maximum number of steps it should take to solve the environment
            max_steps = self.n_states * self.n_actions
        state, _ = self.env.reset()
        episode_hist, solved, done = [], False, False
        rgb_array = self.env.render() if render else None
        i = 0
        # Generate an episode following the current policy
        while i < max_steps and not solved and not done:
            # Sample the next action from the policy
            action = self.choose_action(policy, state, **kwargs)
            # Keeping track of the trajectory
            episode_hist.append((state, action, None))
            # Take the action and observe the reward and next state.
            # NOTE(review): the truncated flag from env.step is discarded here;
            # max_steps is the only episode-length cap — confirm intended.
            next_state, reward, done, _, _ = self.env.step(action)
            if self.env_name == "FrozenLake-v1":
                # Reward shaping: the env's sparse 0/1 reward is replaced with
                # +100 at the goal, -10 in a hole, -1 per intermediate step
                if done:
                    reward = 100 if reward == 1 else -10
                else:
                    reward = -1
            # Keeping track of the trajectory
            episode_hist[-1] = (state, action, reward)
            # Generate the output at intermediate steps for the demo
            yield episode_hist, solved, rgb_array
            # Render the environment if needed
            rgb_array = self.env.render() if render else None
            # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
            if done and self.env_name in ["CliffWalking-v0", "Taxi-v3"]:
                solved = True
            # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
            # We consider the episode solved when the agent reaches the goal
            if done and self.env_name == "FrozenLake-v1":
                # NOTE(review): nrow/ncol are reached through wrapper attribute
                # forwarding; may need self.env.unwrapped on newer gymnasium
                if next_state == self.env.nrow * self.env.ncol - 1:
                    solved = True
                else:
                    # Instead of terminating the episode when the agent moves into a hole, we reset the environment
                    # This is to keep consistent with the other environments
                    done, solved = False, False
                    next_state, _ = self.env.reset()
            state = next_state
            i += 1
        rgb_array = self.env.render() if render else None
        yield episode_hist, solved, rgb_array

    def run_episode(self, policy, max_steps=None, render=False, **kwargs):
        """Run generate_episode to completion and return its final output.

        :return: (episode_hist, solved, rgb_array) from the generator's last yield
        """
        # Drain the generator keeping only the last yield, instead of
        # materializing every intermediate snapshot in a throwaway list
        episode_hist, solved, rgb_array = [], False, None
        for episode_hist, solved, rgb_array in self.generate_episode(
            policy, max_steps, render, **kwargs
        ):
            pass
        return episode_hist, solved, rgb_array

    def test(self, n_test_episodes=100, verbose=True, greedy=True, **kwargs):
        """Run the current policy ``self.Pi`` and report the success rate.

        :param n_test_episodes: Number of evaluation episodes
        :param verbose: If True, print per-episode and summary results
        :param greedy: If True, act greedily (no exploration) during testing
        :return: Fraction of episodes in which the goal was reached
        """
        if verbose:
            print(f"Testing agent for {n_test_episodes} episodes...")
        num_successes = 0
        for e in range(n_test_episodes):
            _, solved, _ = self.run_episode(policy=self.Pi, greedy=greedy, **kwargs)
            num_successes += solved
            if verbose:
                word = "reached" if solved else "did not reach"
                emoji = "🏁" if solved else "🚫"
                print(
                    f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
                )
        success_rate = num_successes / n_test_episodes
        if verbose:
            print(
                f"Agent reached the goal in {num_successes}/{n_test_episodes} episodes ({success_rate * 100:.2f}%)"
            )
        return success_rate

    def save_policy(self, fname=None, save_dir=None):
        """Save ``self.Pi`` to ``<save_dir>/<fname>.npy``.

        :param fname: Base filename; defaults to self.run_name
        :param save_dir: Optional directory (created if missing)
        :raises ValueError: If neither fname nor run_name is available
        """
        if fname is None and self.run_name is None:
            raise ValueError("Must provide a filename or a run name to save the policy")
        elif fname is None:
            fname = self.run_name
        if save_dir is not None:
            os.makedirs(save_dir, exist_ok=True)
            fname = os.path.join(save_dir, fname)
        if not fname.endswith(".npy"):
            fname += ".npy"
        print(f"Saving policy to: '{fname}'")
        np.save(fname, self.Pi)

    def load_policy(self, fname="policy.npy"):
        """Load the policy table from disk into ``self.Pi``.

        :param fname: Path to the saved policy (".npy" appended if missing)
        """
        # Fix: normalize the extension BEFORE logging, so the printed path
        # matches the file actually opened (mirrors save_policy)
        if not fname.endswith(".npy"):
            fname += ".npy"
        print(f"Loading policy from: '{fname}'")
        self.Pi = np.load(fname)
|