import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from tqdm import trange


class DPAgent:
    """Dynamic-programming (value iteration) agent for FrozenLake."""

    def __init__(self, env_name, gamma=0.9, theta=1e-10, **kwargs):
        self.env = gym.make(env_name, **kwargs)
        self.gamma = gamma  # discount factor
        self.theta = theta  # convergence threshold for value iteration
        self.V = np.zeros(self.env.observation_space.n)  # state-value estimates
        self.epsilon = 0

    def policy(self, state, return_value=False):
        """One-step lookahead over the known transition model.

        Returns (greedy action, max Q) if return_value is True,
        otherwise the full Q vector for the given state.
        """
        Q = np.zeros(self.env.action_space.n)
        for action in range(self.env.action_space.n):
            expected_value = 0
            for probability, next_state, reward, done in self.env.unwrapped.P[state][action]:
                # Hack: treat transitions out of the bottom-right (goal) state as
                # rewarding so its value does not collapse to zero during backups.
                if state == self.env.observation_space.n - 1:
                    reward = 1
                expected_value += probability * (reward + self.gamma * self.V[next_state])
            Q[action] = expected_value
        if return_value:
            return np.argmax(Q), np.max(Q)
        else:
            return Q

    def train(self):
        """Value iteration: sweep all states until the value function
        changes by less than theta."""
        i = 0
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                action, value = self.policy(state, return_value=True)
                self.V[state] = value
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            if delta < self.theta:
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")
        # Greedy policy w.r.t. the converged value function
        policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
        return self.V, policy

    def save_policy(self, pth):
        np.save(pth, self.V)

    def load_policy(self, pth):
        """
        Not really loading the 'policy' but the state-value function;
        named this way for the sake of a consistent interface.
        """
        self.V = np.load(pth)

    def generate_episode(self, max_steps, render=False, **kwargs):
        state, _ = self.env.reset()
        episode_hist, solved, rgb_array = [], False, None
        # Generate an episode following the current (greedy) policy
        for _ in range(max_steps):
            rgb_array = self.env.render() if render else None
            # Pick the greedy action from the one-step lookahead Q-values
            action = self.policy(state)
            maction = np.argmax(action)
            # Take the action and observe the reward and next state
            next_state, reward, done, truncated, _ = self.env.step(maction)
            # Keep track of the trajectory
            episode_hist.append((state, maction, reward))
            state = next_state
            yield episode_hist, solved, rgb_array
            # The episode ends on reaching the goal, falling into a hole, or
            # hitting the step limit; only the goal yields a reward of 1.
            if done or truncated:
                solved = reward == 1
                break
        rgb_array = self.env.render() if render else None
        yield episode_hist, solved, rgb_array
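

# Sketch (not part of the original file): print the greedy policy as arrows on
# the grid for quick inspection; assumes FrozenLake's action encoding
# 0=Left, 1=Down, 2=Right, 3=Up and a square map.
def print_policy(policy, ncols=8):
    arrows = ["<", "v", ">", "^"]
    for row in range(0, len(policy), ncols):
        print("".join(arrows[a] for a in policy[row:row + ncols]))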


if __name__ == "__main__":
    # env = gym.make('FrozenLake-v1', render_mode='human')
    desc = [
        "SFFFFFFF",
        "FFFFFFFH",
        "FFFHFFFF",
        "FFFFFHFF",
        "FFFHFFFF",
        "FHHFFFHF",
        "FHFFHFHF",
        "FFFHFFFG",
    ]
    dp = DPAgent("FrozenLake-v1", is_slippery=False, desc=desc)
    dp.train()
    dp.save_policy('dp_policy.npy')

    env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=desc)
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.policy(state)
        action = np.argmax(action)
        state, reward, done, _, _ = env.step(action)
        env.render()
        # plt.savefig(f"imgs/{0}.png")
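
    # Sketch (not in the original script): the same rollout done through
    # generate_episode, which yields the trajectory step by step on the
    # agent's own (non-rendering) env.
    for episode_hist, solved, _ in dp.generate_episode(max_steps=200):
        pass
    print(f"generate_episode: {len(episode_hist)} steps, solved={solved}")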