Spaces:
Sleeping
Sleeping
File size: 4,719 Bytes
e0c3c75 93d8108 e0c3c75 93d8108 e0c3c75 46b0409 93d8108 46b0409 cf8b7c4 8ae24a2 93d8108 8ae24a2 cf8b7c4 93d8108 f929afb 93d8108 8ae24a2 e0c3c75 93d8108 8ae24a2 93d8108 f929afb cf8b7c4 46b0409 3266489 cf8b7c4 f929afb 93d8108 e0c3c75 93d8108 46b0409 e0c3c75 f929afb 46b0409 e0c3c75 46b0409 cf8b7c4 8ae24a2 cf8b7c4 93d8108 cf8b7c4 e0c3c75 cf8b7c4 e0c3c75 93d8108 6a48762 93d8108 e0c3c75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import warnings
import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from PIL import Image
from tqdm import trange
from AgentBase import AgentBase
class DPAgent(AgentBase):
    """Dynamic-programming (value iteration) agent for tabular Gymnasium envs.

    Relies on the environment exposing its transition model as ``env.P``
    (as the toy-text envs FrozenLake / CliffWalking do).  ``theta`` plays a
    double role: a value below 1 is the convergence tolerance on the Bellman
    residual, a value above 1 is a cap on the number of sweeps.
    """

    def __init__(self, /, **kwargs):
        super().__init__(run_name=self.__class__.__name__, **kwargs)
        # theta < 1: convergence tolerance; theta > 1: max sweep count.
        self.theta = kwargs.get("theta", 1e-10)
        # State-value table V(s) and policy table Pi(s, a).
        self.V = np.zeros(self.env.observation_space.n)
        # FIX: np.zeros takes the shape as ONE tuple; the original passed
        # two positional arguments, so the action count was interpreted as
        # a dtype and np.zeros raised TypeError.
        self.Pi = np.zeros(
            (self.env.observation_space.n, self.env.action_space.n)
        )
        if self.gamma >= 1.0:
            warnings.warn(
                "DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning
            )

    def policy(self, state):
        """Return the (one-hot) action distribution for ``state``."""
        return self.Pi[state]

    def _q_values(self, state):
        """One-step Bellman backup: expected return of each action in ``state``."""
        Q = np.zeros(self.env.action_space.n)
        for action in range(self.env.action_space.n):
            expected_value = 0.0
            for probability, next_state, reward, done in self.env.P[state][action]:
                # Reward shaping carried over from the original: CliffWalking's
                # last state gets a +1 reward.
                if (
                    self.env_name == "CliffWalking-v0"
                    and state == self.env.observation_space.n - 1
                ):
                    reward = 1
                expected_value += probability * (
                    reward + self.gamma * self.V[next_state]
                )
            Q[action] = expected_value
        return Q

    def train(self, *args, **kwargs):
        """Run value iteration until convergence (or until the sweep cap).

        Returns:
            list: greedy-policy success rate measured after every sweep.
        """
        success_rate = []
        i = 0
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                # Greedy one-step lookahead; V is updated in place
                # (Gauss-Seidel-style sweep), delta tracks the residual
                # against the pre-sweep values.
                self.V[state] = np.max(self._q_values(state))
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            self.make_pi()
            success_rate.append(self.test(verbose=False, greedy=True))
            if delta < self.theta and self.theta < 1:
                print(f"breaking at {delta}, {self.theta}")
                break
            elif i > self.theta and self.theta > 1:
                print(f"breaking at {i}, {self.theta}")
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")
        return success_rate

    def write_v(self, i, shape=(12, 4)):
        """Save the value function as a grayscale heat-map ``imgs/{i}.png``.

        Args:
            i: image index used in the output filename.
            shape: (rows, cols) grid layout of the state space
                (default kept at the original hard-coded (12, 4)).
        """
        v_cop = np.copy(self.V).reshape(shape)
        # Normalize to [0, 1]; guard against a constant V (0/0 -> NaN).
        v_cop -= np.min(v_cop)
        v_max = np.max(v_cop)
        if v_max > 0:
            v_cop /= v_max
        img = Image.fromarray(np.uint8(v_cop * 255), "L")
        # FIX: PIL sizes are (width, height); width comes from the COLUMN
        # count (shape[1]), height from the row count (shape[0]) — the
        # original had them swapped.
        img = img.resize(
            (v_cop.shape[1] * 100, v_cop.shape[0] * 100),
            resample=Image.Resampling.NEAREST,
        )
        img.save(f"imgs/{i}.png")

    def make_pi(self):
        """Derive a greedy one-hot policy table from the current V."""
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        # Q-table from one Bellman backup per state, then one-hot argmax.
        q = np.array([self._q_values(s) for s in range(n_states)])
        self.Pi = np.zeros((n_states, n_actions))
        self.Pi[np.arange(n_states), np.argmax(q, axis=1)] = 1
if __name__ == "__main__":
    # Build a deterministic 8x8 FrozenLake with a fixed random map and
    # solve it with value iteration.
    env = gym.make(
        "FrozenLake-v1",
        render_mode="ansi",
        desc=generate_random_map(8, seed=24),
        is_slippery=False,
    )
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
    # Swap in the custom-map env and resize the agent's tables to match.
    dp.env = env
    dp.env_name = "FrozenLake-v1"
    dp.V = np.zeros(dp.env.observation_space.n)
    # FIX: np.zeros takes the shape as ONE tuple; the original passed two
    # positional args, making the action count an invalid dtype (TypeError).
    dp.Pi = np.zeros((dp.env.observation_space.n, dp.env.action_space.n))
    dp.n_states, dp.n_actions = (
        dp.env.observation_space.n,
        dp.env.action_space.n,
    )
    dp.train()
    print(dp.test())

    # Roll out the learned greedy policy once, rendering each step.
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.choose_action(dp.Pi, state)
        # Note: the 4th element (truncated) is deliberately ignored.
        state, reward, done, _, _ = env.step(action)
        print(env.render())
    plt.savefig(f"imgs/{0}.png")
|