import warnings

import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from PIL import Image

from AgentBase import AgentBase


class DPAgent(AgentBase):
    def __init__(self, **kwargs):
        super().__init__(run_name=self.__class__.__name__, **kwargs)
        # theta doubles as a stopping criterion: values < 1 are treated as a
        # convergence threshold on delta, values > 1 as a sweep budget (see train()).
        self.theta = kwargs.get("theta", 1e-10)
        self.V = np.zeros(self.env.observation_space.n)
        # np.zeros expects the shape as a single tuple; two positional
        # arguments would be read as (shape, dtype) and raise a TypeError.
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        if self.gamma >= 1.0:
            warnings.warn(
                "Value iteration is not guaranteed to converge with gamma >= 1.0. Try 0.99?",
                UserWarning,
            )

    def policy(self, state):
        return self.Pi[state]

    def train(self, *args, **kwargs):
        """Value iteration: sweep every state, backing up the best one-step
        action-value until the largest change (delta) drops below theta."""
        success_rate = []
        i = 0
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                # Calculate the action-value for each possible action with a
                # one-step Bellman backup over the known transition model.
                Q = np.zeros(self.env.action_space.n)
                for action in range(self.env.action_space.n):
                    expected_value = 0
                    # P lives on the base env; gym.make returns a wrapped env.
                    for probability, next_state, reward, done in self.env.unwrapped.P[
                        state
                    ][action]:
                        # CliffWalking gives no positive reward at the goal
                        # (state n - 1); grant +1 so values propagate back.
                        if (
                            self.env_name == "CliffWalking-v0"
                            and state == self.env.observation_space.n - 1
                        ):
                            reward = 1
                        expected_value += probability * (
                            reward + self.gamma * self.V[next_state]
                        )
                    Q[action] = expected_value
                # Greedy backup: the state value becomes the best action-value.
                self.V[state] = np.max(Q)
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            self.make_pi()
            suc = self.test(verbose=False, greedy=True)
            success_rate.append(suc)
            if delta < self.theta and self.theta < 1:
                print(f"breaking at {delta}, {self.theta}")
                break
            elif i > self.theta and self.theta > 1:
                print(f"breaking at {i}, {self.theta}")
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")
        # self.write_v(0)
        return success_rate

    def write_v(self, i):
        """Save the value function as a grayscale image (hardcoded to
        CliffWalking's 4x12 grid)."""
        v_cop = np.copy(self.V).reshape((4, 12))
        # Normalise to [0, 1] before converting to 8-bit grayscale.
        v_cop -= np.min(v_cop)
        v_cop /= np.max(v_cop)
        img = Image.fromarray(np.uint8(v_cop * 255), "L")
        # PIL's resize takes (width, height), i.e. (columns, rows).
        img = img.resize(
            (v_cop.shape[1] * 100, v_cop.shape[0] * 100),
            resample=Image.Resampling.NEAREST,
        )
        img.save(f"imgs/{i}.png")

    def make_pi(self):
        """Derive a deterministic greedy policy (one-hot rows) from V."""
        self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
        for s in range(self.env.observation_space.n):
            for a in range(self.env.action_space.n):
                expected_value = 0
                for probability, next_state, reward, done in self.env.unwrapped.P[s][a]:
                    # Same CliffWalking goal-reward hack as in train().
                    if (
                        self.env_name == "CliffWalking-v0"
                        and s == self.env.observation_space.n - 1
                    ):
                        reward = 1
                    expected_value += probability * (
                        reward + self.gamma * self.V[next_state]
                    )
                self.Pi[s, a] = expected_value
        # One-hot encode the argmax action for each state.
        idxs = np.argmax(self.Pi, axis=1)
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.Pi[np.arange(self.env.observation_space.n), idxs] = 1


if __name__ == "__main__":
    env = gym.make(
        "FrozenLake-v1",
        render_mode="ansi",
        desc=generate_random_map(8, seed=24),
        is_slippery=False,
    )
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
    # Swap in the custom-map env and resize the agent's tables to match.
    dp.env = env
    dp.env_name = "FrozenLake-v1"
    dp.V = np.zeros(dp.env.observation_space.n)
    dp.Pi = np.zeros((dp.env.observation_space.n, dp.env.action_space.n))
    dp.n_states, dp.n_actions = (
        dp.env.observation_space.n,
        dp.env.action_space.n,
    )
    success_rate = dp.train()
    print(dp.test())

    # Roll out the greedy policy once, rendering each step.
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.choose_action(dp.Pi, state)
        # Gymnasium returns separate terminated/truncated flags.
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        s = env.render()
        print(s)

    # Plot the per-sweep success rate collected during training.
    plt.plot(success_rate)
    plt.savefig(f"imgs/{0}.png")
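
# --- Usage sketch: CliffWalking-v0 (assumptions flagged) ---
# The goal-reward hack in train()/make_pi() targets CliffWalking's last state,
# so the same agent should solve that env unchanged. This sketch assumes
# AgentBase accepts the same "env"/"gamma"/"theta" kwargs used in __main__
# above; it is left commented out so the FrozenLake demo stays this file's
# single entry point.
#
# cw = DPAgent(env="CliffWalking-v0", gamma=0.99, theta=1e-10)
# cw_success = cw.train()
# print(cw.test())  # greedy success rate on CliffWalking
# cw.write_v(0)     # value-function image; expects an imgs/ directory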