import warnings

import gymnasium as gym
import numpy as np
from matplotlib import pyplot as plt

from AgentBase import AgentBase


class DPAgent(AgentBase):
    """Dynamic-programming (value iteration) agent for tabular gymnasium environments."""

    def __init__(self, /, **kwargs):
        super().__init__(run_name=self.__class__.__name__, **kwargs)
        # Convergence threshold: stop sweeping once the largest value update falls below theta.
        self.theta = kwargs.get("theta", 1e-10)
        self.V = np.zeros(self.env.observation_space.n)
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        if self.gamma >= 1.0:
            warnings.warn(
                "DP will never converge with a gamma value >= 1.0. Try 0.99?",
                UserWarning,
            )

    def policy(self, state):
        return self.Pi[state]

    def train(self, *args, **kwargs):
        i = 0
        # Value iteration: sweep every state until the value function converges.
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                # Calculate the action-value for each possible action.
                Q = np.zeros(self.env.action_space.n)
                for action in range(self.env.action_space.n):
                    expected_value = 0
                    for probability, next_state, reward, done in self.env.P[state][action]:
                        # CliffWalking's goal state gives no reward by default; treat it as +1.
                        if (
                            self.env_name == "CliffWalking-v0"
                            and state == self.env.observation_space.n - 1
                        ):
                            reward = 1
                        expected_value += probability * (
                            reward + self.gamma * self.V[next_state]
                        )
                    Q[action] = expected_value
                # Update the state-value function with the greedy action value.
                self.V[state] = np.max(Q)
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            if delta < self.theta:
                break
            i += 1
            # if i % 5 == 0 and i != 0:
            #     self.test(verbose=False)
            print(f"Iteration {i}: delta={delta}")

        # Extract a deterministic greedy policy from the converged value function.
        self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
        for s in range(self.env.observation_space.n):
            for a in range(self.env.action_space.n):
                expected_value = 0
                for probability, next_state, reward, done in self.env.P[s][a]:
                    if (
                        self.env_name == "CliffWalking-v0"
                        and s == self.env.observation_space.n - 1
                    ):
                        reward = 1
                    expected_value += probability * (
                        reward + self.gamma * self.V[next_state]
                    )
                self.Pi[s, a] = expected_value
        # Convert the action-value table into a one-hot greedy policy.
        idxs = np.argmax(self.Pi, axis=1)
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.Pi[np.arange(self.env.observation_space.n), idxs] = 1
        # return self.V, self.Pi


if __name__ == "__main__":
    # env = gym.make('FrozenLake-v1', render_mode='human')
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
    dp.train()
    dp.save_policy("dp_policy.npy")

    env = gym.make(
        "FrozenLake-v1",
        render_mode="human",
        is_slippery=False,
        desc=[
            "SFFFFFFF",
            "FFFFFFFH",
            "FFFHFFFF",
            "FFFFFHFF",
            "FFFHFFFF",
            "FHHFFFHF",
            "FHFFHFHF",
            "FFFHFFFG",
        ],
    )
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.choose_action(state)
        state, reward, done, _, _ = env.step(action)
        env.render()
        # plt.savefig(f"imgs/{0}.png")