import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from tqdm import trange
from Shared import Shared
import warnings
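

# DPAgent solves FrozenLake with tabular value iteration: it uses the
# environment's transition model env.P[state][action] (lists of
# (probability, next_state, reward, done) tuples) to apply the Bellman
# optimality backup  V(s) <- max_a sum_s' P(s'|s,a) * (r + gamma * V(s'))
# until the largest per-sweep change falls below `theta`, then extracts the
# greedy policy. `Shared` is assumed to be a project-local base class that
# builds self.env and provides self.gamma, choose_action() and save_policy().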
class DPAgent(Shared):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.theta = kwargs.get('theta', 1e-10)  # convergence threshold for value iteration
        self.V = np.zeros(self.env.observation_space.n)
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        if self.gamma >= 1.0:
            warnings.warn("Value iteration may never converge with gamma >= 1.0. Try 0.99?", UserWarning)
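
    # train() fills Pi with a deterministic (one-hot) policy; policy() simply
    # looks up the action distribution for a given state.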
    def policy(self, state):
        return self.Pi[state]

    def train(self, *args, **kwargs):
        i = 0
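        # Value iteration: sweep over all states repeatedly, replacing V(s)
        # with the best one-step lookahead value, until the largest change
        # (delta) in a full sweep drops below the threshold theta.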
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                # calculate the action-value for each possible action
                Q = np.zeros(self.env.action_space.n)
                for action in range(self.env.action_space.n):
                    expected_value = 0
                    for probability, next_state, reward, done in self.env.P[state][action]:
                        expected_value += probability * (reward + self.gamma * self.V[next_state])
                    Q[action] = expected_value
                # update the state-value function with the best action's value
                self.V[state] = np.max(Q)
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            if delta < self.theta:
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")
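
        # Extract the greedy policy: recompute the one-step lookahead values
        # Q(s, a) under the converged V and one-hot encode the argmax action
        # for every state.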
        Q_sa = np.empty((self.env.observation_space.n, self.env.action_space.n))
        for s in range(self.env.observation_space.n):
            for a in range(self.env.action_space.n):
                expected_value = 0
                for probability, next_state, reward, done in self.env.P[s][a]:
                    expected_value += probability * (reward + self.gamma * self.V[next_state])
                Q_sa[s, a] = expected_value
        idxs = np.argmax(Q_sa, axis=1)
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.Pi[np.arange(self.env.observation_space.n), idxs] = 1
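

# A minimal sketch (not part of the original file) of how the learned policy
# could be sanity-checked: roll it out for a number of episodes on the agent's
# own environment and report the success rate. It assumes, as elsewhere in
# this file, that the Shared base class exposes `self.env` (a Gymnasium env)
# and `choose_action(state)`; adjust if the real interface differs.
def evaluate_policy(agent, episodes=100):
    successes = 0
    for _ in range(episodes):
        state, _ = agent.env.reset()
        done = False
        while not done:
            action = agent.choose_action(state)
            state, reward, terminated, truncated, _ = agent.env.step(action)
            done = terminated or truncated
        # FrozenLake only returns reward 1.0 when the goal state is reached
        successes += int(reward > 0)
    return successes / episodes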


if __name__ == "__main__":
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
    dp.train()
    dp.save_policy('dp_policy.npy')
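    # NOTE: the rendering environment below uses a fixed 8x8 map with
    # is_slippery=False; this assumes the Shared base class built the training
    # environment with the same layout, otherwise the learned policy's state
    # indices will not match this map.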
    env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=[
        "SFFFFFFF",
        "FFFFFFFH",
        "FFFHFFFF",
        "FFFFFHFF",
        "FFFHFFFF",
        "FHHFFFHF",
        "FHFFHFHF",
        "FFFHFFFG",
    ])
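    # Roll out the greedy policy once. Gymnasium's step() returns separate
    # terminated and truncated flags, so combine them to decide when to stop.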
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.choose_action(state)
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        env.render()