# CS581-Algos-Demo / DPAgent.py
import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from tqdm import trange
from Shared import Shared
import warnings


class DPAgent(Shared):
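    """Dynamic-programming (value iteration) agent for FrozenLake.

    Value iteration repeatedly applies the Bellman optimality backup

        V(s) <- max_a sum_{s', r} p(s', r | s, a) * (r + gamma * V(s'))

    until the largest per-state change falls below ``theta``, then extracts a
    greedy one-hot policy from the converged value function.
    """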
    def __init__(self, /, **kwargs):
        super().__init__(**kwargs)
        self.theta = kwargs.get('theta', 1e-10)
        self.V = np.zeros(self.env.observation_space.n)
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        if self.gamma >= 1.0:
            warnings.warn("DP may never converge with a gamma value >= 1.0. Try 0.99?", UserWarning)

    def policy(self, state):
        return self.Pi[state]

    def train(self, *args, **kwargs):
        # Phase 1: value iteration. Sweep every state and back up the best
        # one-step return until the value function stops changing.
        i = 0
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                # calculate the action-value for each possible action
                Q = np.zeros(self.env.action_space.n)
                for action in range(self.env.action_space.n):
                    expected_value = 0
                    for probability, next_state, reward, done in self.env.P[state][action]:
                        expected_value += probability * (reward + self.gamma * self.V[next_state])
                    Q[action] = expected_value
                # update the state-value function with the greedy backup
                self.V[state] = np.max(Q)
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            if delta < self.theta:
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")
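        # Phase 2: greedy policy extraction. Recompute the one-step
        # action-values from the converged value function and put all
        # probability on the best action in every state.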
        Q = np.empty((self.env.observation_space.n, self.env.action_space.n))
        for s in range(self.env.observation_space.n):
            for a in range(self.env.action_space.n):
                expected_value = 0
                for probability, next_state, reward, done in self.env.P[s][a]:
                    expected_value += probability * (reward + self.gamma * self.V[next_state])
                Q[s, a] = expected_value
        # One-hot encode the greedy action for each state (ties go to the
        # lowest-numbered action).
        best_actions = np.argmax(Q, axis=1)
        self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.Pi[np.arange(self.env.observation_space.n), best_actions] = 1
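
    def plot_value_function(self):
        # A small visualization sketch (not part of the original agent): show
        # the converged state values as a heatmap. It assumes the FrozenLake
        # map is square, so the side length is sqrt(number of states).
        side = int(np.sqrt(self.env.observation_space.n))
        plt.imshow(self.V.reshape(side, side), cmap='viridis')
        plt.colorbar(label='V(s)')
        plt.title('State-value function')
        plt.show()
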
if __name__ == "__main__":
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
    dp.train()
    dp.save_policy('dp_policy.npy')
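
    # A minimal sketch of reloading the saved policy later without the agent
    # class. It assumes save_policy() wrote the one-hot Pi array with np.save;
    # if Shared stores a different format, adjust the loading code accordingly.
    #
    #     pi = np.load('dp_policy.npy')
    #     action = int(np.argmax(pi[state]))
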
    # Roll the learned policy out on a rendered environment. Note: this map
    # must match the layout the agent was trained on, or the state indices in
    # the policy will not line up with this grid.
    env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=[
        "SFFFFFFF",
        "FFFFFFFH",
        "FFFHFFFF",
        "FFFFFHFF",
        "FFFHFFFF",
        "FHHFFFHF",
        "FHFFHFHF",
        "FFFHFFFG",
    ])
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.choose_action(state)
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        env.render()