removed reward modification
DPAgent.py +4 -3
DPAgent.py
CHANGED
@@ -16,6 +16,8 @@ class DPAgent(Shared):
         self.Pi = np.zeros(self.env.observation_space.n, self.env.action_space.n)
         if self.gamma >= 1.0:
             warnings.warn("DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning)
+        print(self.env)
+        exit(1)
 
     def policy(self, state):
         return self.Pi[state]
@@ -32,7 +34,7 @@ class DPAgent(Shared):
             for action in range(self.env.action_space.n):
                 expected_value = 0
                 for probability, next_state, reward, done in self.env.P[state][action]:
-                    if state == self.env.observation_space.n-1: reward = 1
+                    # if state == self.env.observation_space.n-1: reward = 1
                     expected_value += probability * (reward + self.gamma * self.V[next_state])
                 Q[action] = expected_value
             action, value = np.argmax(Q), np.max(Q)
@@ -60,14 +62,13 @@ class DPAgent(Shared):
         print(idxs)
         self.Pi = np.zeros((self.env.observation_space.n,self.env.action_space.n))
         self.Pi[np.arange(self.env.observation_space.n),idxs] = 1
-
         # print(self.Pi)
         # return self.V, self.Pi
 
 
 if __name__ == "__main__":
     # env = gym.make('FrozenLake-v1', render_mode='human')
-    dp = DPAgent(env_name="FrozenLake-v1")
+    dp = DPAgent(env_name="FrozenLake-v1", gamma=0.99)
     dp.train()
     dp.save_policy('dp_policy.npy')
     env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=[