lharri73 committed on
Commit
e282b5d
·
1 Parent(s): e3af403

removed reward modification

Browse files
Files changed (1) hide show
  1. DPAgent.py +4 -3
DPAgent.py CHANGED
@@ -16,6 +16,8 @@ class DPAgent(Shared):
16
  self.Pi = np.zeros(self.env.observation_space.n, self.env.action_space.n)
17
  if self.gamma >= 1.0:
18
  warnings.warn("DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning)
 
 
19
 
20
  def policy(self, state):
21
  return self.Pi[state]
@@ -32,7 +34,7 @@ class DPAgent(Shared):
32
  for action in range(self.env.action_space.n):
33
  expected_value = 0
34
  for probability, next_state, reward, done in self.env.P[state][action]:
35
- if state == self.env.observation_space.n-1: reward = 1
36
  expected_value += probability * (reward + self.gamma * self.V[next_state])
37
  Q[action] = expected_value
38
  action, value = np.argmax(Q), np.max(Q)
@@ -60,14 +62,13 @@ class DPAgent(Shared):
60
  print(idxs)
61
  self.Pi = np.zeros((self.env.observation_space.n,self.env.action_space.n))
62
  self.Pi[np.arange(self.env.observation_space.n),idxs] = 1
63
-
64
  # print(self.Pi)
65
  # return self.V, self.Pi
66
 
67
 
68
  if __name__ == "__main__":
69
  # env = gym.make('FrozenLake-v1', render_mode='human')
70
- dp = DPAgent(env_name="FrozenLake-v1")
71
  dp.train()
72
  dp.save_policy('dp_policy.npy')
73
  env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=[
 
16
  self.Pi = np.zeros(self.env.observation_space.n, self.env.action_space.n)
17
  if self.gamma >= 1.0:
18
  warnings.warn("DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning)
19
+ print(self.env)
20
+ exit(1)
21
 
22
  def policy(self, state):
23
  return self.Pi[state]
 
34
  for action in range(self.env.action_space.n):
35
  expected_value = 0
36
  for probability, next_state, reward, done in self.env.P[state][action]:
37
+ # if state == self.env.observation_space.n-1: reward = 1
38
  expected_value += probability * (reward + self.gamma * self.V[next_state])
39
  Q[action] = expected_value
40
  action, value = np.argmax(Q), np.max(Q)
 
62
  print(idxs)
63
  self.Pi = np.zeros((self.env.observation_space.n,self.env.action_space.n))
64
  self.Pi[np.arange(self.env.observation_space.n),idxs] = 1
 
65
  # print(self.Pi)
66
  # return self.V, self.Pi
67
 
68
 
69
  if __name__ == "__main__":
70
  # env = gym.make('FrozenLake-v1', render_mode='human')
71
+ dp = DPAgent(env_name="FrozenLake-v1", gamma=0.99)
72
  dp.train()
73
  dp.save_policy('dp_policy.npy')
74
  env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=[