lharri73 committed
Commit f929afb
1 Parent(s): b8a5bf6

update DPAgent

Files changed (1)
  1. DPAgent.py +25 -25
DPAgent.py CHANGED
@@ -12,19 +12,10 @@ class DP:
         self.theta = theta
         self.V = np.zeros(self.env.observation_space.n)
         self.epsilon = 0
+        self.Pi = None
 
-    def policy(self, state, return_value=False):
-        Q = np.zeros(self.env.action_space.n)
-        for action in range(self.env.action_space.n):
-            expected_value = 0
-            for probability, next_state, reward, done in self.env.P[state][action]:
-                if state == self.env.observation_space.n-1: reward = 1
-                expected_value += probability * (reward + self.gamma * self.V[next_state])
-            Q[action] = expected_value
-        if return_value:
-            return np.argmax(Q), np.max(Q)
-        else:
-            return Q
+    def policy(self, state):
+        return self.Pi[state]
 
     def train(self):
         i = 0
@@ -32,27 +23,36 @@
             delta = 0
             V_prev = np.copy(self.V)
             for state in range(self.env.observation_space.n):
-                action, value = self.policy(state, return_value=True)
+                # calculate the action-value for each possible action
+                Q = np.zeros(self.env.action_space.n)
+                for action in range(self.env.action_space.n):
+                    expected_value = 0
+                    for probability, next_state, reward, done in self.env.P[state][action]:
+                        if state == self.env.observation_space.n-1: reward = 1
+                        expected_value += probability * (reward + self.gamma * self.V[next_state])
+                    Q[action] = expected_value
+                action, value = np.argmax(Q), np.max(Q)
+
+                # update the state-value function
                 self.V[state] = value
                 delta = max(delta, abs(V_prev[state] - self.V[state]))
             if delta < self.theta:
                 break
             i += 1
+            # self.test()
             print(f"Iteration {i}: delta={delta}")
             # break
 
-        policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
-        return self.V, policy
-
-    def save_policy(self, pth):
-        np.save(pth, self.V)
-
-    def load_policy(self, pth):
-        """
-        not really loading the 'policy', but the state-value function but for
-        interface's sake, here we are.
-        """
-        self.V = np.load(pth)
+        # policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
+        self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
+        for s in range(self.env.observation_space.n):
+            for a in range(self.env.action_space.n):
+                expected_value = 0
+                for probability, next_state, reward, done in self.env.P[s][a]:
+                    # if state == self.env.observation_space.n-1: reward = 1
+                    expected_value += probability * (reward + self.gamma * self.V[next_state])
+                self.Pi[s,a] = expected_value
+        # return self.V, self.Pi
 
     def generate_episode(self, max_steps, render=False, **kwargs):
         state, _ = self.env.reset()
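
After this change, policy(state) returns the whole action-value row self.Pi[state] instead of a single greedy action, so callers pick the action themselves. A minimal usage sketch, under assumptions not shown in this diff (a gymnasium FrozenLake-style environment exposing a P[state][action] transition table, and a DP(env, gamma, theta) constructor):

import gymnasium as gym
import numpy as np
from DPAgent import DP

# FrozenLake exposes the P[state][action] table the agent sweeps over; with
# gymnasium wrappers you may need env.unwrapped so env.P is reachable.
env = gym.make("FrozenLake-v1", is_slippery=True).unwrapped
agent = DP(env, gamma=0.99, theta=1e-8)  # constructor signature is an assumption
agent.train()                            # fills agent.V, then builds agent.Pi

state, _ = env.reset()
terminated = truncated = False
while not (terminated or truncated):
    # policy(state) returns the action-value row Pi[state]; act greedily on it.
    action = int(np.argmax(agent.policy(state)))
    state, reward, terminated, truncated, _ = env.step(action)

Storing Pi as a table of expected values rather than precomputed argmax indices leaves the greedy choice (and tie-breaking) to the caller; the commit also drops the save_policy/load_policy helpers that serialized V.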