update DPAgent
DPAgent.py CHANGED (+25 -25)
@@ -12,19 +12,10 @@ class DP:
         self.theta = theta
         self.V = np.zeros(self.env.observation_space.n)
         self.epsilon = 0
+        self.Pi = None

-    def policy(self, state, return_value=False):
-        Q = np.zeros(self.env.action_space.n)
-        for action in range(self.env.action_space.n):
-            expected_value = 0
-            for probability, next_state, reward, done in self.env.P[state][action]:
-                if state == self.env.observation_space.n-1: reward = 1
-                expected_value += probability * (reward + self.gamma * self.V[next_state])
-            Q[action] = expected_value
-        if return_value:
-            return np.argmax(Q), np.max(Q)
-        else:
-            return Q
+    def policy(self, state):
+        return self.Pi[state]

     def train(self):
         i = 0
@@ -32,27 +23,36 @@
             delta = 0
             V_prev = np.copy(self.V)
             for state in range(self.env.observation_space.n):
-                action, value = self.policy(state, return_value=True)
+                # calculate the action-value for each possible action
+                Q = np.zeros(self.env.action_space.n)
+                for action in range(self.env.action_space.n):
+                    expected_value = 0
+                    for probability, next_state, reward, done in self.env.P[state][action]:
+                        if state == self.env.observation_space.n-1: reward = 1
+                        expected_value += probability * (reward + self.gamma * self.V[next_state])
+                    Q[action] = expected_value
+                action, value = np.argmax(Q), np.max(Q)
+
+                # update the state-value function
                 self.V[state] = value
                 delta = max(delta, abs(V_prev[state] - self.V[state]))
             if delta < self.theta:
                 break
             i += 1
+            # self.test()
             print(f"Iteration {i}: delta={delta}")
             # break

-        policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
-
-
-
-
-
-
-
-
-
-        """
-        self.V = np.load(pth)
+        # policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
+        self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
+        for s in range(self.env.observation_space.n):
+            for a in range(self.env.action_space.n):
+                expected_value = 0
+                for probability, next_state, reward, done in self.env.P[s][a]:
+                    # if state == self.env.observation_space.n-1: reward = 1
+                    expected_value += probability * (reward + self.gamma * self.V[next_state])
+                self.Pi[s,a] = expected_value
+        # return self.V, self.Pi

     def generate_episode(self, max_steps, render=False, **kwargs):
         state, _ = self.env.reset()
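
For orientation, here is a minimal usage sketch of the class after this commit. The constructor arguments are not visible in this diff, so DP(env, gamma=0.99, theta=1e-8) is an assumption, as is the choice of FrozenLake-v1; all the diff actually requires of the environment is a discrete observation/action space, a P[state][action] transition table, and a Gymnasium-style reset().

# Hypothetical usage -- constructor signature and environment choice are assumptions, not shown in this diff.
import gymnasium as gym
import numpy as np

from DPAgent import DP

env = gym.make("FrozenLake-v1", is_slippery=True).unwrapped  # .unwrapped exposes env.P
agent = DP(env, gamma=0.99, theta=1e-8)                      # assumed signature
agent.train()                                                # sweeps all states until delta < theta, then fills self.Pi

state, _ = env.reset()
action = int(np.argmax(agent.policy(state)))                 # policy(state) returns the row self.Pi[state]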
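
One behavioural detail worth flagging for callers: the removed policy(state, return_value=...) either returned the full Q-vector or an (action, value) pair, whereas the new policy(state) simply indexes the self.Pi table built at the end of train(), so greedy action selection now happens at the call site. A sketch of the adjustment, assuming a trained DP instance named agent and an integer state (names used here for illustration only):

import numpy as np

# before this commit: action, value = agent.policy(state, return_value=True)
q_row = agent.policy(state)      # per-action expected returns for `state`, i.e. self.Pi[state]
action = int(np.argmax(q_row))   # greedy action (argmax breaks ties toward the lowest index)
value = float(np.max(q_row))     # the matching value estimate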