Andrei Cozma committed
Commit · b11da78
1 Parent(s): 0a58f79
Updates
MCAgent.py +9 -11
MCAgent.py
CHANGED
@@ -34,34 +34,32 @@ class MCAgent(AgentBase):
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
-            #
+            # Updating the expected return
             G = self.gamma * G + reward
-            #
-            #
+            # First-visit MC method:
+            # Only update if we have not visited this state-action pair before
             if (state, action) not in [(x[0], x[1]) for x in episode_hist[:t]]:
                 self.R[state][action].append(G)
                 self.Q[state, action] = np.mean(self.R[state][action])
-                #
+                # Updating the epsilon-greedy policy.
                 self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-                # the greedy action is the one with the highest Q-value
                 self.Pi[state, np.argmax(self.Q[state])] = (
                     1 - self.epsilon + self.epsilon / self.n_actions
                 )
 
     def update_every_visit(self, episode_hist):
         G = 0
-        #
+        # Backward pass through the trajectory
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
-            #
+            # Updating the expected return
             G = self.gamma * G + reward
-            #
-            #
+            # Every-visit MC method:
+            # Update the expected return for every visit to this state-action pair
             self.R[state][action].append(G)
             self.Q[state, action] = np.mean(self.R[state][action])
-            #
+            # Updating the epsilon-greedy policy.
             self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-            # the greedy action is the one with the highest Q-value
             self.Pi[state, np.argmax(self.Q[state])] = (
                 1 - self.epsilon + self.epsilon / self.n_actions
             )
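
For context, a minimal standalone sketch of the same first-visit update outside the MCAgent class. The helper name first_visit_update and the container shapes (Q, R, Pi, n_states, n_actions) are assumptions made for illustration, since AgentBase and the rest of MCAgent.py are not shown in this diff; only the update logic mirrors the hunk above.

import numpy as np

def first_visit_update(episode_hist, Q, R, Pi, gamma, epsilon):
    """One Monte Carlo control update from a finished episode.

    episode_hist: list of (state, action, reward) tuples, in time order.
    Q:  (n_states, n_actions) array of action-value estimates.
    R:  nested lists of observed returns per (state, action) pair.
    Pi: (n_states, n_actions) array holding the epsilon-soft policy.
    """
    n_actions = Q.shape[1]
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return
    for t in range(len(episode_hist) - 1, -1, -1):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # First-visit check: skip (state, action) pairs seen earlier in the episode
        if (state, action) not in [(x[0], x[1]) for x in episode_hist[:t]]:
            R[state][action].append(G)
            Q[state, action] = np.mean(R[state][action])
            # Epsilon-greedy policy improvement for this state
            Pi[state] = np.full(n_actions, epsilon / n_actions)
            Pi[state, np.argmax(Q[state])] = 1 - epsilon + epsilon / n_actions

# Toy usage: 3 states, 2 actions, a single hand-written episode
n_states, n_actions = 3, 2
Q = np.zeros((n_states, n_actions))
R = [[[] for _ in range(n_actions)] for _ in range(n_states)]
Pi = np.full((n_states, n_actions), 1.0 / n_actions)
episode = [(0, 1, 0.0), (1, 0, 0.0), (2, 1, 1.0)]
first_visit_update(episode, Q, R, Pi, gamma=0.9, epsilon=0.1)
print(Q)   # the last pair gets return 1.0; earlier pairs get discounted returns
print(Pi)  # each updated state now puts 1 - eps + eps/2 on its greedy action

The every-visit variant in the diff differs only in dropping the membership check: every occurrence of a (state, action) pair appends its return G before re-averaging Q.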