Andrei Cozma committed
Commit b11da78 · 1 Parent(s): 0a58f79
Files changed (1)
  1. MCAgent.py +9 -11
MCAgent.py CHANGED
@@ -34,34 +34,32 @@ class MCAgent(AgentBase):
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
-            # Update the expected return
+            # Updating the expected return
             G = self.gamma * G + reward
-            # If we haven't already visited this state-action pair up to this point, then we can update the Q-table and policy
-            # This is the first-visit MC method
+            # First-visit MC method:
+            # Only update if we have not visited this state-action pair before
             if (state, action) not in [(x[0], x[1]) for x in episode_hist[:t]]:
                 self.R[state][action].append(G)
                 self.Q[state, action] = np.mean(self.R[state][action])
-                # Epsilon-greedy policy update
+                # Updating the epsilon-greedy policy.
                 self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-                # the greedy action is the one with the highest Q-value
                 self.Pi[state, np.argmax(self.Q[state])] = (
                     1 - self.epsilon + self.epsilon / self.n_actions
                 )
 
     def update_every_visit(self, episode_hist):
         G = 0
-        # For each step of the episode, in reverse order
+        # Backward pass through the trajectory
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
-            # Update the expected return
+            # Updating the expected return
             G = self.gamma * G + reward
-            # We update the Q-table and policy even if we have visited this state-action pair before
-            # This is the every-visit MC method
+            # Every-visit MC method:
+            # Update the expected return for every visit to this state-action pair
             self.R[state][action].append(G)
             self.Q[state, action] = np.mean(self.R[state][action])
-            # Epsilon-greedy policy update
+            # Updating the epsilon-greedy policy.
             self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-            # the greedy action is the one with the highest Q-value
             self.Pi[state, np.argmax(self.Q[state])] = (
                 1 - self.epsilon + self.epsilon / self.n_actions
             )
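
The commit only rewrites comments, so the update logic itself is unchanged. For context, below is a minimal, self-contained sketch of how the first-visit update shown in the hunk fits together. The field names (Q, R, Pi, gamma, epsilon, n_actions) are inferred from the hunk; the constructor, data-structure choices, and toy episode are assumptions for illustration and not the repository's actual MCAgent/AgentBase code.

import numpy as np

# Minimal sketch (assumed shapes, not the repo's AgentBase): a tabular Q over
# (n_states, n_actions), per-pair return lists R, and a stochastic policy Pi.
class FirstVisitMCSketch:
    def __init__(self, n_states, n_actions, gamma=0.99, epsilon=0.1):
        self.n_states, self.n_actions = n_states, n_actions
        self.gamma, self.epsilon = gamma, epsilon
        self.Q = np.zeros((n_states, n_actions))
        self.R = [[[] for _ in range(n_actions)] for _ in range(n_states)]
        # Start from a uniform random policy.
        self.Pi = np.full((n_states, n_actions), 1.0 / n_actions)

    def update_first_visit(self, episode_hist):
        # episode_hist: list of (state, action, reward) tuples, in time order.
        G = 0.0
        for t in range(len(episode_hist) - 1, -1, -1):
            state, action, reward = episode_hist[t]
            G = self.gamma * G + reward
            # First-visit check: skip if this (state, action) occurred earlier.
            if (state, action) not in [(x[0], x[1]) for x in episode_hist[:t]]:
                self.R[state][action].append(G)
                self.Q[state, action] = np.mean(self.R[state][action])
                # Epsilon-greedy improvement: epsilon/n to every action,
                # plus the remaining 1 - epsilon to the current greedy action.
                self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
                self.Pi[state, np.argmax(self.Q[state])] = (
                    1 - self.epsilon + self.epsilon / self.n_actions
                )


agent = FirstVisitMCSketch(n_states=3, n_actions=4)
# Toy trajectory: (state, action, reward) per step, purely illustrative.
agent.update_first_visit([(0, 1, 0.0), (1, 2, 0.0), (2, 0, 1.0)])
print(agent.Q[2])                       # only (2, 0) saw a nonzero return
print(agent.Pi[2], agent.Pi[2].sum())   # e.g. [0.925 0.025 0.025 0.025], sums to 1

The every-visit variant is the same sketch with the membership check removed, so every occurrence of a state-action pair appends its return to R and shifts the running mean in Q. In both cases the policy probabilities for a state sum to 1: with epsilon = 0.1 and n_actions = 4, each action gets 0.025 and the greedy action gets 0.925.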