Andrei Cozma committed on
Commit 35d83a8 · 1 Parent(s): b11da78
Files changed (1)
  1. MCAgent.py +13 -6
MCAgent.py CHANGED
@@ -7,17 +7,20 @@ from AgentBase import AgentBase
 class MCAgent(AgentBase):
     def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
-        self.reset()
+        self.initialize()

-    def reset(self):
+    def initialize(self):
         print("Resetting all state variables...")
+        # The Q-Table holds the current expected return for each state-action pair
         self.Q = np.zeros((self.n_states, self.n_actions))
+        # R keeps track of all the returns that have been observed for each state-action pair to update Q
         self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
-
-        # An arbitrary e-greedy policy
+        # An arbitrary e-greedy policy:
+        # With probability epsilon, sample an action uniformly at random
         self.Pi = np.full(
             (self.n_states, self.n_actions), self.epsilon / self.n_actions
         )
+        # The greedy action receives the remaining probability mass
         self.Pi[
             np.arange(self.n_states),
             np.random.randint(self.n_actions, size=self.n_states),
@@ -37,12 +40,14 @@ class MCAgent(AgentBase):
             # Updating the expected return
             G = self.gamma * G + reward
             # First-visit MC method:
-            # Only update if we have not visited this state-action pair before
+            # Updating the expected return and policy only if this is the first visit to this state-action pair
             if (state, action) not in [(x[0], x[1]) for x in episode_hist[:t]]:
                 self.R[state][action].append(G)
                 self.Q[state, action] = np.mean(self.R[state][action])
                 # Updating the epsilon-greedy policy.
+                # With probability epsilon, sample an action uniformly at random
                 self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+                # The greedy action receives the remaining probability mass
                 self.Pi[state, np.argmax(self.Q[state])] = (
                     1 - self.epsilon + self.epsilon / self.n_actions
                 )
@@ -55,11 +60,13 @@ class MCAgent(AgentBase):
             # Updating the expected return
             G = self.gamma * G + reward
             # Every-visit MC method:
-            # Update the expected return for every visit to this state-action pair
+            # Updating the expected return and policy for every visit to this state-action pair
             self.R[state][action].append(G)
             self.Q[state, action] = np.mean(self.R[state][action])
             # Updating the epsilon-greedy policy.
+            # With probability epsilon, sample an action uniformly at random
             self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+            # The greedy action receives the remaining probability mass
             self.Pi[state, np.argmax(self.Q[state])] = (
                 1 - self.epsilon + self.epsilon / self.n_actions
             )
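
For reference, a minimal, self-contained sketch of the two ideas the new comments describe: the epsilon-greedy policy table built in initialize(), and the first-visit vs. every-visit return accumulation. The concrete values (n_states, n_actions, epsilon, gamma, the toy episode_hist) and the reverse-order loop are assumptions for illustration only; the AgentBase plumbing and the actual update-method signatures are not taken from this commit.

import numpy as np

# Illustrative values; the real agent gets these from AgentBase / kwargs.
n_states, n_actions = 4, 2
epsilon, gamma = 0.1, 0.9

# Epsilon-greedy policy table, as in initialize(): every action gets
# epsilon / n_actions, and the (here randomly chosen) greedy action
# additionally receives the remaining 1 - epsilon probability mass.
Pi = np.full((n_states, n_actions), epsilon / n_actions)
greedy = np.random.randint(n_actions, size=n_states)
Pi[np.arange(n_states), greedy] = 1 - epsilon + epsilon / n_actions
assert np.allclose(Pi.sum(axis=1), 1.0)  # each row is a valid distribution

# First-visit return accumulation on a toy episode of (state, action, reward)
# tuples, assuming the episode is walked backwards so that
# G = gamma * G + reward is the discounted return from step t onward.
episode_hist = [(0, 1, 0.0), (2, 0, 0.0), (0, 1, 1.0)]
Q = np.zeros((n_states, n_actions))
R = [[[] for _ in range(n_actions)] for _ in range(n_states)]

G = 0.0
for t in reversed(range(len(episode_hist))):
    state, action, reward = episode_hist[t]
    G = gamma * G + reward
    # First-visit: only record G if (state, action) does not occur earlier.
    # Dropping this guard gives the every-visit variant.
    if (state, action) not in [(x[0], x[1]) for x in episode_hist[:t]]:
        R[state][action].append(G)
        Q[state, action] = np.mean(R[state][action])

print(Q[0, 1])  # 0.81: only the earliest occurrence of (0, 1) contributes

Under the every-visit variant, both occurrences of (0, 1) would contribute returns (1.0 and 0.81) and Q[0, 1] would be their mean instead.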