Andrei Cozma committed
Commit 4a6d8ec · 1 Parent(s): 6a48762
Files changed (2)
  1. AgentBase.py +0 -1
  2. MCAgent.py +20 -10
AgentBase.py CHANGED
@@ -93,7 +93,6 @@ class AgentBase:
 
     def generate_episode(self, policy, max_steps=500, render=False, **kwargs):
        state, _ = self.env.reset()
-       # action = self.choose_action(state, **kwargs)
        episode_hist, solved, done = [], False, False
        rgb_array = self.env.render() if render else None
 
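The removed line was already commented out; generate_episode receives the policy to follow as an argument. The rest of the method is not shown in this diff, but a per-step rollout against a tabular stochastic policy typically looks like the sketch below (a minimal, hypothetical sketch assuming a Gymnasium-style env and a policy array of shape (n_states, n_actions); it is not the repository's actual implementation):

import numpy as np

def rollout(env, policy, max_steps=500):
    # Sample a trajectory by drawing each action from the row of `policy`
    # for the current state (hypothetical helper, not from the repo)
    state, _ = env.reset()
    episode_hist = []
    for _ in range(max_steps):
        action = np.random.choice(len(policy[state]), p=policy[state])
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode_hist.append((state, action, reward))
        state = next_state
        if terminated or truncated:
            break
    return episode_hist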
MCAgent.py CHANGED
@@ -16,8 +16,12 @@ class MCAgent(AgentBase):
     def initialize(self):
        print("Resetting all state variables...")
        # The Q-Table holds the current expected return for each state-action pair
-       self.Q = np.random.rand(self.n_states, self.n_actions)
+       # random uniform initialization
+       self.Q = np.random.uniform(-1, 1, size=(self.n_states, self.n_actions))
+       # other alternatives:
        # self.Q = np.zeros((self.n_states, self.n_actions))
+       # self.Q = np.random.rand(self.n_states, self.n_actions)
+       # self.Q = np.random.normal(0, 1, size=(self.n_states, self.n_actions))
 
        if self.update_type.startswith("on_policy"):
            # For On-Policy update type:
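A note on the initialization choices listed above: the greedy policies built below use np.argmax over rows of Q, and argmax breaks ties by returning the first maximal index. An all-zeros Q would therefore make action 0 the initial greedy action in every state, while a random initialization spreads the initial greedy actions. A quick illustration (assuming only NumPy; variable names are illustrative):

import numpy as np

n_states, n_actions = 4, 3
Q_zeros = np.zeros((n_states, n_actions))
Q_rand = np.random.uniform(-1, 1, size=(n_states, n_actions))

# With all-zero values, ties go to the first action in every state
print(np.argmax(Q_zeros, axis=1))  # [0 0 0 0]
# With random values, the initial greedy actions differ per state and per run
print(np.argmax(Q_rand, axis=1))   # e.g. [2 0 1 1]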
@@ -28,11 +32,11 @@ class MCAgent(AgentBase):
        elif self.update_type.startswith("off_policy"):
            # For Off-Policy update type:
            self.C = np.zeros((self.n_states, self.n_actions))
-           # Target policy is greedy with respect to the current Q
+           # Target policy is greedy with respect to the current Q (ties broken consistently)
            self.Pi = np.zeros((self.n_states, self.n_actions))
            self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
            # Behavior policy is e-greedy with respect to the current Q
-           self.Pi_behaviour = self.create_soft_policy(random=False)
+           self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
        else:
            raise ValueError(
                f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
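The off-policy branch keeps two policies: Pi is a deterministic target policy, one-hot on each state's greedy action, and Pi_behaviour is a soft policy that must give nonzero probability to every action the target can take (the coverage assumption behind importance sampling). A minimal sketch of that relationship with standalone arrays instead of the class attributes (names here are illustrative):

import numpy as np

n_states, n_actions, epsilon = 4, 3, 0.1
Q = np.random.uniform(-1, 1, size=(n_states, n_actions))

# Deterministic target policy: one-hot on the greedy action of each state
Pi = np.zeros((n_states, n_actions))
Pi[np.arange(n_states), np.argmax(Q, axis=1)] = 1.0

# Soft behavior policy: epsilon-greedy around the same greedy actions
Pi_b = np.full((n_states, n_actions), epsilon / n_actions)
Pi_b[np.arange(n_states), np.argmax(Q, axis=1)] = 1.0 - epsilon + epsilon / n_actions

# Coverage: the behavior policy is nonzero wherever the target puts probability
assert np.all(Pi_b[Pi > 0] > 0)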
@@ -42,16 +46,22 @@ class MCAgent(AgentBase):
        print(self.Pi)
        print("=" * 80)
 
-    def create_soft_policy(self, random=True):
-       # An arbitrary e-greedy policy:
+    def create_soft_policy(self, coverage_policy=None):
+       """
+       Create a soft (epsilon-greedy) policy.
+       If coverage_policy is None, the greedy action of each state is chosen at random.
+       Otherwise, the policy is e-greedy with respect to the coverage policy (useful for off-policy).
+       """
        # With probability epsilon, sample an action uniformly at random
        Pi = np.full((self.n_states, self.n_actions), self.epsilon / self.n_actions)
-       # For the initial policy, we randomly select a greedy action for each state
+       # The greedy action receives the remaining probability mass
+       # If coverage_policy is not provided, the greedy action is sampled randomly
+       # Otherwise we give the remaining probability mass according to the coverage policy
        Pi[
            np.arange(self.n_states),
            np.random.randint(self.n_actions, size=self.n_states)
-           if random
-           else np.argmax(self.Q, axis=1),
+           if coverage_policy is None
+           else np.argmax(coverage_policy, axis=1),
        ] = (
            1.0 - self.epsilon + self.epsilon / self.n_actions
        )
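In create_soft_policy, every action first receives the exploration mass epsilon / n_actions, and the selected greedy action then takes the remainder 1.0 - epsilon + epsilon / n_actions, so each row sums to 1. A standalone version of the same construction (a sketch assuming NumPy; make_soft_policy is a hypothetical free function mirroring the method):

import numpy as np

def make_soft_policy(n_states, n_actions, epsilon, coverage_policy=None):
    # Every action gets the exploration mass epsilon / n_actions
    Pi = np.full((n_states, n_actions), epsilon / n_actions)
    # Greedy index per state: random if no coverage policy is given,
    # otherwise the argmax of the coverage policy's row
    greedy = (
        np.random.randint(n_actions, size=n_states)
        if coverage_policy is None
        else np.argmax(coverage_policy, axis=1)
    )
    Pi[np.arange(n_states), greedy] = 1.0 - epsilon + epsilon / n_actions
    return Pi

Pi_soft = make_soft_policy(n_states=5, n_actions=4, epsilon=0.2)
assert np.allclose(Pi_soft.sum(axis=1), 1.0)  # each row is a valid distribution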
@@ -110,13 +120,13 @@ class MCAgent(AgentBase):
            greedy_action = np.argmax(self.Q[state])
            self.Pi[state] = np.zeros(self.n_actions)
            self.Pi[state, greedy_action] = 1.0
-           # if At != At*, then break
+           # If the greedy action is not the action taken by the behavior policy, then break
            if action != greedy_action:
                break
            W = W * (1.0 / self.Pi_behaviour[state, action])
 
        # Update the behavior policy such that it has coverage of the target policy
-       self.Pi_behaviour = self.create_soft_policy(random=False)
+       self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
 
    def train(
        self,
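This hunk shows only the tail of the off-policy update loop; the variables it touches (W, C, Q, Pi, Pi_behaviour) are the standard pieces of off-policy Monte Carlo control with weighted importance sampling (Sutton and Barto). A hedged sketch of how such an update typically fits together, not necessarily the repository's exact method:

import numpy as np

def off_policy_mc_update(episode_hist, Q, C, Pi, Pi_b, gamma=0.99):
    # episode_hist: list of (state, action, reward) collected under the behavior policy Pi_b
    G, W = 0.0, 1.0
    for state, action, reward in reversed(episode_hist):
        G = gamma * G + reward
        # Weighted importance sampling: accumulate the weight and move Q toward G
        C[state, action] += W
        Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
        # Keep the target policy greedy with respect to the updated Q
        greedy_action = np.argmax(Q[state])
        Pi[state] = np.zeros(Q.shape[1])
        Pi[state, greedy_action] = 1.0
        # If the behavior action is no longer the greedy one, the importance
        # weight for all earlier timesteps would be zero, so stop early
        if action != greedy_action:
            break
        W *= 1.0 / Pi_b[state, action]
    return Q, C, Pi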
 