Commit 6a48762 committed by Andrei Cozma · 1 Parent(s): 50efa30

Files changed (4):
  1. AgentBase.py +14 -8
  2. DPAgent.py +1 -1
  3. MCAgent.py +83 -36
  4. run.py +3 -4
AgentBase.py CHANGED
@@ -62,22 +62,28 @@ class AgentBase:
         print(f"- n_states: {self.n_states}")
         print(f"- n_actions: {self.n_actions}")
 
-    def choose_action(self, state, greedy=False, **kwargs):
+    def choose_action(self, policy, state, greedy=False, **kwargs):
         """
         Sample an action from the policy.
         Also allows the ability to override the epsilon value (for the purpose of the demo)
         :param state: The current state
+        :param policy: The policy to sample from. Must be of shape (n_states, n_actions)
         :param greedy: If True, always return the greedy action (argmax of the policy at the current state)
         :return: The sampled action
         """
+        assert policy.shape == (self.n_states, self.n_actions), (
+            f"ERROR: Policy must be of shape (n_states, n_actions) = ({self.n_states}, {self.n_actions}). "
+            f"Got {policy.shape}."
+        )
+
         # If greedy is True, always return the greedy action
-        greedy_action = np.argmax(self.Pi[state])
+        greedy_action = np.argmax(policy[state])
         if greedy or self.epsilon_override == 0.0:
             return greedy_action
 
         # Otherwise, sample an action from the soft policy (epsilon-greedy)
         if self.epsilon_override is None:
-            return np.random.choice(self.n_actions, p=self.Pi[state])
+            return np.random.choice(self.n_actions, p=policy[state])
 
         # If we ever want to manually override the epsilon value, it happens here
         return np.random.choice(
@@ -85,7 +91,7 @@ class AgentBase:
             p=[1.0 - self.epsilon_override, self.epsilon_override],
         )
 
-    def generate_episode(self, max_steps=500, render=False, **kwargs):
+    def generate_episode(self, policy, max_steps=500, render=False, **kwargs):
         state, _ = self.env.reset()
         # action = self.choose_action(state, **kwargs)
         episode_hist, solved, done = [], False, False
@@ -97,7 +103,7 @@ class AgentBase:
             # Render the environment if needed
             rgb_array = self.env.render() if render else None
             # Sample the next action from the policy
-            action = self.choose_action(state, **kwargs)
+            action = self.choose_action(policy, state, **kwargs)
             # Keeping track of the trajectory
            episode_hist.append((state, action, None))
            # Take the action and observe the reward and next state
@@ -134,10 +140,10 @@ class AgentBase:
             rgb_array = self.env.render() if render else None
             yield episode_hist, solved, rgb_array
 
-    def run_episode(self, max_steps=500, render=False, **kwargs):
+    def run_episode(self, policy, max_steps=500, render=False, **kwargs):
         # Run the generator until the end
         episode_hist, solved, rgb_array = list(
-            self.generate_episode(max_steps, render, **kwargs)
+            self.generate_episode(policy, max_steps, render, **kwargs)
         )[-1]
         return episode_hist, solved, rgb_array
 
@@ -146,7 +152,7 @@ class AgentBase:
         print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
+            _, solved, _ = self.run_episode(policy=self.Pi, greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"
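The refactor above threads an explicit `policy` array through `choose_action`, `generate_episode`, and `run_episode` instead of reading `self.Pi` implicitly. The following standalone sketch (illustrative only, not part of the repository; the sizes and epsilon value are made up) shows the `(n_states, n_actions)` policy-table convention that the new assert enforces, and how an epsilon-greedy row is sampled:

```python
import numpy as np

n_states, n_actions, epsilon = 16, 4, 0.4  # hypothetical sizes

# Soft (epsilon-greedy) policy table: epsilon/n_actions everywhere,
# with the remaining probability mass on one greedy action per state.
Pi = np.full((n_states, n_actions), epsilon / n_actions)
greedy = np.random.randint(n_actions, size=n_states)
Pi[np.arange(n_states), greedy] = 1.0 - epsilon + epsilon / n_actions

# Each row is a valid probability distribution, as choose_action assumes.
assert Pi.shape == (n_states, n_actions) and np.allclose(Pi.sum(axis=1), 1.0)

state = 3
sampled_action = np.random.choice(n_actions, p=Pi[state])  # what choose_action(policy, state) samples
greedy_action = int(np.argmax(Pi[state]))                  # what greedy=True would return
```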
DPAgent.py CHANGED
@@ -103,7 +103,7 @@ if __name__ == "__main__":
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.choose_action(state)
+        action = dp.choose_action(dp.Pi, state)
         state, reward, done, _, _ = env.step(action)
         env.render()
 
MCAgent.py CHANGED
@@ -5,35 +5,60 @@ from AgentBase import AgentBase
 
 
 class MCAgent(AgentBase):
-    def __init__(self, /, **kwargs):
+    def __init__(
+        self, /, update_type="on-policy", **kwargs  # "on-policy" or "off-policy
+    ):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
+        self.update_type = update_type
+        self.run_name = f"{self.run_name}_{self.update_type}"
         self.initialize()
 
     def initialize(self):
         print("Resetting all state variables...")
         # The Q-Table holds the current expected return for each state-action pair
-        self.Q = np.zeros((self.n_states, self.n_actions))
-        # R keeps track of all the returns that have been observed for each state-action pair to update Q
-        self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
+        self.Q = np.random.rand(self.n_states, self.n_actions)
+        # self.Q = np.zeros((self.n_states, self.n_actions))
+
+        if self.update_type.startswith("on_policy"):
+            # For On-Policy update type:
+            # R keeps track of all the returns that have been observed for each state-action pair to update Q
+            self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
+            # An arbitrary e-greedy policy:
+            self.Pi = self.create_soft_policy()
+        elif self.update_type.startswith("off_policy"):
+            # For Off-Policy update type:
+            self.C = np.zeros((self.n_states, self.n_actions))
+            # Target policy is greedy with respect to the current Q
+            self.Pi = np.zeros((self.n_states, self.n_actions))
+            self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
+            # Behavior policy is e-greedy with respect to the current Q
+            self.Pi_behaviour = self.create_soft_policy(random=False)
+        else:
+            raise ValueError(
+                f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
+            )
+        print("=" * 80)
+        print("Initial policy:")
+        print(self.Pi)
+        print("=" * 80)
+
+    def create_soft_policy(self, random=True):
         # An arbitrary e-greedy policy:
         # With probability epsilon, sample an action uniformly at random
-        self.Pi = np.full(
-            (self.n_states, self.n_actions), self.epsilon / self.n_actions
-        )
+        Pi = np.full((self.n_states, self.n_actions), self.epsilon / self.n_actions)
         # For the initial policy, we randomly select a greedy action for each state
-        self.Pi[
+        Pi[
             np.arange(self.n_states),
-            np.random.randint(self.n_actions, size=self.n_states),
+            np.random.randint(self.n_actions, size=self.n_states)
+            if random
+            else np.argmax(self.Q, axis=1),
         ] = (
-            1 - self.epsilon + self.epsilon / self.n_actions
+            1.0 - self.epsilon + self.epsilon / self.n_actions
         )
-        print("=" * 80)
-        print("Initial policy:")
-        print(self.Pi)
-        print("=" * 80)
+        return Pi
 
-    def update_first_visit(self, episode_hist):
-        G = 0
+    def update_on_policy(self, episode_hist):
+        G = 0.0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
@@ -52,30 +77,51 @@ class MCAgent(AgentBase):
                 1 - self.epsilon + self.epsilon / self.n_actions
             )
 
-    def update_every_visit(self, episode_hist):
-        G = 0
-        # Backward pass through the trajectory
+    # def update_every_visit(self, episode_hist):
+    #     G = 0
+    #     # Backward pass through the trajectory
+    #     for t in range(len(episode_hist) - 1, -1, -1):
+    #         state, action, reward = episode_hist[t]
+    #         # Updating the expected return
+    #         G = self.gamma * G + reward
+    #         # Every-visit MC method:
+    #         # Updating the expected return and policy for every visit to this state-action pair
+    #         self.R[state][action].append(G)
+    #         self.Q[state, action] = np.mean(self.R[state][action])
+    #         # Updating the epsilon-greedy policy.
+    #         # With probability epsilon, sample an action uniformly at random
+    #         self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+    #         # The greedy action receives the remaining probability mass
+    #         self.Pi[state, np.argmax(self.Q[state])] = (
+    #             1 - self.epsilon + self.epsilon / self.n_actions
+    #         )
+
+    def update_off_policy(self, episode_hist):
+        G, W = 0.0, 1.0
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
             # Updating the expected return
             G = self.gamma * G + reward
-            # Every-visit MC method:
-            # Updating the expected return and policy for every visit to this state-action pair
-            self.R[state][action].append(G)
-            self.Q[state, action] = np.mean(self.R[state][action])
-            # Updating the epsilon-greedy policy.
-            # With probability epsilon, sample an action uniformly at random
-            self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-            # The greedy action receives the remaining probability mass
-            self.Pi[state, np.argmax(self.Q[state])] = (
-                1 - self.epsilon + self.epsilon / self.n_actions
-            )
+            self.C[state, action] = self.C[state, action] + W
+            self.Q[state, action] = self.Q[state, action] + (
+                W / self.C[state, action]
+            ) * (G - self.Q[state, action])
+            # Updating the target policy to be greedy with respect to the current Q
+            greedy_action = np.argmax(self.Q[state])
+            self.Pi[state] = np.zeros(self.n_actions)
+            self.Pi[state, greedy_action] = 1.0
+            # if At != At*, then break
+            if action != greedy_action:
+                break
+            W = W * (1.0 / self.Pi_behaviour[state, action])
+
+        # Update the behavior policy such that it has coverage of the target policy
+        self.Pi_behaviour = self.create_soft_policy(random=False)
 
     def train(
         self,
         n_train_episodes=2000,
         test_every=100,
-        update_type="first_visit",
         log_wandb=False,
         save_best=True,
         save_best_dir=None,
@@ -83,7 +129,6 @@ class MCAgent(AgentBase):
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
-        self.run_name = f"{self.run_name}_{update_type}"
 
         (
             train_running_success_rate,
@@ -99,7 +144,7 @@ class MCAgent(AgentBase):
             "avg_ep_len": avg_ep_len,
         }
 
-        update_func = getattr(self, f"update_{update_type}")
+        update_func = getattr(self, f"update_{self.update_type}")
 
         tqrange = tqdm(range(n_train_episodes))
         tqrange.set_description("Training")
@@ -108,7 +153,8 @@ class MCAgent(AgentBase):
             self.wandb_log_img(episode=None)
 
         for e in tqrange:
-            episode_hist, solved, _ = self.run_episode(**kwargs)
+            policy = self.Pi_behaviour if self.update_type == "off_policy" else self.Pi
+            episode_hist, solved, _ = self.run_episode(policy=policy, **kwargs)
            rewards = [x[2] for x in episode_hist]
            total_reward, avg_reward = sum(rewards), np.mean(rewards)
 
@@ -129,8 +175,9 @@ class MCAgent(AgentBase):
            }
            tqrange.set_postfix(stats)
 
-            # Test the agent every test_every episodes with the greedy policy (by default)
-            if e % test_every == 0:
+            # Test the agent every test_every episodes
+            if test_every > 0 and e % test_every == 0:
+                # For off policy, self.Pi is the target policy. For on policy, self.Pi is the soft policy
                test_success_rate = self.test(verbose=False, **kwargs)
                if log_wandb:
                    self.wandb_log_img(episode=e)
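The new `update_off_policy` follows the standard weighted importance-sampling scheme for off-policy Monte Carlo control: the return is accumulated backwards through the episode, `C` stores cumulative importance weights, the target policy `Pi` stays greedy in `Q`, and the backward pass stops as soon as the behaviour action differs from the greedy one. Below is a self-contained sketch of that same update on a fabricated three-step episode; all sizes, values, and the episode itself are illustrative and not taken from the repository.

```python
import numpy as np

n_states, n_actions, gamma, epsilon = 4, 2, 0.99, 0.1

Q = np.random.rand(n_states, n_actions)
C = np.zeros((n_states, n_actions))                      # cumulative importance weights
Pi = np.zeros((n_states, n_actions))                     # greedy target policy
Pi[np.arange(n_states), np.argmax(Q, axis=1)] = 1.0

# Epsilon-greedy behaviour policy over the same Q (guarantees coverage of the target)
b = np.full((n_states, n_actions), epsilon / n_actions)
b[np.arange(n_states), np.argmax(Q, axis=1)] += 1.0 - epsilon

episode = [(0, 1, 0.0), (2, 0, 0.0), (3, 1, 1.0)]        # fabricated (state, action, reward) triples

G, W = 0.0, 1.0
for state, action, reward in reversed(episode):
    G = gamma * G + reward
    C[state, action] += W
    Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
    greedy_action = int(np.argmax(Q[state]))
    Pi[state] = 0.0                                      # target policy stays greedy in Q
    Pi[state, greedy_action] = 1.0
    if action != greedy_action:                          # target probability is 0 from here back
        break
    W *= 1.0 / b[state, action]                          # pi(a|s) = 1 for the greedy action
```

Because the behaviour policy is epsilon-greedy over the same Q, every action the target policy can select has non-zero behaviour probability, which is the coverage condition the final `create_soft_policy(random=False)` call in `update_off_policy` maintains.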
run.py CHANGED
@@ -68,9 +68,9 @@ def main():
     parser.add_argument(
         "--update_type",
         type=str,
-        choices=["first_visit", "every_visit"],
-        default="first_visit",
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
+        choices=["on_policy", "off_policy"],
+        default="off_policy",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: off_policy)",
     )
 
     ### Environment parameters
@@ -159,7 +159,6 @@ def main():
         test_every=args.test_every,
         n_test_episodes=args.n_test_episodes,
         max_steps=args.max_steps,
-        update_type=args.update_type,
         log_wandb=args.wandb_project is not None,
         save_best=True,
         save_best_dir=args.save_dir,
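With `update_type` removed from the `train(...)` call, the parsed flag now only selects between the two Monte Carlo variants; presumably it is forwarded to the `MCAgent` constructor elsewhere in `run.py`, which is outside the hunks shown above. A minimal argparse sketch of the updated flag, with that forwarding left as an assumption:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--update_type",
    type=str,
    choices=["on_policy", "off_policy"],
    default="off_policy",
)
args = parser.parse_args(["--update_type", "on_policy"])

# Assumed wiring (not shown in this diff): the value is passed at construction time,
# e.g. agent = MCAgent(update_type=args.update_type, ...), rather than to train().
print(args.update_type)  # -> "on_policy"
```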