lharri73 committed
Commit 2a6763e · 2 parents: e0c3c75 120dc90

Merge branch 'main' of github.com:andreicozma1/CS581-Algorithms-Project

Files changed (5):
  1. AgentBase.py +19 -9
  2. DPAgent.py +1 -1
  3. MCAgent.py +95 -38
  4. run.py +5 -6
  5. test_params.py +38 -13
AgentBase.py CHANGED
@@ -62,22 +62,28 @@ class AgentBase:
         print(f"- n_states: {self.n_states}")
         print(f"- n_actions: {self.n_actions}")
 
-    def choose_action(self, state, greedy=False, **kwargs):
+    def choose_action(self, policy, state, greedy=False, **kwargs):
         """
         Sample an action from the policy.
         Also allows the ability to override the epsilon value (for the purpose of the demo)
         :param state: The current state
+        :param policy: The policy to sample from. Must be of shape (n_states, n_actions)
         :param greedy: If True, always return the greedy action (argmax of the policy at the current state)
         :return: The sampled action
         """
+        assert policy.shape == (self.n_states, self.n_actions), (
+            f"ERROR: Policy must be of shape (n_states, n_actions) = ({self.n_states}, {self.n_actions}). "
+            f"Got {policy.shape}."
+        )
+
         # If greedy is True, always return the greedy action
-        greedy_action = np.argmax(self.Pi[state])
+        greedy_action = np.argmax(policy[state])
         if greedy or self.epsilon_override == 0.0:
             return greedy_action
 
         # Otherwise, sample an action from the soft policy (epsilon-greedy)
         if self.epsilon_override is None:
-            return np.random.choice(self.n_actions, p=self.Pi[state])
+            return np.random.choice(self.n_actions, p=policy[state])
 
         # If we ever want to manually override the epsilon value, it happens here
         return np.random.choice(
@@ -85,9 +91,13 @@ class AgentBase:
             p=[1.0 - self.epsilon_override, self.epsilon_override],
         )
 
-    def generate_episode(self, max_steps=500, render=False, **kwargs):
+    def generate_episode(self, policy, max_steps=None, render=False, **kwargs):
+        if max_steps is None:
+            # If max_steps is not specified, we use a rough estimate of
+            # the maximum number of steps it should take to solve the environment
+            max_steps = self.n_states * self.n_actions
+
         state, _ = self.env.reset()
-        # action = self.choose_action(state, **kwargs)
         episode_hist, solved, done = [], False, False
         rgb_array = self.env.render() if render else None
 
@@ -97,7 +107,7 @@ class AgentBase:
             # Render the environment if needed
             rgb_array = self.env.render() if render else None
             # Sample the next action from the policy
-            action = self.choose_action(state, **kwargs)
+            action = self.choose_action(policy, state, **kwargs)
             # Keeping track of the trajectory
             episode_hist.append((state, action, None))
             # Take the action and observe the reward and next state
@@ -134,10 +144,10 @@ class AgentBase:
         rgb_array = self.env.render() if render else None
         yield episode_hist, solved, rgb_array
 
-    def run_episode(self, max_steps=500, render=False, **kwargs):
+    def run_episode(self, policy, max_steps=None, render=False, **kwargs):
         # Run the generator until the end
         episode_hist, solved, rgb_array = list(
-            self.generate_episode(max_steps, render, **kwargs)
+            self.generate_episode(policy, max_steps, render, **kwargs)
         )[-1]
         return episode_hist, solved, rgb_array
 
@@ -146,7 +156,7 @@ class AgentBase:
         print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
+            _, solved, _ = self.run_episode(policy=self.Pi, greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"
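
With this change, choose_action, generate_episode, and run_episode take the policy to act from as an explicit argument instead of always reading self.Pi, which is what lets MCAgent (below) roll out episodes with a separate behaviour policy (Pi_behaviour) while keeping Pi as the target policy. A minimal usage sketch of the new call signatures; the constructor keywords env, gamma, and epsilon are assumptions about AgentBase.__init__, which is not part of this diff:

    from MCAgent import MCAgent

    # Hypothetical constructor arguments; AgentBase.__init__ is not shown in this commit.
    agent = MCAgent(update_type="off_policy", env="FrozenLake-v1", gamma=1.0, epsilon=0.4)

    # Roll out one episode acting with the epsilon-greedy behaviour policy...
    episode_hist, solved, rgb_array = agent.run_episode(policy=agent.Pi_behaviour)

    # ...and pick a single greedy action from the (deterministic) target policy.
    action = agent.choose_action(agent.Pi, state=0, greedy=True)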
DPAgent.py CHANGED
@@ -125,7 +125,7 @@ if __name__ == "__main__":
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.choose_action(state, greedy=True)
+        action = dp.choose_action(dp.Pi, state)
         state, reward, done, _, _ = env.step(action)
         s = env.render()
         print(s)
MCAgent.py CHANGED
@@ -5,35 +5,70 @@ from AgentBase import AgentBase
 
 
 class MCAgent(AgentBase):
-    def __init__(self, /, **kwargs):
+    def __init__(
+        self, /, update_type="on_policy", **kwargs  # "on_policy" or "off_policy"
+    ):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
+        self.update_type = update_type
+        self.run_name = f"{self.run_name}_{self.update_type}"
         self.initialize()
 
     def initialize(self):
         print("Resetting all state variables...")
         # The Q-Table holds the current expected return for each state-action pair
-        self.Q = np.zeros((self.n_states, self.n_actions))
-        # R keeps track of all the returns that have been observed for each state-action pair to update Q
-        self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
-        # An arbitrary e-greedy policy:
-        # With probability epsilon, sample an action uniformly at random
-        self.Pi = np.full(
-            (self.n_states, self.n_actions), self.epsilon / self.n_actions
-        )
-        # For the initial policy, we randomly select a greedy action for each state
-        self.Pi[
-            np.arange(self.n_states),
-            np.random.randint(self.n_actions, size=self.n_states),
-        ] = (
-            1 - self.epsilon + self.epsilon / self.n_actions
-        )
+        # random uniform initialization
+        self.Q = np.random.uniform(-1, 1, size=(self.n_states, self.n_actions))
+        # other alternatives:
+        # self.Q = np.zeros((self.n_states, self.n_actions))
+        # self.Q = np.random.rand(self.n_states, self.n_actions)
+        # self.Q = np.random.normal(0, 1, size=(self.n_states, self.n_actions))
+
+        if self.update_type.startswith("on_policy"):
+            # For On-Policy update type:
+            # R keeps track of all the returns that have been observed for each state-action pair to update Q
+            self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
+            # An arbitrary e-greedy policy:
+            self.Pi = self.create_soft_policy()
+        elif self.update_type.startswith("off_policy"):
+            # For Off-Policy update type:
+            self.C = np.zeros((self.n_states, self.n_actions))
+            # Target policy is greedy with respect to the current Q (ties broken consistently)
+            self.Pi = np.zeros((self.n_states, self.n_actions))
+            self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
+            # Behavior policy is e-greedy with respect to the current Q
+            self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
+        else:
+            raise ValueError(
+                f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
+            )
         print("=" * 80)
         print("Initial policy:")
         print(self.Pi)
         print("=" * 80)
 
-    def update_first_visit(self, episode_hist):
-        G = 0
+    def create_soft_policy(self, coverage_policy=None):
+        """
+        Create a soft policy (epsilon-greedy).
+        If coverage_policy is None, the soft policy is initialized randomly.
+        Otherwise, the soft policy is e-greedy with respect to the coverage policy. (useful for off-policy)
+        """
+        # With probability epsilon, sample an action uniformly at random
+        Pi = np.full((self.n_states, self.n_actions), self.epsilon / self.n_actions)
+        # The greedy action receives the remaining probability mass
+        # If coverage_policy is not provided, the greedy action is sampled randomly
+        # Otherwise we give the remaining probability mass according to the coverage policy
+        Pi[
+            np.arange(self.n_states),
+            np.random.randint(self.n_actions, size=self.n_states)
+            if coverage_policy is None
+            else np.argmax(coverage_policy, axis=1),
+        ] = (
+            1.0 - self.epsilon + self.epsilon / self.n_actions
+        )
+        return Pi
+
+    def update_on_policy(self, episode_hist):
+        G = 0.0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
@@ -52,30 +87,51 @@ class MCAgent(AgentBase):
                 1 - self.epsilon + self.epsilon / self.n_actions
             )
 
-    def update_every_visit(self, episode_hist):
-        G = 0
-        # Backward pass through the trajectory
+    # def update_every_visit(self, episode_hist):
+    #     G = 0
+    #     # Backward pass through the trajectory
+    #     for t in range(len(episode_hist) - 1, -1, -1):
+    #         state, action, reward = episode_hist[t]
+    #         # Updating the expected return
+    #         G = self.gamma * G + reward
+    #         # Every-visit MC method:
+    #         # Updating the expected return and policy for every visit to this state-action pair
+    #         self.R[state][action].append(G)
+    #         self.Q[state, action] = np.mean(self.R[state][action])
+    #         # Updating the epsilon-greedy policy.
+    #         # With probability epsilon, sample an action uniformly at random
+    #         self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+    #         # The greedy action receives the remaining probability mass
+    #         self.Pi[state, np.argmax(self.Q[state])] = (
+    #             1 - self.epsilon + self.epsilon / self.n_actions
+    #         )
+
+    def update_off_policy(self, episode_hist):
+        G, W = 0.0, 1.0
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
             # Updating the expected return
             G = self.gamma * G + reward
-            # Every-visit MC method:
-            # Updating the expected return and policy for every visit to this state-action pair
-            self.R[state][action].append(G)
-            self.Q[state, action] = np.mean(self.R[state][action])
-            # Updating the epsilon-greedy policy.
-            # With probability epsilon, sample an action uniformly at random
-            self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-            # The greedy action receives the remaining probability mass
-            self.Pi[state, np.argmax(self.Q[state])] = (
-                1 - self.epsilon + self.epsilon / self.n_actions
-            )
+            self.C[state, action] = self.C[state, action] + W
+            self.Q[state, action] = self.Q[state, action] + (
+                W / self.C[state, action]
+            ) * (G - self.Q[state, action])
+            # Updating the target policy to be greedy with respect to the current Q
+            greedy_action = np.argmax(self.Q[state])
+            self.Pi[state] = np.zeros(self.n_actions)
+            self.Pi[state, greedy_action] = 1.0
+            # If the greedy action is not the action taken by the behavior policy, then break
+            if action != greedy_action:
+                break
+            W = W * (1.0 / self.Pi_behaviour[state, action])
+
+        # Update the behavior policy such that it has coverage of the target policy
+        self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
 
     def train(
         self,
         n_train_episodes=2000,
         test_every=100,
-        update_type="first_visit",
         log_wandb=False,
         save_best=True,
         save_best_dir=None,
@@ -83,7 +139,6 @@ class MCAgent(AgentBase):
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
-        self.run_name = f"{self.run_name}_{update_type}"
 
         (
             train_running_success_rate,
@@ -99,7 +154,7 @@ class MCAgent(AgentBase):
             "avg_ep_len": avg_ep_len,
         }
 
-        update_func = getattr(self, f"update_{update_type}")
+        update_func = getattr(self, f"update_{self.update_type}")
 
         tqrange = tqdm(range(n_train_episodes))
         tqrange.set_description("Training")
@@ -108,7 +163,8 @@ class MCAgent(AgentBase):
             self.wandb_log_img(episode=None)
 
         for e in tqrange:
-            episode_hist, solved, _ = self.run_episode(**kwargs)
+            policy = self.Pi_behaviour if self.update_type == "off_policy" else self.Pi
+            episode_hist, solved, _ = self.run_episode(policy=policy, **kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
 
@@ -129,8 +185,9 @@ class MCAgent(AgentBase):
             }
             tqrange.set_postfix(stats)
 
-            # Test the agent every test_every episodes with the greedy policy (by default)
-            if e % test_every == 0:
+            # Test the agent every test_every episodes
+            if test_every > 0 and e % test_every == 0:
+                # For off policy, self.Pi is the target policy. For on policy, self.Pi is the soft policy
                 test_success_rate = self.test(verbose=False, **kwargs)
                 if log_wandb:
                     self.wandb_log_img(episode=e)
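
The new update_off_policy is off-policy MC control with weighted importance sampling: walking the episode backwards, it accumulates the cumulative weight C(s, a) += W, moves Q(s, a) toward the return G by a step of W / C(s, a), re-greedifies the target policy at the visited state, and stops as soon as the behaviour action differs from the greedy action (the deterministic target policy gives that action probability zero, so every earlier step would get zero weight); otherwise W is scaled by 1 / b(a|s), since pi(a|s) = 1 for the greedy action. A standalone sketch of the same update with the arrays passed in explicitly rather than read from the agent; the function and argument names here are illustrative, not part of the repository:

    import numpy as np

    def off_policy_mc_update(Q, C, Pi, Pi_behaviour, episode, gamma=1.0):
        # episode is a list of (state, action, reward) triples, newest last.
        # Q, C, Pi, Pi_behaviour are (n_states, n_actions) arrays, mirroring MCAgent's attributes.
        G, W = 0.0, 1.0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            C[state, action] += W
            # Weighted importance-sampling step toward the observed return
            Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
            # Keep the target policy greedy with respect to the updated Q
            greedy_action = np.argmax(Q[state])
            Pi[state] = 0.0
            Pi[state, greedy_action] = 1.0
            # If the behaviour action is no longer greedy, pi(action|state) = 0 and all
            # earlier steps would receive zero weight, so stop early
            if action != greedy_action:
                break
            # pi(action|state) = 1, so the importance ratio contributes 1 / b(action|state)
            W *= 1.0 / Pi_behaviour[state, action]
        return Q, C, Pi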
run.py CHANGED
@@ -39,8 +39,8 @@ def main():
     parser.add_argument(
         "--max_steps",
         type=int,
-        default=200,
-        help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
+        default=None,
+        help="The maximum number of steps per episode before the episode is forced to end. If not provided, defaults to the number of states in the environment. (default: None)",
     )
 
     ### Agent parameters
@@ -68,9 +68,9 @@ def main():
     parser.add_argument(
         "--update_type",
         type=str,
-        choices=["first_visit", "every_visit"],
-        default="first_visit",
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
+        choices=["on_policy", "off_policy"],
+        default="off_policy",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: off_policy)",
     )
 
     ### Environment parameters
@@ -159,7 +159,6 @@ def main():
         test_every=args.test_every,
         n_test_episodes=args.n_test_episodes,
         max_steps=args.max_steps,
-        update_type=args.update_type,
         log_wandb=args.wandb_project is not None,
         save_best=True,
         save_best_dir=args.save_dir,
test_params.py CHANGED
@@ -9,13 +9,13 @@ parser = argparse.ArgumentParser(description="Run parameter tests for MC agent")
 parser.add_argument(
     "--env",
     type=str,
-    default="Taxi-v3",
+    default="FrozenLake-v1",
     help="environment to run",
 )
 parser.add_argument(
     "--num_tests",
     type=int,
-    default=25,
+    default=10,
     help="number of tests to run for each parameter combination",
 )
 parser.add_argument(
@@ -31,31 +31,35 @@ env, num_tests, wandb_project = args.env, args.num_tests, args.wandb_project
 agent = "MCAgent"
 
 vals_update_type = [
-    "first_visit"
+    # "on_policy",
+    "off_policy",
 ]  # Note: Every visit takes too long due to these environment's reward structure
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
+# vals_gamma = [1.0, 0.98, 0.96, 0.94]
 vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-# vals_gamma = [1.0]
+vals_gamma = [1.0]
 # vals_epsilon = [0.5]
 
+vals_size = [8, 16, 32, 64]
+
 if env == "CliffWalking-v0":
     n_train_episodes = 2500
-    max_steps = 200
+    # max_steps = 200
 elif env == "FrozenLake-v1":
-    n_train_episodes = 5000
-    max_steps = 200
+    n_train_episodes = 25000
+    # max_steps = 200
 elif env == "Taxi-v3":
     n_train_episodes = 10000
-    max_steps = 500
+    # max_steps = 500
 else:
     raise ValueError(f"Unsupported environment: {env}")
 
 
 def run_test(args):
     command = f"python3 run.py --train --agent {agent} --env {env}"
-    command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
-    command += f" --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]}"
-    command += f" --run_name_suffix {args[3]}"
+    # command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
+    command += f" --n_train_episodes {n_train_episodes}"
+    for k, v in args.items():
+        command += f" --{k} {v}"
     if wandb_project is not None:
         command += f" --wandb_project {wandb_project}"
     command += " --no_save"
@@ -67,7 +71,28 @@ with multiprocessing.Pool(8) as p:
     for update_type in vals_update_type:
         for gamma in vals_gamma:
             for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+                if env == "FrozenLake-v1":
+                    for size in vals_size:
+                        tests.extend(
+                            {
+                                "gamma": gamma,
+                                "epsilon": eps,
+                                "update_type": update_type,
+                                "size": size,
+                                "run_name_suffix": i,
+                            }
+                            for i in range(num_tests)
+                        )
+                else:
+                    tests.extend(
+                        {
+                            "gamma": gamma,
+                            "epsilon": eps,
+                            "update_type": update_type,
+                            "run_name_suffix": i,
+                        }
+                        for i in range(num_tests)
+                    )
     random.shuffle(tests)
 
     p.map(run_test, tests)