lharri73 committed
Commit 2a6763e · 2 parents: e0c3c75 120dc90

Merge branch 'main' of github.com:andreicozma1/CS581-Algorithms-Project

Files changed (5):
  1. AgentBase.py +19 -9
  2. DPAgent.py +1 -1
  3. MCAgent.py +95 -38
  4. run.py +5 -6
  5. test_params.py +38 -13
AgentBase.py CHANGED
@@ -62,22 +62,28 @@ class AgentBase:
         print(f"- n_states: {self.n_states}")
         print(f"- n_actions: {self.n_actions}")
 
-    def choose_action(self, state, greedy=False, **kwargs):
+    def choose_action(self, policy, state, greedy=False, **kwargs):
         """
         Sample an action from the policy.
         Also allows the ability to override the epsilon value (for the purpose of the demo)
         :param state: The current state
+        :param policy: The policy to sample from. Must be of shape (n_states, n_actions)
         :param greedy: If True, always return the greedy action (argmax of the policy at the current state)
         :return: The sampled action
         """
+        assert policy.shape == (self.n_states, self.n_actions), (
+            f"ERROR: Policy must be of shape (n_states, n_actions) = ({self.n_states}, {self.n_actions}). "
+            f"Got {policy.shape}."
+        )
+
         # If greedy is True, always return the greedy action
-        greedy_action = np.argmax(self.Pi[state])
+        greedy_action = np.argmax(policy[state])
         if greedy or self.epsilon_override == 0.0:
             return greedy_action
 
         # Otherwise, sample an action from the soft policy (epsilon-greedy)
         if self.epsilon_override is None:
-            return np.random.choice(self.n_actions, p=self.Pi[state])
+            return np.random.choice(self.n_actions, p=policy[state])
 
         # If we ever want to manually override the epsilon value, it happens here
         return np.random.choice(
@@ -85,9 +91,13 @@ class AgentBase:
             p=[1.0 - self.epsilon_override, self.epsilon_override],
         )
 
-    def generate_episode(self, max_steps=500, render=False, **kwargs):
+    def generate_episode(self, policy, max_steps=None, render=False, **kwargs):
+        if max_steps is None:
+            # If max_steps is not specified, we use a rough estimate of
+            # the maximum number of steps it should take to solve the environment
+            max_steps = self.n_states * self.n_actions
+
         state, _ = self.env.reset()
-        # action = self.choose_action(state, **kwargs)
         episode_hist, solved, done = [], False, False
         rgb_array = self.env.render() if render else None
 
@@ -97,7 +107,7 @@ class AgentBase:
             # Render the environment if needed
             rgb_array = self.env.render() if render else None
             # Sample the next action from the policy
-            action = self.choose_action(state, **kwargs)
+            action = self.choose_action(policy, state, **kwargs)
             # Keeping track of the trajectory
             episode_hist.append((state, action, None))
             # Take the action and observe the reward and next state
@@ -134,10 +144,10 @@ class AgentBase:
         rgb_array = self.env.render() if render else None
         yield episode_hist, solved, rgb_array
 
-    def run_episode(self, max_steps=500, render=False, **kwargs):
+    def run_episode(self, policy, max_steps=None, render=False, **kwargs):
         # Run the generator until the end
         episode_hist, solved, rgb_array = list(
-            self.generate_episode(max_steps, render, **kwargs)
+            self.generate_episode(policy, max_steps, render, **kwargs)
         )[-1]
         return episode_hist, solved, rgb_array
 
@@ -146,7 +156,7 @@ class AgentBase:
         print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
+            _, solved, _ = self.run_episode(policy=self.Pi, greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"
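
With this change, choose_action, generate_episode, and run_episode take the policy to act from as an explicit argument instead of always reading self.Pi, which is what lets MCAgent (below) roll out episodes with a separate behaviour policy (Pi_behaviour) while keeping Pi as the target policy. A minimal usage sketch of the new call signatures; the constructor keywords env, gamma, and epsilon are assumptions about AgentBase.__init__, which is not part of this diff:

    from MCAgent import MCAgent

    # Hypothetical constructor arguments; AgentBase.__init__ is not shown in this commit.
    agent = MCAgent(update_type="off_policy", env="FrozenLake-v1", gamma=1.0, epsilon=0.4)

    # Roll out one episode acting with the epsilon-greedy behaviour policy...
    episode_hist, solved, rgb_array = agent.run_episode(policy=agent.Pi_behaviour)

    # ...and pick a single greedy action from the (deterministic) target policy.
    action = agent.choose_action(agent.Pi, state=0, greedy=True)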
DPAgent.py CHANGED
@@ -125,7 +125,7 @@ if __name__ == "__main__":
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.choose_action(state, greedy=True)
+        action = dp.choose_action(dp.Pi, state)
         state, reward, done, _, _ = env.step(action)
         s = env.render()
         print(s)
MCAgent.py CHANGED
@@ -5,35 +5,70 @@ from AgentBase import AgentBase
 
 
 class MCAgent(AgentBase):
-    def __init__(self, /, **kwargs):
+    def __init__(
+        self, /, update_type="on_policy", **kwargs  # "on_policy" or "off_policy"
+    ):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
+        self.update_type = update_type
+        self.run_name = f"{self.run_name}_{self.update_type}"
         self.initialize()
 
     def initialize(self):
         print("Resetting all state variables...")
         # The Q-Table holds the current expected return for each state-action pair
-        self.Q = np.zeros((self.n_states, self.n_actions))
-        # R keeps track of all the returns that have been observed for each state-action pair to update Q
-        self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
-        # An arbitrary e-greedy policy:
-        # With probability epsilon, sample an action uniformly at random
-        self.Pi = np.full(
-            (self.n_states, self.n_actions), self.epsilon / self.n_actions
-        )
-        # For the initial policy, we randomly select a greedy action for each state
-        self.Pi[
-            np.arange(self.n_states),
-            np.random.randint(self.n_actions, size=self.n_states),
-        ] = (
-            1 - self.epsilon + self.epsilon / self.n_actions
-        )
+        # random uniform initialization
+        self.Q = np.random.uniform(-1, 1, size=(self.n_states, self.n_actions))
+        # other alternatives:
+        # self.Q = np.zeros((self.n_states, self.n_actions))
+        # self.Q = np.random.rand(self.n_states, self.n_actions)
+        # self.Q = np.random.normal(0, 1, size=(self.n_states, self.n_actions))
+
+        if self.update_type.startswith("on_policy"):
+            # For On-Policy update type:
+            # R keeps track of all the returns that have been observed for each state-action pair to update Q
+            self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
+            # An arbitrary e-greedy policy:
+            self.Pi = self.create_soft_policy()
+        elif self.update_type.startswith("off_policy"):
+            # For Off-Policy update type:
+            self.C = np.zeros((self.n_states, self.n_actions))
+            # Target policy is greedy with respect to the current Q (ties broken consistently)
+            self.Pi = np.zeros((self.n_states, self.n_actions))
+            self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
+            # Behavior policy is e-greedy with respect to the current Q
+            self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
+        else:
+            raise ValueError(
+                f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
+            )
         print("=" * 80)
         print("Initial policy:")
         print(self.Pi)
         print("=" * 80)
 
-    def update_first_visit(self, episode_hist):
-        G = 0
+    def create_soft_policy(self, coverage_policy=None):
+        """
+        Create a soft policy (epsilon-greedy).
+        If coverage_policy is None, the soft policy is initialized randomly.
+        Otherwise, the soft policy is e-greedy with respect to the coverage policy. (useful for off-policy)
+        """
+        # With probability epsilon, sample an action uniformly at random
+        Pi = np.full((self.n_states, self.n_actions), self.epsilon / self.n_actions)
+        # The greedy action receives the remaining probability mass
+        # If coverage_policy is not provided, the greedy action is sampled randomly
+        # Otherwise we give the remaining probability mass according to the coverage policy
+        Pi[
+            np.arange(self.n_states),
+            np.random.randint(self.n_actions, size=self.n_states)
+            if coverage_policy is None
+            else np.argmax(coverage_policy, axis=1),
+        ] = (
+            1.0 - self.epsilon + self.epsilon / self.n_actions
+        )
+        return Pi
+
+    def update_on_policy(self, episode_hist):
+        G = 0.0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
@@ -52,30 +87,51 @@ class MCAgent(AgentBase):
                 1 - self.epsilon + self.epsilon / self.n_actions
             )
 
-    def update_every_visit(self, episode_hist):
-        G = 0
-        # Backward pass through the trajectory
+    # def update_every_visit(self, episode_hist):
+    #     G = 0
+    #     # Backward pass through the trajectory
+    #     for t in range(len(episode_hist) - 1, -1, -1):
+    #         state, action, reward = episode_hist[t]
+    #         # Updating the expected return
+    #         G = self.gamma * G + reward
+    #         # Every-visit MC method:
+    #         # Updating the expected return and policy for every visit to this state-action pair
+    #         self.R[state][action].append(G)
+    #         self.Q[state, action] = np.mean(self.R[state][action])
+    #         # Updating the epsilon-greedy policy.
+    #         # With probability epsilon, sample an action uniformly at random
+    #         self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+    #         # The greedy action receives the remaining probability mass
+    #         self.Pi[state, np.argmax(self.Q[state])] = (
+    #             1 - self.epsilon + self.epsilon / self.n_actions
+    #         )
+
+    def update_off_policy(self, episode_hist):
+        G, W = 0.0, 1.0
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
             # Updating the expected return
             G = self.gamma * G + reward
-            # Every-visit MC method:
-            # Updating the expected return and policy for every visit to this state-action pair
-            self.R[state][action].append(G)
-            self.Q[state, action] = np.mean(self.R[state][action])
-            # Updating the epsilon-greedy policy.
-            # With probability epsilon, sample an action uniformly at random
-            self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-            # The greedy action receives the remaining probability mass
-            self.Pi[state, np.argmax(self.Q[state])] = (
-                1 - self.epsilon + self.epsilon / self.n_actions
-            )
+            self.C[state, action] = self.C[state, action] + W
+            self.Q[state, action] = self.Q[state, action] + (
+                W / self.C[state, action]
+            ) * (G - self.Q[state, action])
+            # Updating the target policy to be greedy with respect to the current Q
+            greedy_action = np.argmax(self.Q[state])
+            self.Pi[state] = np.zeros(self.n_actions)
+            self.Pi[state, greedy_action] = 1.0
+            # If the greedy action is not the action taken by the behavior policy, then break
+            if action != greedy_action:
+                break
+            W = W * (1.0 / self.Pi_behaviour[state, action])
+
+        # Update the behavior policy such that it has coverage of the target policy
+        self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
 
     def train(
         self,
         n_train_episodes=2000,
         test_every=100,
-        update_type="first_visit",
         log_wandb=False,
         save_best=True,
         save_best_dir=None,
@@ -83,7 +139,6 @@ class MCAgent(AgentBase):
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
-        self.run_name = f"{self.run_name}_{update_type}"
 
         (
             train_running_success_rate,
@@ -99,7 +154,7 @@ class MCAgent(AgentBase):
             "avg_ep_len": avg_ep_len,
         }
 
-        update_func = getattr(self, f"update_{update_type}")
+        update_func = getattr(self, f"update_{self.update_type}")
 
         tqrange = tqdm(range(n_train_episodes))
         tqrange.set_description("Training")
@@ -108,7 +163,8 @@ class MCAgent(AgentBase):
             self.wandb_log_img(episode=None)
 
         for e in tqrange:
-            episode_hist, solved, _ = self.run_episode(**kwargs)
+            policy = self.Pi_behaviour if self.update_type == "off_policy" else self.Pi
+            episode_hist, solved, _ = self.run_episode(policy=policy, **kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
 
@@ -129,8 +185,9 @@ class MCAgent(AgentBase):
             }
             tqrange.set_postfix(stats)
 
-            # Test the agent every test_every episodes with the greedy policy (by default)
-            if e % test_every == 0:
+            # Test the agent every test_every episodes
+            if test_every > 0 and e % test_every == 0:
+                # For off policy, self.Pi is the target policy. For on policy, self.Pi is the soft policy
                 test_success_rate = self.test(verbose=False, **kwargs)
                 if log_wandb:
                     self.wandb_log_img(episode=e)
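
The new update_off_policy is off-policy MC control with weighted importance sampling: walking the episode backwards, it accumulates the cumulative weight C(s, a) += W, moves Q(s, a) toward the return G by a step of W / C(s, a), re-greedifies the target policy at the visited state, and stops as soon as the behaviour action differs from the greedy action (the deterministic target policy gives that action probability zero, so every earlier step would get zero weight); otherwise W is scaled by 1 / b(a|s), since pi(a|s) = 1 for the greedy action. A standalone sketch of the same update with the arrays passed in explicitly rather than read from the agent; the function and argument names here are illustrative, not part of the repository:

    import numpy as np

    def off_policy_mc_update(Q, C, Pi, Pi_behaviour, episode, gamma=1.0):
        # episode is a list of (state, action, reward) triples, newest last.
        # Q, C, Pi, Pi_behaviour are (n_states, n_actions) arrays, mirroring MCAgent's attributes.
        G, W = 0.0, 1.0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            C[state, action] += W
            # Weighted importance-sampling step toward the observed return
            Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
            # Keep the target policy greedy with respect to the updated Q
            greedy_action = np.argmax(Q[state])
            Pi[state] = 0.0
            Pi[state, greedy_action] = 1.0
            # If the behaviour action is no longer greedy, pi(action|state) = 0 and all
            # earlier steps would receive zero weight, so stop early
            if action != greedy_action:
                break
            # pi(action|state) = 1, so the importance ratio contributes 1 / b(action|state)
            W *= 1.0 / Pi_behaviour[state, action]
        return Q, C, Pi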
run.py CHANGED
@@ -39,8 +39,8 @@ def main():
     parser.add_argument(
         "--max_steps",
         type=int,
-        default=200,
-        help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
+        default=None,
+        help="The maximum number of steps per episode before the episode is forced to end. If not provided, defaults to the number of states in the environment. (default: None)",
     )
 
     ### Agent parameters
@@ -68,9 +68,9 @@ def main():
     parser.add_argument(
         "--update_type",
         type=str,
-        choices=["first_visit", "every_visit"],
-        default="first_visit",
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
+        choices=["on_policy", "off_policy"],
+        default="off_policy",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: off_policy)",
     )
 
     ### Environment parameters
@@ -159,7 +159,6 @@ def main():
         test_every=args.test_every,
         n_test_episodes=args.n_test_episodes,
         max_steps=args.max_steps,
-        update_type=args.update_type,
         log_wandb=args.wandb_project is not None,
         save_best=True,
         save_best_dir=args.save_dir,
test_params.py CHANGED
@@ -9,13 +9,13 @@ parser = argparse.ArgumentParser(description="Run parameter tests for MC agent")
 parser.add_argument(
     "--env",
     type=str,
-    default="Taxi-v3",
+    default="FrozenLake-v1",
     help="environment to run",
 )
 parser.add_argument(
     "--num_tests",
     type=int,
-    default=25,
+    default=10,
     help="number of tests to run for each parameter combination",
 )
 parser.add_argument(
@@ -31,31 +31,35 @@ env, num_tests, wandb_project = args.env, args.num_tests, args.wandb_project
 agent = "MCAgent"
 
 vals_update_type = [
-    "first_visit"
+    # "on_policy",
+    "off_policy",
 ]  # Note: Every visit takes too long due to these environment's reward structure
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
+# vals_gamma = [1.0, 0.98, 0.96, 0.94]
 vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-# vals_gamma = [1.0]
+vals_gamma = [1.0]
 # vals_epsilon = [0.5]
 
+vals_size = [8, 16, 32, 64]
+
 if env == "CliffWalking-v0":
     n_train_episodes = 2500
-    max_steps = 200
+    # max_steps = 200
 elif env == "FrozenLake-v1":
-    n_train_episodes = 5000
-    max_steps = 200
+    n_train_episodes = 25000
+    # max_steps = 200
 elif env == "Taxi-v3":
     n_train_episodes = 10000
-    max_steps = 500
+    # max_steps = 500
 else:
     raise ValueError(f"Unsupported environment: {env}")
 
 
 def run_test(args):
     command = f"python3 run.py --train --agent {agent} --env {env}"
-    command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
-    command += f" --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]}"
-    command += f" --run_name_suffix {args[3]}"
+    # command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
+    command += f" --n_train_episodes {n_train_episodes}"
+    for k, v in args.items():
+        command += f" --{k} {v}"
     if wandb_project is not None:
         command += f" --wandb_project {wandb_project}"
     command += " --no_save"
@@ -67,7 +71,28 @@ with multiprocessing.Pool(8) as p:
     for update_type in vals_update_type:
         for gamma in vals_gamma:
             for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+                if env == "FrozenLake-v1":
+                    for size in vals_size:
+                        tests.extend(
+                            {
+                                "gamma": gamma,
+                                "epsilon": eps,
+                                "update_type": update_type,
+                                "size": size,
+                                "run_name_suffix": i,
+                            }
+                            for i in range(num_tests)
+                        )
+                else:
+                    tests.extend(
+                        {
+                            "gamma": gamma,
+                            "epsilon": eps,
+                            "update_type": update_type,
+                            "run_name_suffix": i,
+                        }
+                        for i in range(num_tests)
+                    )
     random.shuffle(tests)
 
     p.map(run_test, tests)