Spaces:

acozma
/

CS581-Algos-Demo

Sleeping

App Files Files Community

Andrei Cozma committed on Apr 24, 2023

Commit

6a48762

1 Parent(s): 50efa30

Updates

Browse files

Files changed (4) hide show

AgentBase.py +14 -8
DPAgent.py +1 -1
MCAgent.py +83 -36
run.py +3 -4

AgentBase.py CHANGED Viewed

@@ -62,22 +62,28 @@ class AgentBase:
         print(f"- n_states: {self.n_states}")
         print(f"- n_actions: {self.n_actions}")
-    def choose_action(self, state, greedy=False, **kwargs):
         """
         Sample an action from the policy.
         Also allows the ability to override the epsilon value (for the purpose of the demo)
         :param state: The current state
         :param greedy: If True, always return the greedy action (argmax of the policy at the current state)
         :return: The sampled action
         """
         # If greedy is True, always return the greedy action
-        greedy_action = np.argmax(self.Pi[state])
         if greedy or self.epsilon_override == 0.0:
             return greedy_action
         # Otherwise, sample an action from the soft policy (epsilon-greedy)
         if self.epsilon_override is None:
-            return np.random.choice(self.n_actions, p=self.Pi[state])
         # If we ever want to manually override the epsilon value, it happens here
         return np.random.choice(
@@ -85,7 +91,7 @@ class AgentBase:
             p=[1.0 - self.epsilon_override, self.epsilon_override],
         )
-    def generate_episode(self, max_steps=500, render=False, **kwargs):
         state, _ = self.env.reset()
         # action = self.choose_action(state, **kwargs)
         episode_hist, solved, done = [], False, False
@@ -97,7 +103,7 @@ class AgentBase:
             # Render the environment if needed
             rgb_array = self.env.render() if render else None
             # Sample the next action from the policy
-            action = self.choose_action(state, **kwargs)
             # Keeping track of the trajectory
             episode_hist.append((state, action, None))
             # Take the action and observe the reward and next state
@@ -134,10 +140,10 @@ class AgentBase:
         rgb_array = self.env.render() if render else None
         yield episode_hist, solved, rgb_array
-    def run_episode(self, max_steps=500, render=False, **kwargs):
         # Run the generator until the end
         episode_hist, solved, rgb_array = list(
-            self.generate_episode(max_steps, render, **kwargs)
         )[-1]
         return episode_hist, solved, rgb_array
@@ -146,7 +152,7 @@ class AgentBase:
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"

         print(f"- n_states: {self.n_states}")
         print(f"- n_actions: {self.n_actions}")
+    def choose_action(self, policy, state, greedy=False, **kwargs):
         """
         Sample an action from the policy.
         Also allows the ability to override the epsilon value (for the purpose of the demo)
         :param state: The current state
+        :param policy: The policy to sample from. Must be of shape (n_states, n_actions)
         :param greedy: If True, always return the greedy action (argmax of the policy at the current state)
         :return: The sampled action
         """
+        assert policy.shape == (self.n_states, self.n_actions), (
+            f"ERROR: Policy must be of shape (n_states, n_actions) = ({self.n_states}, {self.n_actions}). "
+            f"Got {policy.shape}."
+        )
         # If greedy is True, always return the greedy action
+        greedy_action = np.argmax(policy[state])
         if greedy or self.epsilon_override == 0.0:
             return greedy_action
         # Otherwise, sample an action from the soft policy (epsilon-greedy)
         if self.epsilon_override is None:
+            return np.random.choice(self.n_actions, p=policy[state])
         # If we ever want to manually override the epsilon value, it happens here
         return np.random.choice(
             p=[1.0 - self.epsilon_override, self.epsilon_override],
         )
+    def generate_episode(self, policy, max_steps=500, render=False, **kwargs):
         state, _ = self.env.reset()
         # action = self.choose_action(state, **kwargs)
         episode_hist, solved, done = [], False, False
             # Render the environment if needed
             rgb_array = self.env.render() if render else None
             # Sample the next action from the policy
+            action = self.choose_action(policy, state, **kwargs)
             # Keeping track of the trajectory
             episode_hist.append((state, action, None))
             # Take the action and observe the reward and next state
         rgb_array = self.env.render() if render else None
         yield episode_hist, solved, rgb_array
+    def run_episode(self, policy, max_steps=500, render=False, **kwargs):
+        """
+        Run one full episode and return only its final result.
+        :param policy: The policy handed to generate_episode
+        :param max_steps: Forwarded to generate_episode
+        :param render: Forwarded to generate_episode
+        :param kwargs: Forwarded to generate_episode
+        :return: The last (episode_hist, solved, rgb_array) tuple yielded by generate_episode
+        :raises IndexError: If generate_episode yields nothing
+        """
+        from collections import deque
+
+        # Run the generator until the end.
+        # maxlen=1 drops every intermediate yield as soon as the next one arrives, so the
+        # per-step results are not all held in memory just to read the last one.
+        last_yield = deque(
+            self.generate_episode(policy, max_steps, render, **kwargs), maxlen=1
+        )
+        episode_hist, solved, rgb_array = last_yield[-1]
         return episode_hist, solved, rgb_array
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
+            _, solved, _ = self.run_episode(policy=self.Pi, greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"

DPAgent.py CHANGED Viewed

@@ -103,7 +103,7 @@ if __name__ == "__main__":
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.choose_action(state)
         state, reward, done, _, _ = env.step(action)
         env.render()

     state, _ = env.reset()
     done = False
     while not done:
+        action = dp.choose_action(dp.Pi, state)
         state, reward, done, _, _ = env.step(action)
         env.render()

MCAgent.py CHANGED Viewed

@@ -5,35 +5,60 @@ from AgentBase import AgentBase
 class MCAgent(AgentBase):
-    def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.initialize()
     def initialize(self):
         print("Resetting all state variables...")
         # The Q-Table holds the current expected return for each state-action pair
-        self.Q = np.zeros((self.n_states, self.n_actions))
-        # R keeps track of all the returns that have been observed for each state-action pair to update Q
-        self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
         # An arbitrary e-greedy policy:
         # With probability epsilon, sample an action uniformly at random
-        self.Pi = np.full(
-            (self.n_states, self.n_actions), self.epsilon / self.n_actions
-        )
         # For the initial policy, we randomly select a greedy action for each state
-        self.Pi[
             np.arange(self.n_states),
-            np.random.randint(self.n_actions, size=self.n_states),
         ] = (
-            1 - self.epsilon + self.epsilon / self.n_actions
         )
-        print("=" * 80)
-        print("Initial policy:")
-        print(self.Pi)
-        print("=" * 80)
-    def update_first_visit(self, episode_hist):
-        G = 0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
@@ -52,30 +77,51 @@ class MCAgent(AgentBase):
                     1 - self.epsilon + self.epsilon / self.n_actions
                 )
-    def update_every_visit(self, episode_hist):
-        G = 0
-        # Backward pass through the trajectory
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
             # Updating the expected return
             G = self.gamma * G + reward
-            # Every-visit MC method:
-            # Updating the expected return and policy for every visit to this state-action pair
-            self.R[state][action].append(G)
-            self.Q[state, action] = np.mean(self.R[state][action])
-            # Updating the epsilon-greedy policy.
-            # With probability epsilon, sample an action uniformly at random
-            self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
-            # The greedy action receives the remaining probability mass
-            self.Pi[state, np.argmax(self.Q[state])] = (
-                1 - self.epsilon + self.epsilon / self.n_actions
-            )
     def train(
         self,
         n_train_episodes=2000,
         test_every=100,
-        update_type="first_visit",
         log_wandb=False,
         save_best=True,
         save_best_dir=None,
@@ -83,7 +129,6 @@ class MCAgent(AgentBase):
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
-        self.run_name = f"{self.run_name}_{update_type}"
         (
             train_running_success_rate,
@@ -99,7 +144,7 @@ class MCAgent(AgentBase):
             "avg_ep_len": avg_ep_len,
         }
-        update_func = getattr(self, f"update_{update_type}")
         tqrange = tqdm(range(n_train_episodes))
         tqrange.set_description("Training")
@@ -108,7 +153,8 @@ class MCAgent(AgentBase):
             self.wandb_log_img(episode=None)
         for e in tqrange:
-            episode_hist, solved, _ = self.run_episode(**kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
@@ -129,8 +175,9 @@ class MCAgent(AgentBase):
             }
             tqrange.set_postfix(stats)
-            # Test the agent every test_every episodes with the greedy policy (by default)
-            if e % test_every == 0:
                 test_success_rate = self.test(verbose=False, **kwargs)
                 if log_wandb:
                     self.wandb_log_img(episode=e)

 class MCAgent(AgentBase):
+    def __init__(self, /, update_type="on_policy", **kwargs):
+        """
+        Create a Monte-Carlo agent.
+        :param update_type: "on_policy" or "off_policy". Hyphenated spellings ("on-policy", "off-policy")
+            are accepted and normalized to the underscore form.
+        :param kwargs: Forwarded unchanged to AgentBase.__init__
+        """
         super().__init__(run_name=self.__class__.__name__, **kwargs)
+        # initialize() only recognizes the underscore spelling and train() looks up
+        # update_<update_type>, so a hyphenated value would be rejected or fail to dispatch.
+        self.update_type = update_type.replace("-", "_")
+        self.run_name = f"{self.run_name}_{self.update_type}"
         self.initialize()
     def initialize(self):
+        """
+        (Re)create every learned quantity for the configured update type.
+        - "on_policy": Q, the per-pair return lists R and a soft policy Pi
+        - "off_policy": Q, the zeroed accumulator C, a greedy target policy Pi
+          and a soft behaviour policy Pi_behaviour
+        The resulting Pi is printed.
+        :raises ValueError: If update_type is neither "on_policy" nor "off_policy"
+        """
         print("Resetting all state variables...")
         # The Q-Table holds the current expected return for each state-action pair
+        self.Q = np.random.rand(self.n_states, self.n_actions)
+        # self.Q = np.zeros((self.n_states, self.n_actions))
+        # Exact match rather than a prefix test: train() dispatches to update_<update_type>
+        # and compares against "off_policy" exactly, so e.g. "on_policy_x" must be rejected here.
+        if self.update_type == "on_policy":
+            # For On-Policy update type:
+            # R keeps track of all the returns that have been observed for each state-action pair to update Q
+            self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
+            # An arbitrary e-greedy policy:
+            self.Pi = self.create_soft_policy()
+        elif self.update_type == "off_policy":
+            # For Off-Policy update type:
+            self.C = np.zeros((self.n_states, self.n_actions))
+            # Target policy is greedy with respect to the current Q
+            self.Pi = np.zeros((self.n_states, self.n_actions))
+            self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
+            # Behavior policy is e-greedy with respect to the current Q
+            self.Pi_behaviour = self.create_soft_policy(random=False)
+        else:
+            raise ValueError(
+                f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
+            )
+        print("=" * 80)
+        print("Initial policy:")
+        print(self.Pi)
+        print("=" * 80)
+    def create_soft_policy(self, random=True):
+        """
+        Build an epsilon-soft policy table of shape (n_states, n_actions).
+        Every action starts at epsilon / n_actions; one favoured action per state
+        additionally receives the remaining 1 - epsilon of the probability mass.
+        :param random: If True, the favoured action of each state is drawn uniformly at random.
+            If False, it is the argmax of the current Q for that state.
+        :return: The newly created policy table
+        """
+        n_s, n_a = self.n_states, self.n_actions
+        # Favoured action per state: random draw, or greedy with respect to Q
+        if random:
+            favoured = np.random.randint(n_a, size=n_s)
+        else:
+            favoured = np.argmax(self.Q, axis=1)
+        # With probability epsilon, sample an action uniformly at random
+        soft = np.full((n_s, n_a), self.epsilon / n_a)
+        soft[np.arange(n_s), favoured] = 1.0 - self.epsilon + self.epsilon / n_a
+        return soft
+    def update_on_policy(self, episode_hist):
+        G = 0.0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
                     1 - self.epsilon + self.epsilon / self.n_actions
                 )
+    # def update_every_visit(self, episode_hist):
+    #     G = 0
+    #     # Backward pass through the trajectory
+    #     for t in range(len(episode_hist) - 1, -1, -1):
+    #         state, action, reward = episode_hist[t]
+    #         # Updating the expected return
+    #         G = self.gamma * G + reward
+    #         # Every-visit MC method:
+    #         # Updating the expected return and policy for every visit to this state-action pair
+    #         self.R[state][action].append(G)
+    #         self.Q[state, action] = np.mean(self.R[state][action])
+    #         # Updating the epsilon-greedy policy.
+    #         # With probability epsilon, sample an action uniformly at random
+    #         self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+    #         # The greedy action receives the remaining probability mass
+    #         self.Pi[state, np.argmax(self.Q[state])] = (
+    #             1 - self.epsilon + self.epsilon / self.n_actions
+    #         )
+    def update_off_policy(self, episode_hist):
+        """
+        Off-policy update of Q and the greedy target policy Pi from one episode.
+        The trajectory is walked backwards. Each step adds the current weight to C,
+        moves Q towards the return by weight / C, and makes Pi greedy for that state.
+        The walk stops at the first step whose action is not the greedy one.
+        Afterwards the behaviour policy is rebuilt from the updated Q.
+        :param episode_hist: Sequence of (state, action, reward) tuples in the order they occurred
+        """
+        ret, weight = 0.0, 1.0
+        # Backward pass through the trajectory
+        for state, action, reward in reversed(episode_hist):
+            # Updating the expected return
+            ret = self.gamma * ret + reward
+            self.C[state, action] += weight
+            step_size = weight / self.C[state, action]
+            self.Q[state, action] += step_size * (ret - self.Q[state, action])
+            # Updating the target policy to be greedy with respect to the current Q
+            best_action = np.argmax(self.Q[state])
+            self.Pi[state] = 0.0
+            self.Pi[state, best_action] = 1.0
+            # The target policy would never have taken this action: stop here
+            if action != best_action:
+                break
+            weight = weight * (1.0 / self.Pi_behaviour[state, action])
+        # Update the behavior policy such that it has coverage of the target policy
+        self.Pi_behaviour = self.create_soft_policy(random=False)
     def train(
         self,
         n_train_episodes=2000,
         test_every=100,
         log_wandb=False,
         save_best=True,
         save_best_dir=None,
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
         (
             train_running_success_rate,
             "avg_ep_len": avg_ep_len,
         }
+        update_func = getattr(self, f"update_{self.update_type}")
         tqrange = tqdm(range(n_train_episodes))
         tqrange.set_description("Training")
             self.wandb_log_img(episode=None)
         for e in tqrange:
+            policy = self.Pi_behaviour if self.update_type == "off_policy" else self.Pi
+            episode_hist, solved, _ = self.run_episode(policy=policy, **kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)
             }
             tqrange.set_postfix(stats)
+            # Test the agent every test_every episodes
+            if test_every > 0 and e % test_every == 0:
+                # For off policy, self.Pi is the target policy. For on policy, self.Pi is the soft policy
                 test_success_rate = self.test(verbose=False, **kwargs)
                 if log_wandb:
                     self.wandb_log_img(episode=e)

run.py CHANGED Viewed

@@ -68,9 +68,9 @@ def main():
     parser.add_argument(
         "--update_type",
         type=str,
-        choices=["first_visit", "every_visit"],
-        default="first_visit",
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
     )
     ### Environment parameters
@@ -159,7 +159,6 @@ def main():
                 test_every=args.test_every,
                 n_test_episodes=args.n_test_episodes,
                 max_steps=args.max_steps,
-                update_type=args.update_type,
                 log_wandb=args.wandb_project is not None,
                 save_best=True,
                 save_best_dir=args.save_dir,

     parser.add_argument(
         "--update_type",
         type=str,
+        choices=["on_policy", "off_policy"],
+        default="off_policy",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: off_policy)",
     )
     ### Environment parameters
                 test_every=args.test_every,
                 n_test_episodes=args.n_test_episodes,
                 max_steps=args.max_steps,
                 log_wandb=args.wandb_project is not None,
                 save_best=True,
                 save_best_dir=args.save_dir,