Andrei Cozma committed
Commit · f902143
Parent(s): a905cc7

Updates

- scripts/ExpectedSarsaAgent.py +104 -0
- scripts/MonteCarloAgent.py +51 -0
- scripts/QLearningAgent.py +71 -0
- scripts/SarsaAgent.py +76 -0
- scripts/nStepTDAgent.py +120 -0
scripts/ExpectedSarsaAgent.py
ADDED
@@ -0,0 +1,104 @@
import numpy as np


class ExpectedSarsaAgent:
    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
        """
        Constructor
        Args:
            epsilon: The degree of exploration
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
            action_space: To call the random action
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))
        self.action_space = action_space

    def update(self, prev_state, next_state, reward, prev_action, next_action):
        """
        Update the action value function using the Expected SARSA update.
        Q(S, A) = Q(S, A) + alpha * (reward + gamma * E_pi[Q(S', A')] - Q(S, A))
        Args:
            prev_state: The previous state
            next_state: The next state
            reward: The reward for taking the respective action
            prev_action: The previous action
            next_action: The next action
        Returns:
            None
        """
        predict = self.Q[prev_state, prev_action]

        # Expected value of Q(S', .) under the epsilon-greedy policy:
        # greedy (max-valued) actions share the exploitation probability,
        # and every action also gets the epsilon / |A| exploration probability.
        expected_q = 0
        q_max = np.max(self.Q[next_state, :])
        greedy_actions = 0
        for i in range(self.num_actions):
            if self.Q[next_state][i] == q_max:
                greedy_actions += 1

        non_greedy_action_probability = self.epsilon / self.num_actions
        greedy_action_probability = (
            (1 - self.epsilon) / greedy_actions
        ) + non_greedy_action_probability

        for i in range(self.num_actions):
            if self.Q[next_state][i] == q_max:
                expected_q += self.Q[next_state][i] * greedy_action_probability
            else:
                expected_q += self.Q[next_state][i] * non_greedy_action_probability

        target = reward + self.gamma * expected_q
        self.Q[prev_state, prev_action] += self.alpha * (target - predict)


episode = [
    ["s1", "a1", -8],
    ["s1", "a2", -16],
    ["s2", "a1", 20],
    ["s1", "a2", -10],
    ["s2", "a1", None],
]

index_map = {
    "s1": 0,
    "s2": 1,
    "a1": 0,
    "a2": 1,
}


def main_r():
    # Uses the reward stored with the current step (s, a, r) for each update.
    print("# ExpectedSarsaAgent.py")
    agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 1):
        print(f"# Step {i + 1}")
        s, a, r = episode[i]
        s2, a2, _ = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
        print(agent.Q)


def main_rn():
    # Uses the reward stored with the *next* step instead, stopping one step
    # earlier to avoid the terminal entry whose reward is None.
    print("# ExpectedSarsaAgent.py")
    agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 2):
        print(f"# Step {i + 1}")
        s, a, _ = episode[i]
        s2, a2, r2 = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r2, index_map[a], index_map[a2])
        print(agent.Q)


if __name__ == "__main__":
    main_r()
    print()
    main_rn()
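
A quick hand-trace of the first update in main_r() (a sketch, not part of the commit): with the zero-initialized Q-table every next-state action ties for the max, so under epsilon = 0.1 both actions get probability 0.5, the expected next-state value is 0, and the update reduces to a plain TD(0) step.

# Sanity-check sketch, assuming ExpectedSarsaAgent is in scope as defined above.
# Tied greedy actions: each has probability (1 - 0.1) / 2 + 0.1 / 2 = 0.5,
# expected Q is 0, target = -8 + 0.5 * 0 = -8, so Q[s1, a1] moves by 0.5 * (-8 - 0).
agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
agent.update(0, 0, -8, 0, 1)
assert agent.Q[0, 0] == -4.0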
scripts/MonteCarloAgent.py
ADDED
@@ -0,0 +1,51 @@
import numpy as np


episodes = [
    [["A", "a1", 3], ["A", "a2", 2], ["B", "a1", -4], ["A", "a1", 4], ["B", "a1", -3]],
    [["B", "a1", -2], ["A", "a1", 3], ["B", "a2", -3]],
]

index_map = {
    "states": {
        "A": 0,
        "B": 1,
    },
    "actions": {
        "a1": 0,
        "a2": 1,
    },
}


def main_r():
    print("# MonteCarloAgent.py")
    alpha = 0.1
    num_states = 2

    v = np.zeros(num_states)
    rets = {s: [] for s in index_map["states"].keys()}

    for ep in episodes:
        print("=" * 80)
        g = 0
        ep_len = len(ep)
        print(f"# Episode: {ep} (steps: {ep_len}) G: {g}")
        # Walk the episode backward, accumulating the (undiscounted) return G.
        for t in range(ep_len - 1, -1, -1):
            s, a, r = ep[t]
            si = index_map["states"][s]
            g = g + r
            print(f"# Step {t + 1}:")
            print(f"\ts: {s}, a: {a}, r: {r}")
            print(f"\tG: {g}")
            # First-visit MC: record G unless s appears in the episode before time t
            if s not in [x[0] for x in ep[:t]]:
                rets[s].append(g)
                # alpha-scaled average of the recorded first-visit returns
                v[si] = alpha * (sum(rets[s]) / len(rets[s]))
                # v[si] = v[si] + alpha * (g - v[si])

                print(f"\tV[{s}] = {v[si]}")


if __name__ == "__main__":
    main_r()
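
A hand check of the first episode (a sketch, not part of the commit): walking backward with undiscounted returns, the first-visit returns are G(A) = 2 (from t = 0) and G(B) = -3 (from t = 2), so with the alpha = 0.1 scaling above the script prints V[A] = 0.2 and V[B] = -0.3 after the first episode.

# Illustrative check, assuming the episodes list defined above.
ep = episodes[0]
g_a = sum(r for _, _, r in ep)        # return from the first visit to A (t = 0): 3 + 2 - 4 + 4 - 3
g_b = sum(r for _, _, r in ep[2:])    # return from the first visit to B (t = 2): -4 + 4 - 3
assert (g_a, g_b) == (2, -3)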
scripts/QLearningAgent.py
ADDED
@@ -0,0 +1,71 @@
import numpy as np


class QLearningAgent:
    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
        """
        Constructor
        Args:
            epsilon: The degree of exploration
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
            action_space: To call the random action
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))
        self.action_space = action_space

    def update(self, state, state2, reward, action, action2):
        """
        Update the action value function using the Q-Learning update.
        Q(S, A) = Q(S, A) + alpha * (reward + gamma * max_a Q(S', a) - Q(S, A))
        Args:
            state: The previous state
            state2: The next state
            reward: The reward for taking the respective action
            action: The previous action
            action2: The next action (unused; Q-Learning bootstraps from the greedy action)
        Returns:
            None
        """
        predict = self.Q[state, action]
        target = reward + self.gamma * np.max(self.Q[state2, :])
        self.Q[state, action] += self.alpha * (target - predict)


episode = [
    ["s1", "a1", -8],
    ["s1", "a2", -16],
    ["s2", "a1", 20],
    ["s1", "a2", -10],
    ["s2", "a1", None],
]

index_map = {
    "s1": 0,
    "s2": 1,
    "a1": 0,
    "a2": 1,
}


def main_r():
    print("# QLearningAgent.py")
    agent = QLearningAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 1):
        print(f"# Step {i + 1}")
        s, a, r = episode[i]
        s2, a2, r2 = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
        print(agent.Q)


if __name__ == "__main__":
    main_r()
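
As a quick check (a sketch, not part of the commit): because the Q-table starts at zero, the first update reduces to target = -8 + 0.5 * max(0, 0) = -8, so Q[s1, a1] becomes -4, the same value the SARSA variants produce on their first step.

# First-step check sketch, assuming QLearningAgent is defined as above.
agent = QLearningAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
agent.update(0, 0, -8, 0, 1)   # state s1, next state s1, reward -8, action a1
assert agent.Q[0, 0] == -4.0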
scripts/SarsaAgent.py
ADDED
@@ -0,0 +1,76 @@
import numpy as np


class SarsaAgent:
    """
    The agent that uses the SARSA update to improve its behaviour
    """

    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
        """
        Constructor
        Args:
            epsilon: The degree of exploration
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
            action_space: To call the random action
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))
        self.action_space = action_space

    def update(self, prev_state, next_state, reward, prev_action, next_action):
        """
        Update the action value function using the SARSA update.
        Q(S, A) = Q(S, A) + alpha * (reward + gamma * Q(S', A') - Q(S, A))
        Args:
            prev_state: The previous state
            next_state: The next state
            reward: The reward for taking the respective action
            prev_action: The previous action
            next_action: The next action
        Returns:
            None
        """
        predict = self.Q[prev_state, prev_action]
        target = reward + self.gamma * self.Q[next_state, next_action]
        self.Q[prev_state, prev_action] += self.alpha * (target - predict)


episode = [
    ["s1", "a1", -8],
    ["s1", "a2", -16],
    ["s2", "a1", 20],
    ["s1", "a2", -10],
    ["s2", "a1", None],
]

index_map = {
    "s1": 0,
    "s2": 1,
    "a1": 0,
    "a2": 1,
}


def main_r():
    print("# SarsaAgent.py")
    agent = SarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 1):
        print(f"# Step {i + 1}")
        s, a, r = episode[i]
        s2, a2, r2 = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
        print(agent.Q)


if __name__ == "__main__":
    main_r()
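
Where SARSA and Q-Learning first diverge on this episode is step 3 (a sketch, not part of the commit): SARSA bootstraps from the action actually taken next, Q[s1, a2] = -8, while Q-Learning would use max_a Q[s1, a] = -4.

# Trace of the first three updates of main_r(), assuming SarsaAgent as above.
agent = SarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
agent.update(0, 0, -8, 0, 1)    # step 1: Q[s1, a1] -> -4
agent.update(0, 1, -16, 1, 0)   # step 2: Q[s1, a2] -> -8
agent.update(1, 0, 20, 0, 1)    # step 3: target = 20 + 0.5 * Q[s1, a2] = 16
assert agent.Q[1, 0] == 8.0     # Q-Learning would give 9 here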
scripts/nStepTDAgent.py
ADDED
@@ -0,0 +1,120 @@
# nStepTDAgent.py

import sys

from tabulate import tabulate
import numpy as np

episode = [["s1", "E", 0],
           ["s2", "E", 1],
           ["s3", "N", 2],
           ["s3", "N", 3],
           ["s3", "S", 4],
           ["s6", "S", 5],
           ["s9", None, None]]

index_map = {
    "s1": 0,
    "s2": 1,
    "s3": 2,
    "s4": 3,
    "s5": 4,
    "s6": 5,
    "s7": 6,
    "s8": 7,
    "s9": 8,
    "N": 0,
    "E": 1,
    "S": 2,
    "W": 3
}


class nStepTDAgent:
    def __init__(self, alpha, gamma, num_state, num_actions):
        """
        Constructor
        Args:
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
        """
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))

    def run_episode(self, n, episode):
        """
        Update the action value function using the n-step TD update.
        """

        rew = [0]  # 1-indexed rewards: rew[t + 1] is the reward observed after step t

        bigT = sys.maxsize
        print("T: ", bigT)
        # Iterate forward in time; keep going past the end of the episode so
        # every tau (Tt) up to T - 1 receives its update.
        for t in range(len(episode) + n):
            print("=" * 80)
            print("Step: ", t)
            if t < bigT:
                s_t, a_t, r_t1 = episode[t]
                print(f"  s_t: {s_t}, a_t: {a_t}, r_t1: {r_t1}")
                s_t1, _, r_t2 = episode[t + 1]
                rew.append(r_t1)

                # The entry whose reward is None marks the terminal time T.
                if r_t2 is None:
                    bigT = t + 1
                    print("TERMINAL => T: ", bigT)

            Tt = t - n + 1
            print(f"  Tt: {Tt}")
            if Tt >= 0:
                print("  ==============")
                bigG = 0
                for i in range(Tt + 1, min(Tt + n, bigT) + 1):
                    print(f"  i: {i}")
                    r_t1 = rew[i]
                    print(f"  r_t{i}: {r_t1}")
                    print(f"  {bigG} += {self.gamma}^{i - Tt - 1} * {r_t1}")
                    bigG += self.gamma**(i - Tt - 1) * r_t1
                    print(f"  G: {bigG}")
                print("  --------------")
                if Tt + n < bigT:
                    s_Tn, a_Tn = episode[Tt + n][0], episode[Tt + n][1]

                    print(f"  s_Tn: {s_Tn}, a_Tn: {a_Tn}")
                    s_Tn, a_Tn = index_map[s_Tn], index_map[a_Tn]
                    print(f"  {bigG} += {self.gamma}^{n} * {self.Q[s_Tn, a_Tn]}")
                    bigG += (self.gamma**n) * self.Q[s_Tn, a_Tn]
                    print(f"  G: {bigG}")
                print("  ==============")

                s_Tt, a_Tt = episode[Tt][0], episode[Tt][1]
                print(f"  => Update Q[{s_Tt}, {a_Tt}]")
                s_Tti, a_Tti = index_map[s_Tt], index_map[a_Tt]
                print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
                self.Q[s_Tti, a_Tti] += self.alpha * (bigG - self.Q[s_Tti, a_Tti])
                print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
                print("Q:")
                print(tabulate(self.Q, tablefmt="fancy_grid"))
            if Tt == bigT - 1:
                break


def main_r():
    print("# nStepTDAgent.py")
    agent = nStepTDAgent(0.1, 0.9, 9, 4)
    print(agent.Q)
    agent.run_episode(3, episode)


if __name__ == "__main__":
    main_r()
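
A hand-trace of the first n-step update (tau = 0, reached at t = n - 1 = 2; a sketch, not part of the commit, and assuming the forward-in-time loop above): G = 0 + 0.9 * 1 + 0.9^2 * 2 = 2.52, the bootstrap term 0.9^3 * Q[s3, N] is still zero, so Q[s1, E] becomes 0.1 * 2.52 = 0.252.

# Check sketch, assuming nStepTDAgent, episode and index_map as defined above.
agent = nStepTDAgent(0.1, 0.9, 9, 4)
agent.run_episode(3, episode)
# tau = 0: G = 0 + 0.9 * 1 + 0.81 * 2 = 2.52, and (s1, E) is never updated again.
assert abs(agent.Q[index_map["s1"], index_map["E"]] - 0.252) < 1e-9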