Andrei Cozma committed
Commit f902143 · 1 Parent(s): a905cc7
scripts/ExpectedSarsaAgent.py ADDED
@@ -0,0 +1,104 @@
+ import numpy as np
+
+
+ class ExpectedSarsaAgent:
+     def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
+         """
+         Constructor
+         Args:
+             epsilon: The degree of exploration
+             alpha: The learning rate (step size)
+             gamma: The discount factor
+             num_state: The number of states
+             num_actions: The number of actions
+             action_space: To call the random action
+         """
+         self.epsilon = epsilon
+         self.alpha = alpha
+         self.gamma = gamma
+         self.num_state = num_state
+         self.num_actions = num_actions
+
+         self.Q = np.zeros((self.num_state, self.num_actions))
+         self.action_space = action_space
+
+     def update(self, prev_state, next_state, reward, prev_action, next_action):
+         """
+         Update the action-value function using the Expected SARSA update:
+         Q(S, A) = Q(S, A) + alpha * (reward + gamma * sum_a pi(a | S_) * Q(S_, a) - Q(S, A))
+         where pi is the epsilon-greedy policy derived from Q.
+         Args:
+             prev_state: The previous state
+             next_state: The next state
+             reward: The reward for taking the respective action
+             prev_action: The previous action
+             next_action: The next action (unused; kept for a SARSA-compatible signature)
+         Returns:
+             None
+         """
+         predict = self.Q[prev_state, prev_action]
+
+         # Expected value of Q(next_state, .) under the epsilon-greedy policy.
+         expected_q = 0
+         q_max = np.max(self.Q[next_state, :])
+         greedy_actions = 0
+         for i in range(self.num_actions):
+             if self.Q[next_state][i] == q_max:
+                 greedy_actions += 1
+
+         non_greedy_action_probability = self.epsilon / self.num_actions
+         greedy_action_probability = (
+             (1 - self.epsilon) / greedy_actions
+         ) + non_greedy_action_probability
+
+         for i in range(self.num_actions):
+             if self.Q[next_state][i] == q_max:
+                 expected_q += self.Q[next_state][i] * greedy_action_probability
+             else:
+                 expected_q += self.Q[next_state][i] * non_greedy_action_probability
+
+         target = reward + self.gamma * expected_q
+         self.Q[prev_state, prev_action] += self.alpha * (target - predict)
+
+
+ episode = [
+     ["s1", "a1", -8],
+     ["s1", "a2", -16],
+     ["s2", "a1", 20],
+     ["s1", "a2", -10],
+     ["s2", "a1", None],
+ ]
+
+ index_map = {
+     "s1": 0,
+     "s2": 1,
+     "a1": 0,
+     "a2": 1,
+ }
+
+
+ def main_r():
+     # Reward is read from the current step: (s, a, r) -> (s2, a2).
+     print("# ExpectedSarsaAgent.py")
+     agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
+     print(agent.Q)
+     for i in range(len(episode) - 1):
+         print(f"# Step {i + 1}")
+         s, a, r = episode[i]
+         s2, a2, _ = episode[i + 1]
+         agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
+         print(agent.Q)
+
+
+ def main_rn():
+     # Variant that reads the reward from the next step instead.
+     print("# ExpectedSarsaAgent.py")
+     agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
+     print(agent.Q)
+     for i in range(len(episode) - 2):
+         print(f"# Step {i + 1}")
+         s, a, _ = episode[i]
+         s2, a2, r2 = episode[i + 1]
+         agent.update(index_map[s], index_map[s2], r2, index_map[a], index_map[a2])
+         print(agent.Q)
+
+
+ if __name__ == "__main__":
+     main_r()
+     print()
+     main_rn()
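Note (not part of the commit): the agent stores an action_space but never selects actions itself. A minimal sketch of the epsilon-greedy behaviour policy that the Expected SARSA backup above assumes is given below; choose_action is a hypothetical helper, not something defined in these scripts.

import numpy as np


def choose_action(Q, state, epsilon, rng=None):
    """Epsilon-greedy action selection consistent with the Expected SARSA backup."""
    rng = np.random.default_rng() if rng is None else rng
    num_actions = Q.shape[1]
    if rng.random() < epsilon:
        # Explore: pick uniformly among all actions.
        return int(rng.integers(num_actions))
    # Exploit: break ties among greedy actions uniformly at random.
    greedy = np.flatnonzero(Q[state] == Q[state].max())
    return int(rng.choice(greedy))

Under this policy, the probabilities used in update (epsilon / num_actions for every action, plus (1 - epsilon) / greedy_actions shared among the greedy ones) are exactly the action-selection probabilities.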
scripts/MonteCarloAgent.py ADDED
@@ -0,0 +1,51 @@
+ import numpy as np
+
+
+ episodes = [
+     [["A", "a1", 3], ["A", "a2", 2], ["B", "a1", -4], ["A", "a1", 4], ["B", "a1", -3]],
+     [["B", "a1", -2], ["A", "a1", 3], ["B", "a2", -3]],
+ ]
+
+ index_map = {
+     "states": {
+         "A": 0,
+         "B": 1,
+     },
+     "actions": {
+         "a1": 0,
+         "a2": 1,
+     },
+ }
+
+
+ def main_r():
+     print("# MonteCarloAgent.py")
+     alpha = 0.1  # step size, only used by the incremental update commented out below
+     num_states = 2
+
+     v = np.zeros(num_states)
+     rets = {s: [] for s in index_map["states"].keys()}
+
+     for ep in episodes:
+         print("=" * 80)
+         g = 0
+         ep_len = len(ep)
+         print(f"# Episode: {ep} (steps: {ep_len}) G: {g}")
+         # Walk the episode backwards, accumulating the (undiscounted) return G.
+         for t in range(ep_len - 1, -1, -1):
+             s, a, r = ep[t]
+             si = index_map["states"][s]
+             g = g + r
+             print(f"# Step {t + 1}:")
+             print(f"\ts: {s}, a: {a}, r: {r}")
+             print(f"\tG: {g}")
+             # First-visit check: update only if s does not appear earlier in the episode.
+             if s not in [x[0] for x in ep[:t]]:
+                 rets[s].append(g)
+                 # First-visit MC: V(s) is the plain average of the stored returns.
+                 v[si] = sum(rets[s]) / len(rets[s])
+                 # Constant-step-size alternative:
+                 # v[si] = v[si] + alpha * (g - v[si])
+
+             print(f"\tV[{s}] = {v[si]}")
+
+
+ if __name__ == "__main__":
+     main_r()
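As a quick sanity check (not part of the commit), the undiscounted first-visit averages for the two hard-coded episodes can be computed standalone. first_visit_mc below is a hypothetical reimplementation of the same rule and should yield V[A] = 1.0 and V[B] = -2.5 for this data.

def first_visit_mc(episodes):
    """Undiscounted first-visit Monte Carlo state-value estimates."""
    returns = {}
    for ep in episodes:
        g = 0
        for t in range(len(ep) - 1, -1, -1):
            s, _, r = ep[t]
            g += r
            if s not in [x[0] for x in ep[:t]]:  # first visit of s in this episode
                returns.setdefault(s, []).append(g)
    return {s: sum(gs) / len(gs) for s, gs in returns.items()}


episodes = [
    [["A", "a1", 3], ["A", "a2", 2], ["B", "a1", -4], ["A", "a1", 4], ["B", "a1", -3]],
    [["B", "a1", -2], ["A", "a1", 3], ["B", "a2", -3]],
]
print(first_visit_mc(episodes))  # expected: V[A] = 1.0, V[B] = -2.5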
scripts/QLearningAgent.py ADDED
@@ -0,0 +1,71 @@
+ import numpy as np
+
+
+ class QLearningAgent:
+     def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
+         """
+         Constructor
+         Args:
+             epsilon: The degree of exploration
+             alpha: The learning rate (step size)
+             gamma: The discount factor
+             num_state: The number of states
+             num_actions: The number of actions
+             action_space: To call the random action
+         """
+         self.epsilon = epsilon
+         self.alpha = alpha
+         self.gamma = gamma
+         self.num_state = num_state
+         self.num_actions = num_actions
+
+         self.Q = np.zeros((self.num_state, self.num_actions))
+         self.action_space = action_space
+
+     def update(self, state, state2, reward, action, action2):
+         """
+         Update the action-value function using the Q-Learning update:
+         Q(S, A) = Q(S, A) + alpha * (reward + gamma * max_a Q(S_, a) - Q(S, A))
+         Args:
+             state: The previous state
+             state2: The next state
+             reward: The reward for taking the respective action
+             action: The previous action
+             action2: The next action (unused; Q-learning is off-policy)
+         Returns:
+             None
+         """
+         predict = self.Q[state, action]
+         target = reward + self.gamma * np.max(self.Q[state2, :])
+         self.Q[state, action] += self.alpha * (target - predict)
+
+
+ episode = [
+     ["s1", "a1", -8],
+     ["s1", "a2", -16],
+     ["s2", "a1", 20],
+     ["s1", "a2", -10],
+     ["s2", "a1", None],
+ ]
+
+ index_map = {
+     "s1": 0,
+     "s2": 1,
+     "a1": 0,
+     "a2": 1,
+ }
+
+
+ def main_r():
+     print("# QLearningAgent.py")
+     agent = QLearningAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
+     print(agent.Q)
+     for i in range(len(episode) - 1):
+         print(f"# Step {i + 1}")
+         s, a, r = episode[i]
+         s2, a2, _ = episode[i + 1]
+         agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
+         print(agent.Q)
+
+
+ if __name__ == "__main__":
+     main_r()
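A hand-checked trace of the first backup (not part of the commit; assumes scripts/ is on the import path): with Q initialised to zeros, the transition (s1, a1, r = -8) to s1 gives target = -8 + 0.5 * max Q[s1, :] = -8, so Q[s1, a1] becomes 0 + 0.5 * (-8 - 0) = -4.

import numpy as np
from QLearningAgent import QLearningAgent  # assumes scripts/ is on the path

agent = QLearningAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
# First transition of the hard-coded episode: (s1, a1, r=-8) -> s1.
agent.update(0, 0, -8, 0, 1)
assert np.isclose(agent.Q[0, 0], -4.0)  # 0 + 0.5 * (-8 + 0.5 * 0 - 0)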
scripts/SarsaAgent.py ADDED
@@ -0,0 +1,76 @@
+ import numpy as np
+
+
+ class SarsaAgent:
+     """
+     The Agent that uses the SARSA update to improve its behaviour
+     """
+
+     def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
+         """
+         Constructor
+         Args:
+             epsilon: The degree of exploration
+             alpha: The learning rate (step size)
+             gamma: The discount factor
+             num_state: The number of states
+             num_actions: The number of actions
+             action_space: To call the random action
+         """
+         self.epsilon = epsilon
+         self.alpha = alpha
+         self.gamma = gamma
+         self.num_state = num_state
+         self.num_actions = num_actions
+
+         self.Q = np.zeros((self.num_state, self.num_actions))
+         self.action_space = action_space
+
+     def update(self, prev_state, next_state, reward, prev_action, next_action):
+         """
+         Update the action-value function using the SARSA update:
+         Q(S, A) = Q(S, A) + alpha * (reward + gamma * Q(S_, A_) - Q(S, A))
+         Args:
+             prev_state: The previous state
+             next_state: The next state
+             reward: The reward for taking the respective action
+             prev_action: The previous action
+             next_action: The next action
+         Returns:
+             None
+         """
+         predict = self.Q[prev_state, prev_action]
+         target = reward + self.gamma * self.Q[next_state, next_action]
+         self.Q[prev_state, prev_action] += self.alpha * (target - predict)
+
+
+ episode = [
+     ["s1", "a1", -8],
+     ["s1", "a2", -16],
+     ["s2", "a1", 20],
+     ["s1", "a2", -10],
+     ["s2", "a1", None],
+ ]
+
+ index_map = {
+     "s1": 0,
+     "s2": 1,
+     "a1": 0,
+     "a2": 1,
+ }
+
+
+ def main_r():
+     print("# SarsaAgent.py")
+     agent = SarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
+     print(agent.Q)
+     for i in range(len(episode) - 1):
+         print(f"# Step {i + 1}")
+         s, a, r = episode[i]
+         s2, a2, _ = episode[i + 1]
+         agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
+         print(agent.Q)
+
+
+ if __name__ == "__main__":
+     main_r()
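The third backup of the shared episode is where the on-policy SARSA target and the off-policy Q-learning target first differ; a hand-worked sketch (not part of the commit) follows. After the first two backups both agents have Q[s1, a1] = -4 and Q[s1, a2] = -8.

import numpy as np

gamma = 0.5
q_s1 = np.array([-4.0, -8.0])  # Q[s1, :] after the first two backups (both agents)

# Step 3: (s2, a1, r = 20) followed by (s1, a2).
r = 20
sarsa_target = r + gamma * q_s1[1]         # uses the action actually taken next: 20 + 0.5 * -8 = 16
qlearning_target = r + gamma * q_s1.max()  # uses the greedy action:              20 + 0.5 * -4 = 18
print(sarsa_target, qlearning_target)      # 16.0 18.0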
scripts/nStepTDAgent.py ADDED
@@ -0,0 +1,120 @@
+ # nStepTDAgent.py
+
+ import sys
+
+ from tabulate import tabulate
+ import numpy as np
+
+ # Each entry is (state, action, reward received on leaving that state);
+ # the final entry marks the terminal state.
+ episode = [["s1", "E", 0],
+            ["s2", "E", 1],
+            ["s3", "N", 2],
+            ["s3", "N", 3],
+            ["s3", "S", 4],
+            ["s6", "S", 5],
+            ["s9", None, None]]
+
+ index_map = {
+     "s1": 0,
+     "s2": 1,
+     "s3": 2,
+     "s4": 3,
+     "s5": 4,
+     "s6": 5,
+     "s7": 6,
+     "s8": 7,
+     "s9": 8,
+     "N": 0,
+     "E": 1,
+     "S": 2,
+     "W": 3,
+ }
+
+
+ class nStepTDAgent:
+     def __init__(self, alpha, gamma, num_state, num_actions):
+         """
+         Constructor
+         Args:
+             alpha: The learning rate (step size)
+             gamma: The discount factor
+             num_state: The number of states
+             num_actions: The number of actions
+         """
+         self.alpha = alpha
+         self.gamma = gamma
+         self.num_state = num_state
+         self.num_actions = num_actions
+
+         self.Q = np.zeros((self.num_state, self.num_actions))
+
+     def run_episode(self, n, episode):
+         """
+         Update the action-value function along the given episode using the
+         n-step TD (n-step SARSA) update.
+         """
+         rew = [0]  # placeholder so that rew[i] is R_i (rewards are 1-indexed)
+
+         bigT = sys.maxsize
+         print("T: ", bigT)
+         # Tt (tau) trails t by n - 1 steps, so iterate until Tt reaches T - 1.
+         for t in range(len(episode) + n - 1):
+             print("=" * 80)
+             print("Step: ", t)
+             if t < bigT:
+                 s_t, a_t, r_t1 = episode[t]
+                 print(f"  s_t: {s_t}, a_t: {a_t}, r_t1: {r_t1}")
+                 rew.append(r_t1)
+
+                 _, _, r_t2 = episode[t + 1]
+                 if r_t2 is None:
+                     bigT = t + 1
+                     print("TERMINAL => T: ", bigT)
+
+             Tt = t - n + 1
+             print(f"  Tt: {Tt}")
+             if Tt >= 0:
+                 print("  ==============")
+                 bigG = 0
+                 for i in range(Tt + 1, min(Tt + n, bigT) + 1):
+                     print(f"  i: {i}")
+                     r_i = rew[i]
+                     print(f"  r_t{i}: {r_i}")
+                     print(f"  {bigG} += {self.gamma}^{i - Tt - 1} * {r_i}")
+                     bigG += self.gamma ** (i - Tt - 1) * r_i
+                     print(f"  G: {bigG}")
+                 print("  --------------")
+                 if Tt + n < bigT:
+                     # Bootstrap on Q(S_{Tt+n}, A_{Tt+n}) when the episode has not terminated.
+                     s_Tn, a_Tn = episode[Tt + n][0], episode[Tt + n][1]
+                     print(f"  s_Tn: {s_Tn}, a_Tn: {a_Tn}")
+                     s_Tn, a_Tn = index_map[s_Tn], index_map[a_Tn]
+                     print(f"  {bigG} += {self.gamma}^{n} * {self.Q[s_Tn, a_Tn]}")
+                     bigG += (self.gamma ** n) * self.Q[s_Tn, a_Tn]
+                     print(f"  G: {bigG}")
+                 print("  ==============")
+
+                 s_Tt, a_Tt = episode[Tt][0], episode[Tt][1]
+                 print(f"  => Update Q[{s_Tt}, {a_Tt}]")
+                 s_Tti, a_Tti = index_map[s_Tt], index_map[a_Tt]
+                 print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
+                 self.Q[s_Tti, a_Tti] += self.alpha * (bigG - self.Q[s_Tti, a_Tti])
+                 print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
+                 print("Q:")
+                 print(tabulate(self.Q, tablefmt="fancy_grid"))
+                 if Tt == bigT - 1:
+                     break
+
+
+ def main_r():
+     print("# nStepTDAgent.py")
+     agent = nStepTDAgent(0.1, 0.9, 9, 4)
+     print(agent.Q)
+     agent.run_episode(3, episode)
+
+
+ if __name__ == "__main__":
+     main_r()
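A quick hand computation of the first backup this script performs (not part of the commit): with n = 3, gamma = 0.9, and Q initialised to zeros, the first update happens at tau = 0 and targets Q[s1, E].

gamma, alpha, n = 0.9, 0.1, 3
rewards = [0, 1, 2]  # R_1, R_2, R_3 from the hard-coded episode

# 3-step return from tau = 0, bootstrapping on Q[s3, N], which is still 0.
G = sum(gamma**k * r for k, r in enumerate(rewards)) + gamma**n * 0.0
print(G)                  # 0 + 0.9*1 + 0.81*2 = 2.52 (up to float rounding)
print(alpha * (G - 0.0))  # first value written to Q[s1, E]: 0.252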