Andrei Cozma committed
Commit · f902143
Parent(s): a905cc7

Updates

- scripts/ExpectedSarsaAgent.py +104 -0
- scripts/MonteCarloAgent.py +51 -0
- scripts/QLearningAgent.py +71 -0
- scripts/SarsaAgent.py +76 -0
- scripts/nStepTDAgent.py +120 -0
scripts/ExpectedSarsaAgent.py
ADDED
@@ -0,0 +1,104 @@
import numpy as np


class ExpectedSarsaAgent:
    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
        """
        Constructor
        Args:
            epsilon: The degree of exploration
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
            action_space: To call the random action
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))
        self.action_space = action_space

    def update(self, prev_state, next_state, reward, prev_action, next_action):
        """
        Update the action value function using the Expected SARSA update.
        Q(S, A) = Q(S, A) + alpha * (reward + gamma * E_pi[Q(S', A')] - Q(S, A))
        Args:
            prev_state: The previous state
            next_state: The next state
            reward: The reward for taking the respective action
            prev_action: The previous action
            next_action: The next action
        Returns:
            None
        """
        predict = self.Q[prev_state, prev_action]

        # Expected value of Q(S', .) under the epsilon-greedy policy:
        # greedy (max-valued) actions share the exploitation probability,
        # and every action also gets the epsilon / |A| exploration probability.
        expected_q = 0
        q_max = np.max(self.Q[next_state, :])
        greedy_actions = 0
        for i in range(self.num_actions):
            if self.Q[next_state][i] == q_max:
                greedy_actions += 1

        non_greedy_action_probability = self.epsilon / self.num_actions
        greedy_action_probability = (
            (1 - self.epsilon) / greedy_actions
        ) + non_greedy_action_probability

        for i in range(self.num_actions):
            if self.Q[next_state][i] == q_max:
                expected_q += self.Q[next_state][i] * greedy_action_probability
            else:
                expected_q += self.Q[next_state][i] * non_greedy_action_probability

        target = reward + self.gamma * expected_q
        self.Q[prev_state, prev_action] += self.alpha * (target - predict)


episode = [
    ["s1", "a1", -8],
    ["s1", "a2", -16],
    ["s2", "a1", 20],
    ["s1", "a2", -10],
    ["s2", "a1", None],
]

index_map = {
    "s1": 0,
    "s2": 1,
    "a1": 0,
    "a2": 1,
}


def main_r():
    # Uses the reward stored with the current step (s, a, r) for each update.
    print("# ExpectedSarsaAgent.py")
    agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 1):
        print(f"# Step {i + 1}")
        s, a, r = episode[i]
        s2, a2, _ = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
        print(agent.Q)


def main_rn():
    # Uses the reward stored with the *next* step instead, stopping one step
    # earlier to avoid the terminal entry whose reward is None.
    print("# ExpectedSarsaAgent.py")
    agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 2):
        print(f"# Step {i + 1}")
        s, a, _ = episode[i]
        s2, a2, r2 = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r2, index_map[a], index_map[a2])
        print(agent.Q)


if __name__ == "__main__":
    main_r()
    print()
    main_rn()
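
A quick hand-trace of the first update in main_r() (a sketch, not part of the commit): with the zero-initialized Q-table every next-state action ties for the max, so under epsilon = 0.1 both actions get probability 0.5, the expected next-state value is 0, and the update reduces to a plain TD(0) step.

# Sanity-check sketch, assuming ExpectedSarsaAgent is in scope as defined above.
# Tied greedy actions: each has probability (1 - 0.1) / 2 + 0.1 / 2 = 0.5,
# expected Q is 0, target = -8 + 0.5 * 0 = -8, so Q[s1, a1] moves by 0.5 * (-8 - 0).
agent = ExpectedSarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
agent.update(0, 0, -8, 0, 1)
assert agent.Q[0, 0] == -4.0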
scripts/MonteCarloAgent.py
ADDED
@@ -0,0 +1,51 @@
import numpy as np


episodes = [
    [["A", "a1", 3], ["A", "a2", 2], ["B", "a1", -4], ["A", "a1", 4], ["B", "a1", -3]],
    [["B", "a1", -2], ["A", "a1", 3], ["B", "a2", -3]],
]

index_map = {
    "states": {
        "A": 0,
        "B": 1,
    },
    "actions": {
        "a1": 0,
        "a2": 1,
    },
}


def main_r():
    print("# MonteCarloAgent.py")
    alpha = 0.1
    num_states = 2

    v = np.zeros(num_states)
    rets = {s: [] for s in index_map["states"].keys()}

    for ep in episodes:
        print("=" * 80)
        g = 0
        ep_len = len(ep)
        print(f"# Episode: {ep} (steps: {ep_len}) G: {g}")
        # Walk the episode backward, accumulating the (undiscounted) return G.
        for t in range(ep_len - 1, -1, -1):
            s, a, r = ep[t]
            si = index_map["states"][s]
            g = g + r
            print(f"# Step {t + 1}:")
            print(f"\ts: {s}, a: {a}, r: {r}")
            print(f"\tG: {g}")
            # First-visit MC: record G unless s appears in the episode before time t
            if s not in [x[0] for x in ep[:t]]:
                rets[s].append(g)
                # alpha-scaled average of the recorded first-visit returns
                v[si] = alpha * (sum(rets[s]) / len(rets[s]))
                # v[si] = v[si] + alpha * (g - v[si])

                print(f"\tV[{s}] = {v[si]}")


if __name__ == "__main__":
    main_r()
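
A hand check of the first episode (a sketch, not part of the commit): walking backward with undiscounted returns, the first-visit returns are G(A) = 2 (from t = 0) and G(B) = -3 (from t = 2), so with the alpha = 0.1 scaling above the script prints V[A] = 0.2 and V[B] = -0.3 after the first episode.

# Illustrative check, assuming the episodes list defined above.
ep = episodes[0]
g_a = sum(r for _, _, r in ep)        # return from the first visit to A (t = 0): 3 + 2 - 4 + 4 - 3
g_b = sum(r for _, _, r in ep[2:])    # return from the first visit to B (t = 2): -4 + 4 - 3
assert (g_a, g_b) == (2, -3)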
scripts/QLearningAgent.py
ADDED
@@ -0,0 +1,71 @@
import numpy as np


class QLearningAgent:
    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
        """
        Constructor
        Args:
            epsilon: The degree of exploration
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
            action_space: To call the random action
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))
        self.action_space = action_space

    def update(self, state, state2, reward, action, action2):
        """
        Update the action value function using the Q-Learning update.
        Q(S, A) = Q(S, A) + alpha * (reward + gamma * max_a Q(S', a) - Q(S, A))
        Args:
            state: The previous state
            state2: The next state
            reward: The reward for taking the respective action
            action: The previous action
            action2: The next action (unused; Q-Learning bootstraps from the greedy action)
        Returns:
            None
        """
        predict = self.Q[state, action]
        target = reward + self.gamma * np.max(self.Q[state2, :])
        self.Q[state, action] += self.alpha * (target - predict)


episode = [
    ["s1", "a1", -8],
    ["s1", "a2", -16],
    ["s2", "a1", 20],
    ["s1", "a2", -10],
    ["s2", "a1", None],
]

index_map = {
    "s1": 0,
    "s2": 1,
    "a1": 0,
    "a2": 1,
}


def main_r():
    print("# QLearningAgent.py")
    agent = QLearningAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 1):
        print(f"# Step {i + 1}")
        s, a, r = episode[i]
        s2, a2, r2 = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
        print(agent.Q)


if __name__ == "__main__":
    main_r()
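
As a quick check (a sketch, not part of the commit): because the Q-table starts at zero, the first update reduces to target = -8 + 0.5 * max(0, 0) = -8, so Q[s1, a1] becomes -4, the same value the SARSA variants produce on their first step.

# First-step check sketch, assuming QLearningAgent is defined as above.
agent = QLearningAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
agent.update(0, 0, -8, 0, 1)   # state s1, next state s1, reward -8, action a1
assert agent.Q[0, 0] == -4.0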
scripts/SarsaAgent.py
ADDED
@@ -0,0 +1,76 @@
import numpy as np


class SarsaAgent:
    """
    The agent that uses the SARSA update to improve its behaviour
    """

    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space):
        """
        Constructor
        Args:
            epsilon: The degree of exploration
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
            action_space: To call the random action
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))
        self.action_space = action_space

    def update(self, prev_state, next_state, reward, prev_action, next_action):
        """
        Update the action value function using the SARSA update.
        Q(S, A) = Q(S, A) + alpha * (reward + gamma * Q(S', A') - Q(S, A))
        Args:
            prev_state: The previous state
            next_state: The next state
            reward: The reward for taking the respective action
            prev_action: The previous action
            next_action: The next action
        Returns:
            None
        """
        predict = self.Q[prev_state, prev_action]
        target = reward + self.gamma * self.Q[next_state, next_action]
        self.Q[prev_state, prev_action] += self.alpha * (target - predict)


episode = [
    ["s1", "a1", -8],
    ["s1", "a2", -16],
    ["s2", "a1", 20],
    ["s1", "a2", -10],
    ["s2", "a1", None],
]

index_map = {
    "s1": 0,
    "s2": 1,
    "a1": 0,
    "a2": 1,
}


def main_r():
    print("# SarsaAgent.py")
    agent = SarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
    print(agent.Q)
    for i in range(len(episode) - 1):
        print(f"# Step {i + 1}")
        s, a, r = episode[i]
        s2, a2, r2 = episode[i + 1]
        agent.update(index_map[s], index_map[s2], r, index_map[a], index_map[a2])
        print(agent.Q)


if __name__ == "__main__":
    main_r()
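
Where SARSA and Q-Learning first diverge on this episode is step 3 (a sketch, not part of the commit): SARSA bootstraps from the action actually taken next, Q[s1, a2] = -8, while Q-Learning would use max_a Q[s1, a] = -4.

# Trace of the first three updates of main_r(), assuming SarsaAgent as above.
agent = SarsaAgent(0.1, 0.5, 0.5, 2, 2, [0, 1])
agent.update(0, 0, -8, 0, 1)    # step 1: Q[s1, a1] -> -4
agent.update(0, 1, -16, 1, 0)   # step 2: Q[s1, a2] -> -8
agent.update(1, 0, 20, 0, 1)    # step 3: target = 20 + 0.5 * Q[s1, a2] = 16
assert agent.Q[1, 0] == 8.0     # Q-Learning would give 9 here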
scripts/nStepTDAgent.py
ADDED
@@ -0,0 +1,120 @@
# nStepTDAgent.py

import sys

from tabulate import tabulate
import numpy as np

episode = [["s1", "E", 0],
           ["s2", "E", 1],
           ["s3", "N", 2],
           ["s3", "N", 3],
           ["s3", "S", 4],
           ["s6", "S", 5],
           ["s9", None, None]]

index_map = {
    "s1": 0,
    "s2": 1,
    "s3": 2,
    "s4": 3,
    "s5": 4,
    "s6": 5,
    "s7": 6,
    "s8": 7,
    "s9": 8,
    "N": 0,
    "E": 1,
    "S": 2,
    "W": 3
}


class nStepTDAgent:
    def __init__(self, alpha, gamma, num_state, num_actions):
        """
        Constructor
        Args:
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
        """
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions

        self.Q = np.zeros((self.num_state, self.num_actions))

    def run_episode(self, n, episode):
        """
        Update the action value function using the n-step TD update.
        """

        rew = [0]  # 1-indexed rewards: rew[t + 1] is the reward observed after step t

        bigT = sys.maxsize
        print("T: ", bigT)
        # Iterate forward in time; keep going past the end of the episode so
        # every tau (Tt) up to T - 1 receives its update.
        for t in range(len(episode) + n):
            print("=" * 80)
            print("Step: ", t)
            if t < bigT:
                s_t, a_t, r_t1 = episode[t]
                print(f"  s_t: {s_t}, a_t: {a_t}, r_t1: {r_t1}")
                s_t1, _, r_t2 = episode[t + 1]
                rew.append(r_t1)

                # The entry whose reward is None marks the terminal time T.
                if r_t2 is None:
                    bigT = t + 1
                    print("TERMINAL => T: ", bigT)

            Tt = t - n + 1
            print(f"  Tt: {Tt}")
            if Tt >= 0:
                print("  ==============")
                bigG = 0
                for i in range(Tt + 1, min(Tt + n, bigT) + 1):
                    print(f"  i: {i}")
                    r_t1 = rew[i]
                    print(f"  r_t{i}: {r_t1}")
                    print(f"  {bigG} += {self.gamma}^{i - Tt - 1} * {r_t1}")
                    bigG += self.gamma**(i - Tt - 1) * r_t1
                    print(f"  G: {bigG}")
                print("  --------------")
                if Tt + n < bigT:
                    s_Tn, a_Tn = episode[Tt + n][0], episode[Tt + n][1]

                    print(f"  s_Tn: {s_Tn}, a_Tn: {a_Tn}")
                    s_Tn, a_Tn = index_map[s_Tn], index_map[a_Tn]
                    print(f"  {bigG} += {self.gamma}^{n} * {self.Q[s_Tn, a_Tn]}")
                    bigG += (self.gamma**n) * self.Q[s_Tn, a_Tn]
                    print(f"  G: {bigG}")
                print("  ==============")

                s_Tt, a_Tt = episode[Tt][0], episode[Tt][1]
                print(f"  => Update Q[{s_Tt}, {a_Tt}]")
                s_Tti, a_Tti = index_map[s_Tt], index_map[a_Tt]
                print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
                self.Q[s_Tti, a_Tti] += self.alpha * (bigG - self.Q[s_Tti, a_Tti])
                print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
                print("Q:")
                print(tabulate(self.Q, tablefmt="fancy_grid"))
            if Tt == bigT - 1:
                break


def main_r():
    print("# nStepTDAgent.py")
    agent = nStepTDAgent(0.1, 0.9, 9, 4)
    print(agent.Q)
    agent.run_episode(3, episode)


if __name__ == "__main__":
    main_r()
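
A hand-trace of the first n-step update (tau = 0, reached at t = n - 1 = 2; a sketch, not part of the commit, and assuming the forward-in-time loop above): G = 0 + 0.9 * 1 + 0.9^2 * 2 = 2.52, the bootstrap term 0.9^3 * Q[s3, N] is still zero, so Q[s1, E] becomes 0.1 * 2.52 = 0.252.

# Check sketch, assuming nStepTDAgent, episode and index_map as defined above.
agent = nStepTDAgent(0.1, 0.9, 9, 4)
agent.run_episode(3, episode)
# tau = 0: G = 0 + 0.9 * 1 + 0.81 * 2 = 2.52, and (s1, E) is never updated again.
assert abs(agent.Q[index_map["s1"], index_map["E"]] - 0.252) < 1e-9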