# nStepTDAgent.py — tabular n-step TD (Sarsa-style) agent demo
import sys | |
from tabulate import tabulate | |
import numpy as np | |
# Recorded trajectory: one [state, action, reward R_{t+1}] triple per step.
# The final entry is the terminal state and carries no action or reward.
episode = [
    ["s1", "E", 0],
    ["s2", "E", 1],
    ["s3", "N", 2],
    ["s3", "N", 3],
    ["s3", "S", 4],
    ["s6", "S", 5],
    ["s9", None, None],
]
# Map state labels ("s1".."s9") and action labels (N/E/S/W) to the row and
# column indices of the Q table, respectively.
index_map = {f"s{i + 1}": i for i in range(9)}
index_map.update({"N": 0, "E": 1, "S": 2, "W": 3})
class nStepTDAgent():
    """Tabular agent that learns Q(s, a) with n-step TD updates from a
    recorded episode (n-step Sarsa evaluation, Sutton & Barto Ch. 7)."""

    def __init__(self, alpha, gamma, num_state, num_actions):
        """
        Constructor

        Args:
            alpha: The learning rate (step size)
            gamma: The discount factor
            num_state: The number of states
            num_actions: The number of actions
        """
        self.alpha = alpha
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions
        # Action-value table, initialised to zero.
        self.Q = np.zeros((self.num_state, self.num_actions))

    def run_episode(self, n, episode):
        """
        Update the action-value function using the n-step TD update.

        Args:
            n: Number of reward steps used in each n-step return.
            episode: List of [state, action, reward] triples where the
                reward is R_{t+1}; the final entry is the terminal state
                with action and reward set to None.
        """
        # rew[t + 1] holds R_{t+1}; index 0 is a placeholder so reward
        # indices line up with the textbook's 1-based notation.
        rew = [0]
        bigT = sys.maxsize  # episode length T; unknown until terminal is seen
        print("T: ", bigT)
        # BUG FIX 1: the original iterated enumerate(episode.reverse());
        # list.reverse() returns None, so that raised TypeError (and the
        # algorithm must walk the episode forward anyway).
        # BUG FIX 2: the original loop stopped at len(episode)-1, so the
        # final updates for tau = T-n+1 .. T-1 never ran; the algorithm
        # needs t to run on until tau reaches T-1.
        t = 0
        while True:
            print("=" * 80)
            print("Step: ", t)
            if t < bigT:
                s_t, a_t, r_t1 = episode[t]
                print(f"  s_t: {s_t}, a_t: {a_t}, r_t1: {r_t1}")
                rew.append(r_t1)
                # The next entry is terminal when it carries no reward.
                if episode[t + 1][2] is None:
                    bigT = t + 1
                    print("TERMINAL => T: ", bigT)
            Tt = t - n + 1  # tau: the time whose estimate is updated now
            print(f"  Tt: {Tt}")
            if Tt >= 0:
                print(f'  ==============')
                # n-step return:
                #   G = sum_{i=tau+1}^{min(tau+n, T)} gamma^(i-tau-1) * R_i
                bigG = 0
                for i in range(Tt + 1, min(Tt + n, bigT) + 1):
                    print(f"    i: {i}")
                    print(f"    {bigG} += {self.gamma}^{i - Tt - 1} * {rew[i]}")
                    bigG += self.gamma**(i - Tt - 1) * rew[i]
                    print(f"    G: {bigG}")
                print(f'  --------------')
                # Bootstrap with Q(S_{tau+n}, A_{tau+n}) unless the n-step
                # window already reaches the terminal state.
                if Tt + n < bigT:
                    s_Tn, a_Tn = episode[Tt + n][0], episode[Tt + n][1]
                    print(f"    s_Tn: {s_Tn}, a_Tn: {a_Tn}")
                    s_Tni, a_Tni = index_map[s_Tn], index_map[a_Tn]
                    print(f"    {bigG} += {self.gamma}^{n} * {self.Q[s_Tni, a_Tni]}")
                    bigG += (self.gamma**n) * self.Q[s_Tni, a_Tni]
                    print(f"    G: {bigG}")
                print(f'  ==============')
                s_Tt, a_Tt = episode[Tt][0], episode[Tt][1]
                print(f"  => Update Q[{s_Tt}, {a_Tt}]")
                s_Tti, a_Tti = index_map[s_Tt], index_map[a_Tt]
                print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
                # Move Q(S_tau, A_tau) toward the n-step return G.
                self.Q[s_Tti, a_Tti] += self.alpha * (bigG - self.Q[s_Tti, a_Tti])
                print(f"  Q[{s_Tt}, {a_Tt}] = {self.Q[s_Tti, a_Tti]}")
                print(f"Q:")
                print(tabulate(self.Q, tablefmt="fancy_grid"))
            # Done once the last reachable time step (tau = T-1) is updated.
            if Tt == bigT - 1:
                break
            t += 1
def main_r():
    """Demo entry point: build an agent and replay the recorded episode."""
    print("# nStepTDAgent.py")
    alpha, gamma, n_states, n_actions = 0.1, 0.9, 9, 4
    agent = nStepTDAgent(alpha, gamma, n_states, n_actions)
    print(agent.Q)
    agent.run_episode(3, episode)


if __name__ == "__main__":
    main_r()