Update
- DPAgent.py +7 -34
- MCAgent.py +1 -0
DPAgent.py
CHANGED
@@ -3,15 +3,14 @@ import numpy as np
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
 from tqdm import trange
+from Shared import Shared
 
 
-class DPAgent:
-    def __init__(self,
-
-        self.gamma = gamma
+class DPAgent(Shared):
+    def __init__(self, theta=1e-10, **kwargs):
+        super().__init__(**kwargs)
         self.theta = theta
         self.V = np.zeros(self.env.observation_space.n)
-        self.epsilon = 0
         self.Pi = None
 
     def policy(self, state):
@@ -39,7 +38,7 @@ class DPAgent:
             if delta < self.theta:
                 break
             i += 1
-
+            self.test()
             print(f"Iteration {i}: delta={delta}")
             # break
 
@@ -52,36 +51,10 @@ class DPAgent:
                     # if state == self.env.observation_space.n-1: reward = 1
                     expected_value += probability * (reward + self.gamma * self.V[next_state])
                 self.Pi[s,a] = expected_value
+        self.Pi = np.argmax(self.Pi, axis=1)
+        print(self.Pi)
         # return self.V, self.Pi
 
-    def generate_episode(self, max_steps, render=False, **kwargs):
-        state, _ = self.env.reset()
-        episode_hist, solved, rgb_array = [], False, None
-
-        # Generate an episode following the current policy
-        for _ in range(max_steps):
-            rgb_array = self.env.render() if render else None
-            # Sample an action from the policy
-            action = self.policy(state)
-            maction = np.argmax(action)
-            # Take the action and observe the reward and next state
-            next_state, reward, done, truncated, _ = self.env.step(maction)
-            # Keeping track of the trajectory
-            episode_hist.append((state, maction, reward))
-            state = next_state
-
-            yield episode_hist, solved, rgb_array
-
-            # This is where the agent got to the goal.
-            # In the case in which agent jumped off the cliff, it is simply respawned at the start position without termination.
-            if done or truncated:
-                solved = True
-                break
-
-        rgb_array = self.env.render() if render else None
-
-        yield episode_hist, solved, rgb_array
-
 
 if __name__ == "__main__":
     # env = gym.make('FrozenLake-v1', render_mode='human')
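For context on what the surviving fragments compute: DPAgent now inherits its setup from Shared (the removed self.gamma and self.epsilon lines suggest those attributes moved there), runs value iteration until the largest per-sweep change delta falls below theta, calls self.test() once per sweep (test() is defined elsewhere, presumably in Shared, and is not sketched here), evaluates every (state, action) pair one step ahead with the tabular model, and then collapses the Pi table to a greedy action per state with np.argmax(self.Pi, axis=1). The full method bodies are not part of this commit, so the following is only a minimal sketch of that loop, assuming a Gymnasium FrozenLake-style env that exposes its transition model as env.unwrapped.P; names like DPAgentSketch and value_iterate are illustrative, not the repository's API.

import numpy as np
import gymnasium as gym


class DPAgentSketch:
    # Illustrative stand-in for DPAgent; not the repository's actual class.

    def __init__(self, env, gamma=0.99, theta=1e-10):
        self.env = env
        self.gamma = gamma    # discount factor (assumed to live in Shared after this commit)
        self.theta = theta    # convergence threshold, as in the diff
        self.V = np.zeros(env.observation_space.n)
        self.Pi = None

    def value_iterate(self):
        P = self.env.unwrapped.P  # tabular model: P[s][a] -> [(prob, next_state, reward, terminated), ...]
        n_s, n_a = self.env.observation_space.n, self.env.action_space.n
        i = 0
        while True:
            delta = 0.0
            for s in range(n_s):
                v_old = self.V[s]
                # One-step lookahead: expected return of each action under the current V
                q = np.zeros(n_a)
                for a in range(n_a):
                    for probability, next_state, reward, _ in P[s][a]:
                        q[a] += probability * (reward + self.gamma * self.V[next_state])
                self.V[s] = q.max()
                delta = max(delta, abs(v_old - self.V[s]))
            i += 1
            print(f"Iteration {i}: delta={delta}")
            if delta < self.theta:
                break

        # Greedy policy extraction, mirroring `self.Pi[s,a] = expected_value`
        # followed by `self.Pi = np.argmax(self.Pi, axis=1)` in the diff.
        self.Pi = np.zeros((n_s, n_a))
        for s in range(n_s):
            for a in range(n_a):
                for probability, next_state, reward, _ in P[s][a]:
                    self.Pi[s, a] += probability * (reward + self.gamma * self.V[next_state])
        self.Pi = np.argmax(self.Pi, axis=1)
        return self.V, self.Pi


if __name__ == "__main__":
    env = gym.make("FrozenLake-v1")
    agent = DPAgentSketch(env)
    V, Pi = agent.value_iterate()
    print(V.reshape(4, 4))   # state values on the default 4x4 map
    print(Pi.reshape(4, 4))  # greedy action per state (0=left, 1=down, 2=right, 3=up)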
MCAgent.py
CHANGED
@@ -2,6 +2,7 @@ import numpy as np
 from tqdm import tqdm
 from Shared import Shared
 import wandb
+from Shared import Shared
 
 class MCAgent(Shared):
 
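Shared.py itself is not touched by this commit, so its interface can only be inferred: DPAgent's __init__ now forwards **kwargs to super().__init__(), the removed self.gamma and self.epsilon assignments suggest those attributes live in the base class, and the generate_episode generator deleted from DPAgent presumably sits there too, where MCAgent can also use it. The sketch below is a guess at that shape under those assumptions, not the actual contents of Shared.py; env_name, choose_action, and the constructor arguments are hypothetical.

import gymnasium as gym


class Shared:
    # Hypothetical reconstruction of the Shared base class; the real Shared.py is not in this diff.

    def __init__(self, env_name="FrozenLake-v1", gamma=0.99, epsilon=0.1, render_mode=None):
        # Pass render_mode="rgb_array" if episodes will be rendered.
        self.env = gym.make(env_name, render_mode=render_mode)
        self.gamma = gamma      # removed from DPAgent.__init__ in this commit
        self.epsilon = epsilon  # DPAgent no longer hard-codes epsilon = 0
        self.Pi = None          # filled in by the subclass (DP, MC, ...)

    def choose_action(self, state):
        # Greedy tabular policy; epsilon-greedy subclasses would override this.
        return int(self.Pi[state])

    def generate_episode(self, max_steps, render=False, **kwargs):
        # Generator form of the method this commit removes from DPAgent:
        # yields the partial trajectory after every step so a caller can animate it.
        state, _ = self.env.reset()
        episode_hist, solved = [], False
        for _ in range(max_steps):
            rgb_array = self.env.render() if render else None
            action = self.choose_action(state)
            next_state, reward, done, truncated, _ = self.env.step(action)
            episode_hist.append((state, action, reward))
            state = next_state
            yield episode_hist, solved, rgb_array
            if done or truncated:
                solved = True
                break
        rgb_array = self.env.render() if render else None
        yield episode_hist, solved, rgb_array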