lharri73 committed
Commit
93d8108
1 Parent(s): ea6b281

merge updates

DPAgent.py ADDED
@@ -0,0 +1,99 @@
+import gymnasium as gym
+import numpy as np
+from gymnasium.envs.toy_text.frozen_lake import generate_random_map
+from matplotlib import pyplot as plt
+from tqdm import trange
+
+
+class DP:
+    def __init__(self, env_name, gamma=0.9, theta=1e-10, **kwargs):
+        self.env = gym.make(env_name, **kwargs)
+        self.gamma = gamma
+        self.theta = theta
+        self.V = np.zeros(self.env.observation_space.n)
+        self.epsilon = 0
+
+    def policy(self, state, return_value=False):
+        # One-step lookahead: compute Q(state, a) for every action from the
+        # current value estimate, using the environment's transition table.
+        Q = np.zeros(self.env.action_space.n)
+        for action in range(self.env.action_space.n):
+            expected_value = 0
+            # P lives on the raw environment, behind the wrappers added by gym.make()
+            for probability, next_state, reward, done in self.env.unwrapped.P[state][action]:
+                # Treat the last state (the goal) as rewarding
+                if state == self.env.observation_space.n - 1:
+                    reward = 1
+                expected_value += probability * (reward + self.gamma * self.V[next_state])
+            Q[action] = expected_value
+        if return_value:
+            return np.argmax(Q), np.max(Q)
+        else:
+            return Q
+
+    def train(self):
+        # Value iteration: sweep all states until the largest update falls below theta
+        i = 0
+        while True:
+            delta = 0
+            V_prev = np.copy(self.V)
+            for state in range(self.env.observation_space.n):
+                action, value = self.policy(state, return_value=True)
+                self.V[state] = value
+                delta = max(delta, abs(V_prev[state] - self.V[state]))
+            if delta < self.theta:
+                break
+            i += 1
+            print(f"Iteration {i}: delta={delta}")
+
+        # Extract the greedy policy from the converged value function
+        policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
+        return self.V, policy
+
+    def save_policy(self, pth):
+        np.save(pth, self.V)
+
+    def load_policy(self, pth):
+        """
+        Not really loading the 'policy' but the state-value function;
+        named this way for the sake of a consistent agent interface.
+        """
+        self.V = np.load(pth)
+
+    def generate_episode(self, max_steps, render=False, **kwargs):
+        state, _ = self.env.reset()
+        episode_hist, solved, rgb_array = [], False, None
+
+        # Generate an episode following the current (greedy) policy
+        for _ in range(max_steps):
+            rgb_array = self.env.render() if render else None
+            # Act greedily with respect to the Q-values for this state
+            action = self.policy(state)
+            maction = np.argmax(action)
+            # Take the action and observe the reward and next state
+            next_state, reward, done, truncated, _ = self.env.step(maction)
+            # Keep track of the trajectory
+            episode_hist.append((state, maction, reward))
+            state = next_state
+
+            yield episode_hist, solved, rgb_array
+
+            # The episode ends once the agent reaches the goal. If the agent
+            # steps off the cliff, it is respawned at the start position
+            # without the episode terminating.
+            if done or truncated:
+                solved = True
+                break
+
+        rgb_array = self.env.render() if render else None
+
+        yield episode_hist, solved, rgb_array
+
+
+if __name__ == "__main__":
+    # The constructor expects an environment name and calls gym.make() itself
+    dp = DP('CliffWalking-v0', render_mode='human')
+    dp.train()
+    dp.save_policy('dp_policy.npy')
+
+    env = dp.env
+    state, _ = env.reset()
+    done = False
+    while not done:
+        # policy() returns the Q-values for the state; act greedily
+        action = int(np.argmax(dp.policy(state)))
+        state, reward, terminated, truncated, _ = env.step(action)
+        done = terminated or truncated
+        env.render()
+
+    # plt.savefig(f"imgs/{0}.png")
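For reference, a minimal usage sketch of the class above, run headlessly (no render_mode='human'). The module name DPAgent matches the added file; the output path and max_steps value are illustrative and not part of the commit.

    from DPAgent import DP

    # Train a value-iteration agent on CliffWalking and persist its value function
    agent = DP("CliffWalking-v0")
    V, greedy_policy = agent.train()
    agent.save_policy("dp_cliffwalking.npy")

    # Restore the value function later and roll out one greedy episode
    agent2 = DP("CliffWalking-v0")
    agent2.load_policy("dp_cliffwalking.npy")
    for episode_hist, solved, _ in agent2.generate_episode(max_steps=200):
        pass
    print("solved:", solved, "steps:", len(episode_hist))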
demo.py CHANGED
@@ -3,6 +3,7 @@ import time
 import numpy as np
 import gradio as gr
 from MonteCarloAgent import MonteCarloAgent
+from DPAgent import DP
 import scipy.ndimage
 import cv2
 
@@ -28,7 +29,7 @@ except FileNotFoundError:
 # All supported agents
 agent_map = {
     "MonteCarloAgent": MonteCarloAgent,
-    # TODO: Add DP Agent
+    "DPAgent": DP
 }
 action_map = {
     "CliffWalking-v0": {
@@ -161,7 +162,9 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
         episode_hist[-2] if len(episode_hist) > 1 else (None, None, None)
     )
     state, action, reward = episode_hist[-1]
-    curr_policy = agent.Pi[state]
+    curr_policy = agent.policy(state)
+    curr_policy -= np.min(curr_policy)
+    curr_policy = curr_policy / np.sum(curr_policy)
 
     # frame_env = cv2.resize(
     #     frame_env,
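The three added curr_policy lines replace the Monte Carlo agent's stored distribution (agent.Pi[state]) with one derived on the fly from the DP agent's Q-values. A small standalone illustration of that shift-and-normalize step, using a made-up Q-vector (a constant Q-vector would give a zero denominator):

    import numpy as np

    q = np.array([-13.0, -12.0, -14.0, -12.5])  # hypothetical Q-values for one state
    p = q - np.min(q)   # shift so the worst action maps to 0
    p = p / np.sum(p)   # rescale so the entries sum to 1
    print(p)            # ≈ [0.222, 0.444, 0.0, 0.333]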
policies/DPAgent_CliffWalking-v0_i219_g0.9.npy ADDED
Binary file (512 Bytes).