merge updates

Files changed:
- DPAgent.py +99 -0
- demo.py +5 -2
- policies/DPAgent_CliffWalking-v0_i219_g0.9.npy +0 -0
DPAgent.py
ADDED
@@ -0,0 +1,99 @@
import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from tqdm import trange


class DP:
    def __init__(self, env_name, gamma=0.9, theta=1e-10, **kwargs):
        self.env = gym.make(env_name, **kwargs)
        self.gamma = gamma
        self.theta = theta
        self.V = np.zeros(self.env.observation_space.n)
        self.epsilon = 0

    def policy(self, state, return_value=False):
        # One-step lookahead: compute the action values Q(s, a) from the current V
        Q = np.zeros(self.env.action_space.n)
        for action in range(self.env.action_space.n):
            expected_value = 0
            for probability, next_state, reward, done in self.env.P[state][action]:
                # Reward shaping: give a reward of 1 for the last (goal) state,
                # since the env itself only emits negative step rewards
                if state == self.env.observation_space.n - 1:
                    reward = 1
                expected_value += probability * (reward + self.gamma * self.V[next_state])
            Q[action] = expected_value
        if return_value:
            return np.argmax(Q), np.max(Q)
        else:
            return Q

    def train(self):
        # Value iteration: sweep all states until the value function stops changing
        i = 0
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                action, value = self.policy(state, return_value=True)
                self.V[state] = value
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            if delta < self.theta:
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")

        # Extract the greedy policy from the converged value function
        policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
        return self.V, policy

    def save_policy(self, pth):
        np.save(pth, self.V)

    def load_policy(self, pth):
        """
        Not really loading the 'policy' but the state-value function;
        the name is kept for the sake of a consistent agent interface.
        """
        self.V = np.load(pth)

    def generate_episode(self, max_steps, render=False, **kwargs):
        state, _ = self.env.reset()
        episode_hist, solved, rgb_array = [], False, None

        # Generate an episode following the current (greedy) policy
        for _ in range(max_steps):
            rgb_array = self.env.render() if render else None
            # Pick the greedy action from the one-step lookahead Q-values
            action = self.policy(state)
            maction = np.argmax(action)
            # Take the action and observe the reward and next state
            next_state, reward, done, truncated, _ = self.env.step(maction)
            # Keep track of the trajectory
            episode_hist.append((state, maction, reward))
            state = next_state

            yield episode_hist, solved, rgb_array

            # The episode ends when the agent reaches the goal. If the agent
            # steps off the cliff, it is simply respawned at the start
            # position without termination.
            if done or truncated:
                solved = True
                break

        rgb_array = self.env.render() if render else None

        yield episode_hist, solved, rgb_array


if __name__ == "__main__":
    dp = DP("CliffWalking-v0", render_mode="human")
    dp.train()
    dp.save_policy("dp_policy.npy")

    env = dp.env
    state, _ = env.reset()
    done = False
    while not done:
        # Act greedily with respect to the learned value function
        action = np.argmax(dp.policy(state))
        state, reward, done, _, _ = env.step(action)
        env.render()

    # plt.savefig(f"imgs/{0}.png")
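For context (not part of the commit), here is a minimal usage sketch of the class above. It assumes the policies/DPAgent_CliffWalking-v0_i219_g0.9.npy file added in this commit was produced by save_policy; it loads that value function with load_policy and drains one greedy rollout from generate_episode.

# Usage sketch (assumption: the .npy shipped in this commit was written by save_policy)
from DPAgent import DP

dp = DP("CliffWalking-v0")
dp.load_policy("policies/DPAgent_CliffWalking-v0_i219_g0.9.npy")

episode_hist, solved = [], False
for episode_hist, solved, _ in dp.generate_episode(max_steps=200, render=False):
    pass  # drain the generator; episode_hist accumulates (state, action, reward) tuples

print("solved:", solved, "steps taken:", len(episode_hist))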
demo.py
CHANGED
@@ -3,6 +3,7 @@ import time
 import numpy as np
 import gradio as gr
 from MonteCarloAgent import MonteCarloAgent
+from DPAgent import DP
 import scipy.ndimage
 import cv2
 
@@ -28,7 +29,7 @@ except FileNotFoundError:
 # All supported agents
 agent_map = {
     "MonteCarloAgent": MonteCarloAgent,
-
+    "DPAgent": DP
 }
 action_map = {
     "CliffWalking-v0": {
@@ -161,7 +162,9 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
         episode_hist[-2] if len(episode_hist) > 1 else (None, None, None)
     )
     state, action, reward = episode_hist[-1]
-    curr_policy = agent.
+    curr_policy = agent.policy(state)
+    curr_policy -= np.min(curr_policy)
+    curr_policy = curr_policy / np.sum(curr_policy)
 
     # frame_env = cv2.resize(
     # frame_env,
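A note on the three added lines in run: agent.policy(state) returns the raw Q-values for the current state, which are then shifted to be non-negative and rescaled to sum to 1 so they can be displayed like action probabilities. A minimal sketch of the same transformation on made-up Q-values (the numbers below are hypothetical):

import numpy as np

q = np.array([-13.0, -12.0, -14.0, -13.0])  # hypothetical Q-values for the 4 actions
q = q - np.min(q)   # shift so the worst action maps to 0
q = q / np.sum(q)   # rescale to sum to 1 for display
print(q)            # [0.25 0.5  0.   0.25]

Note that the rescaling is undefined when all Q-values are equal, since the sum after the shift is 0.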
policies/DPAgent_CliffWalking-v0_i219_g0.9.npy
ADDED
Binary file (512 Bytes).