Spaces:
Sleeping
Sleeping
add exps
Browse files- DPAgent.py +49 -27
- creategif.py +21 -14
- qtable_policy.gif +0 -0
- scripts/dp_exp.py +52 -0
- scripts/speedtest.py +83 -0
DPAgent.py
CHANGED
@@ -1,10 +1,13 @@
|
|
|
|
|
|
1 |
import gymnasium as gym
|
2 |
import numpy as np
|
3 |
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
|
4 |
from matplotlib import pyplot as plt
|
|
|
5 |
from tqdm import trange
|
|
|
6 |
from AgentBase import AgentBase
|
7 |
-
import warnings
|
8 |
|
9 |
|
10 |
class DPAgent(AgentBase):
|
@@ -23,6 +26,7 @@ class DPAgent(AgentBase):
|
|
23 |
return self.Pi[state]
|
24 |
|
25 |
def train(self, *args, **kwargs):
|
|
|
26 |
i = 0
|
27 |
print(self.gamma)
|
28 |
while True:
|
@@ -50,13 +54,35 @@ class DPAgent(AgentBase):
|
|
50 |
# update the state-value function
|
51 |
self.V[state] = value
|
52 |
delta = max(delta, abs(V_prev[state] - self.V[state]))
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
break
|
55 |
i += 1
|
56 |
-
# if i % 5 == 0 and i != 0:
|
57 |
-
# self.test(verbose=False)
|
58 |
print(f"Iteration {i}: delta={delta}")
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
|
61 |
for s in range(self.env.observation_space.n):
|
62 |
for a in range(self.env.action_space.n):
|
@@ -64,7 +90,7 @@ class DPAgent(AgentBase):
|
|
64 |
for probability, next_state, reward, done in self.env.P[s][a]:
|
65 |
if (
|
66 |
self.env_name == "CliffWalking-v0"
|
67 |
-
and
|
68 |
):
|
69 |
reward = 1
|
70 |
expected_value += probability * (
|
@@ -72,39 +98,35 @@ class DPAgent(AgentBase):
|
|
72 |
)
|
73 |
self.Pi[s, a] = expected_value
|
74 |
idxs = np.argmax(self.Pi, axis=1)
|
75 |
-
print(idxs)
|
76 |
self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
|
77 |
self.Pi[np.arange(self.env.observation_space.n), idxs] = 1
|
78 |
-
# print(self.Pi)
|
79 |
-
# return self.V, self.Pi
|
80 |
|
81 |
|
82 |
if __name__ == "__main__":
|
83 |
-
# env = gym.make('FrozenLake-v1', render_mode='human')
|
84 |
-
dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
|
85 |
-
dp.train()
|
86 |
-
dp.save_policy("dp_policy.npy")
|
87 |
env = gym.make(
|
88 |
"FrozenLake-v1",
|
89 |
-
render_mode="
|
|
|
90 |
is_slippery=False,
|
91 |
-
desc=[
|
92 |
-
"SFFFFFFF",
|
93 |
-
"FFFFFFFH",
|
94 |
-
"FFFHFFFF",
|
95 |
-
"FFFFFHFF",
|
96 |
-
"FFFHFFFF",
|
97 |
-
"FHHFFFHF",
|
98 |
-
"FHFFHFHF",
|
99 |
-
"FFFHFFFG",
|
100 |
-
],
|
101 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
state, _ = env.reset()
|
104 |
done = False
|
105 |
while not done:
|
106 |
-
action = dp.choose_action(state)
|
107 |
state, reward, done, _, _ = env.step(action)
|
108 |
-
env.render()
|
109 |
-
|
110 |
-
|
|
|
1 |
+
import warnings
|
2 |
+
|
3 |
import gymnasium as gym
|
4 |
import numpy as np
|
5 |
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
|
6 |
from matplotlib import pyplot as plt
|
7 |
+
from PIL import Image
|
8 |
from tqdm import trange
|
9 |
+
|
10 |
from AgentBase import AgentBase
|
|
|
11 |
|
12 |
|
13 |
class DPAgent(AgentBase):
|
|
|
26 |
return self.Pi[state]
|
27 |
|
28 |
def train(self, *args, **kwargs):
|
29 |
+
success_rate = []
|
30 |
i = 0
|
31 |
print(self.gamma)
|
32 |
while True:
|
|
|
54 |
# update the state-value function
|
55 |
self.V[state] = value
|
56 |
delta = max(delta, abs(V_prev[state] - self.V[state]))
|
57 |
+
self.make_pi()
|
58 |
+
suc = self.test(verbose=False, greedy=True)
|
59 |
+
success_rate.append(suc)
|
60 |
+
if delta < self.theta and self.theta < 1:
|
61 |
+
print(f"breaking at {delta}, {self.theta}")
|
62 |
+
break
|
63 |
+
elif i > self.theta and self.theta > 1:
|
64 |
+
print(f"breaking at {i}, {self.theta}")
|
65 |
break
|
66 |
i += 1
|
|
|
|
|
67 |
print(f"Iteration {i}: delta={delta}")
|
68 |
|
69 |
+
# self.write_v(0)
|
70 |
+
return success_rate
|
71 |
+
|
72 |
+
def write_v(self, i):
|
73 |
+
v_cop = np.copy(self.V).reshape((12, 4))
|
74 |
+
print(v_cop)
|
75 |
+
v_cop -= np.min(v_cop)
|
76 |
+
v_cop /= np.max(v_cop)
|
77 |
+
print(np.min(v_cop), np.max(v_cop))
|
78 |
+
img = Image.fromarray(np.uint8(v_cop * 255), "L")
|
79 |
+
img = img.resize(
|
80 |
+
(v_cop.shape[0] * 100, v_cop.shape[1] * 100),
|
81 |
+
resample=Image.Resampling.NEAREST,
|
82 |
+
)
|
83 |
+
img.save(f"imgs/{i}.png")
|
84 |
+
|
85 |
+
def make_pi(self):
|
86 |
self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
|
87 |
for s in range(self.env.observation_space.n):
|
88 |
for a in range(self.env.action_space.n):
|
|
|
90 |
for probability, next_state, reward, done in self.env.P[s][a]:
|
91 |
if (
|
92 |
self.env_name == "CliffWalking-v0"
|
93 |
+
and s == self.env.observation_space.n - 1
|
94 |
):
|
95 |
reward = 1
|
96 |
expected_value += probability * (
|
|
|
98 |
)
|
99 |
self.Pi[s, a] = expected_value
|
100 |
idxs = np.argmax(self.Pi, axis=1)
|
|
|
101 |
self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
|
102 |
self.Pi[np.arange(self.env.observation_space.n), idxs] = 1
|
|
|
|
|
103 |
|
104 |
|
105 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
106 |
env = gym.make(
|
107 |
"FrozenLake-v1",
|
108 |
+
render_mode="ansi",
|
109 |
+
desc=generate_random_map(8, seed=24),
|
110 |
is_slippery=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
)
|
112 |
+
dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
|
113 |
+
dp.env = env
|
114 |
+
dp.env_name = "FrozenLake-v1"
|
115 |
+
dp.V = np.zeros(dp.env.observation_space.n)
|
116 |
+
dp.Pi = np.zeros((dp.env.observation_space.n, dp.env.action_space.n))  # shape must be one tuple; a 2nd positional arg is dtype
|
117 |
+
dp.n_states, dp.n_actions = (
|
118 |
+
dp.env.observation_space.n,
|
119 |
+
dp.env.action_space.n,
|
120 |
+
)
|
121 |
+
dp.train()
|
122 |
+
|
123 |
+
print(dp.test())
|
124 |
|
125 |
state, _ = env.reset()
|
126 |
done = False
|
127 |
while not done:
|
128 |
+
action = dp.choose_action(state, greedy=True)
|
129 |
state, reward, done, _, _ = env.step(action)
|
130 |
+
s = env.render()
|
131 |
+
print(s)
|
132 |
+
plt.savefig(f"imgs/{0}.png")
|
creategif.py
CHANGED
@@ -5,21 +5,29 @@ from PIL import Image
|
|
5 |
from PIL.Image import Transpose, Resampling
|
6 |
|
7 |
|
8 |
-
api = wandb.Api()
|
9 |
-
run = api.run("acozma/cs581/5ttfkav8")
|
10 |
|
11 |
-
print(run.summary)
|
12 |
-
print("Downloading images...")
|
13 |
|
14 |
-
for file in run.files():
|
15 |
-
|
16 |
-
|
17 |
|
18 |
-
print("Finished downloading images")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
|
21 |
def process_images(image_fnames, upscale=20):
|
22 |
-
image_fnames
|
|
|
23 |
frames = [Image.open(image) for image in image_fnames]
|
24 |
frames = [frame.transpose(Transpose.ROTATE_90) for frame in frames]
|
25 |
frames = [
|
@@ -46,14 +54,13 @@ def images_to_gif(frames, fname, duration=500):
|
|
46 |
)
|
47 |
|
48 |
|
49 |
-
folder_path = "./media/images"
|
50 |
all_fnames = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
|
|
|
51 |
|
52 |
-
|
53 |
-
fnames_policy = [f for f in all_fnames if os.path.basename(f).startswith("Policy")]
|
54 |
policy_frames = process_images(fnames_policy)
|
55 |
|
56 |
-
fnames_qtable = [f for f in all_fnames if os.path.basename(f).startswith(
|
57 |
qtable_frames = process_images(fnames_qtable)
|
58 |
|
59 |
spacing_factor = 1 / 2
|
@@ -67,4 +74,4 @@ for i, (qtable, policy) in enumerate(zip(qtable_frames, policy_frames)):
|
|
67 |
new_frame.paste(policy, (0, height + int(height * spacing_factor)))
|
68 |
final_frames.append(new_frame)
|
69 |
|
70 |
-
images_to_gif(final_frames,
|
|
|
5 |
from PIL.Image import Transpose, Resampling
|
6 |
|
7 |
|
8 |
+
# api = wandb.Api()
|
9 |
+
# run = api.run("acozma/cs581/5ttfkav8")
|
10 |
|
11 |
+
# print(run.summary)
|
12 |
+
# print("Downloading images...")
|
13 |
|
14 |
+
# for file in run.files():
|
15 |
+
# if file.name.endswith(".png"):
|
16 |
+
# file.download(exist_ok=True)
|
17 |
|
18 |
+
# print("Finished downloading images")
|
19 |
+
|
20 |
+
|
21 |
+
folder_path = "./imgs/"
|
22 |
+
policy_file_prefix = "Pi"
|
23 |
+
q_file_prefix = "Q"
|
24 |
+
out_name = "qtable_policy"
|
25 |
+
sort_lambda = lambda x: int(x.split("_")[1].split(".")[0]) # key used to sort image filenames
|
26 |
|
27 |
|
28 |
def process_images(image_fnames, upscale=20):
|
29 |
+
print(image_fnames)
|
30 |
+
image_fnames.sort(key=sort_lambda)
|
31 |
frames = [Image.open(image) for image in image_fnames]
|
32 |
frames = [frame.transpose(Transpose.ROTATE_90) for frame in frames]
|
33 |
frames = [
|
|
|
54 |
)
|
55 |
|
56 |
|
|
|
57 |
all_fnames = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
|
58 |
+
print(all_fnames)
|
59 |
|
60 |
+
fnames_policy = [f for f in all_fnames if os.path.basename(f).startswith(policy_file_prefix)]
|
|
|
61 |
policy_frames = process_images(fnames_policy)
|
62 |
|
63 |
+
fnames_qtable = [f for f in all_fnames if os.path.basename(f).startswith(q_file_prefix)]
|
64 |
qtable_frames = process_images(fnames_qtable)
|
65 |
|
66 |
spacing_factor = 1 / 2
|
|
|
74 |
new_frame.paste(policy, (0, height + int(height * spacing_factor)))
|
75 |
final_frames.append(new_frame)
|
76 |
|
77 |
+
images_to_gif(final_frames, out_name)
|
qtable_policy.gif
DELETED
Binary file (583 kB)
|
|
scripts/dp_exp.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import multiprocessing
|
2 |
+
import time
|
3 |
+
|
4 |
+
import gymnasium as gym
|
5 |
+
import numpy as np
|
6 |
+
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
|
7 |
+
|
8 |
+
import wandb
|
9 |
+
from DPAgent import DPAgent
|
10 |
+
from MCAgent import MCAgent
|
11 |
+
|
12 |
+
env_ver = "FrozenLake-v1"
|
13 |
+
|
14 |
+
|
15 |
+
def test_dp(gamma=0.99):
|
16 |
+
env = gym.make(
|
17 |
+
env_ver,
|
18 |
+
render_mode="ansi",
|
19 |
+
# desc=generate_random_map(8, seed=3141),
|
20 |
+
# is_slippery=False,
|
21 |
+
)
|
22 |
+
dp = DPAgent(env=env_ver, gamma=gamma)  # use the caller-supplied discount factor instead of a hard-coded 0.99
|
23 |
+
dp.env = env
|
24 |
+
dp.env_name = env_ver
|
25 |
+
dp.V = np.zeros(dp.env.observation_space.n)
|
26 |
+
dp.Pi = np.zeros((dp.env.observation_space.n, dp.env.action_space.n))  # shape must be one tuple; a 2nd positional arg is dtype
|
27 |
+
dp.n_states, dp.n_actions = (
|
28 |
+
dp.env.observation_space.n,
|
29 |
+
dp.env.action_space.n,
|
30 |
+
)
|
31 |
+
times = dp.train()
|
32 |
+
|
33 |
+
# np.save(f"times_{gamma}.npy", times)
|
34 |
+
s = env.render()
|
35 |
+
print(s)
|
36 |
+
|
37 |
+
|
38 |
+
def main():
|
39 |
+
wandb.init(
|
40 |
+
project="cs581",
|
41 |
+
# job_type=args.wandb_job_type,
|
42 |
+
# config=dict(args._get_kwargs()),
|
43 |
+
)
|
44 |
+
np.set_printoptions(linewidth=500, precision=3)
|
45 |
+
# with multiprocessing.Pool(8) as p:
|
46 |
+
# gamma = [0.99, 0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.1]
|
47 |
+
# p.map(test_dp, gamma)
|
48 |
+
test_dp(0.99)
|
49 |
+
|
50 |
+
|
51 |
+
if __name__ == "__main__":
|
52 |
+
main()
|
scripts/speedtest.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import multiprocessing
|
2 |
+
import time
|
3 |
+
|
4 |
+
import gymnasium as gym
|
5 |
+
import numpy as np
|
6 |
+
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
|
7 |
+
|
8 |
+
from DPAgent import DPAgent
|
9 |
+
from MCAgent import MCAgent
|
10 |
+
|
11 |
+
env_ver = "FrozenLake-v1"
|
12 |
+
|
13 |
+
|
14 |
+
def test_mc(i, seed):
|
15 |
+
env = gym.make(
|
16 |
+
env_ver, desc=generate_random_map(size=i, p=0.4, seed=seed), is_slippery=False
|
17 |
+
)
|
18 |
+
agent = MCAgent(env=env_ver, gamma=1.0, epsilon=0.4)
|
19 |
+
agent.env = env
|
20 |
+
agent.n_states, agent.n_actions = (
|
21 |
+
agent.env.observation_space.n,
|
22 |
+
agent.env.action_space.n,
|
23 |
+
)
|
24 |
+
agent.initialize()
|
25 |
+
tic = time.perf_counter()
|
26 |
+
trained = agent.train(
|
27 |
+
max_steps=int((i**2) * 3),
|
28 |
+
n_train_episodes=10_000,
|
29 |
+
save_best=False,
|
30 |
+
early_stopping=True,
|
31 |
+
update_type="every_visit",
|
32 |
+
)
|
33 |
+
toc = time.perf_counter()
|
34 |
+
return trained, toc - tic
|
35 |
+
|
36 |
+
|
37 |
+
def test_dp(i, seed, gamma=0.99):
|
38 |
+
env = gym.make(env_ver, desc=generate_random_map(i, seed=seed), is_slippery=False)
|
39 |
+
agent = DPAgent(env=env_ver, gamma=gamma)
|
40 |
+
agent.env = env
|
41 |
+
agent.V = np.zeros(agent.env.observation_space.n)
|
42 |
+
agent.Pi = np.zeros((agent.env.observation_space.n, agent.env.action_space.n))  # shape must be one tuple; a 2nd positional arg is dtype
|
43 |
+
agent.n_states, agent.n_actions = (
|
44 |
+
agent.env.observation_space.n,
|
45 |
+
agent.env.action_space.n,
|
46 |
+
)
|
47 |
+
|
48 |
+
return agent.train()
|
49 |
+
|
50 |
+
|
51 |
+
def run_test(i):
|
52 |
+
mc_trained = False
|
53 |
+
seed = 0
|
54 |
+
mc_time = 0
|
55 |
+
dp_time = 0
|
56 |
+
while not mc_trained:
|
57 |
+
seed = np.random.randint(0, 100000)
|
58 |
+
mc_trained, train_time = test_mc(i, seed)
|
59 |
+
mc_time = train_time
|
60 |
+
dp_time = test_dp(i, seed)
|
61 |
+
|
62 |
+
return mc_time, dp_time
|
63 |
+
|
64 |
+
|
65 |
+
def run_exp(gamma):
|
66 |
+
times = []
|
67 |
+
for i in range(8, 512, 8):
|
68 |
+
# mc_time, dp_time = run_test(i)
|
69 |
+
dp_time = test_dp(i, 0, gamma=gamma)
|
70 |
+
times.append((i, dp_time))
|
71 |
+
times = np.array(times)
|
72 |
+
np.save(f"times_{gamma}.npy", times)
|
73 |
+
return
|
74 |
+
|
75 |
+
|
76 |
+
def main():
|
77 |
+
with multiprocessing.Pool(8) as p:
|
78 |
+
gamma = [0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.1]
|
79 |
+
p.map(run_exp, gamma)
|
80 |
+
|
81 |
+
|
82 |
+
if __name__ == "__main__":
|
83 |
+
main()
|